Dns and outbound connect prechecks (#5665)

Azure · Jan 30, 2023 · a12e79b · a12e79b
1 parent c2fa9af
commit a12e79b
Show file tree

Hide file tree

Showing 5 changed files with 483 additions and 193 deletions.
diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py
@@ -135,6 +135,8 @@
 Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the diagnoser container checks"
 Cluster_DNS_Check_Fault_Type = "Error occured while performing cluster DNS check"
 Outbound_Connectivity_Check_Fault_Type = "Error occured while performing outbound connectivity check in the cluster"
+Outbound_Connectivity_Failed_Fault_Type = "Outbound network connectivity failed in cluster diagnostic checks"
+DNS_Failed_Fault_Type = "DNS resolution failed in cluster diagnostic checks"
 MSI_Cert_Check_Fault_Type = "Error occurred while trying to perform MSI ceritificate presence check"
 Cluster_Security_Policy_Check_Fault_Type = "Error occured while performing cluster security policy check"
 KAP_Cert_Check_Fault_Type = "Error occurred while trying to perform KAP ceritificate presence check"
@@ -167,6 +169,9 @@
 Arc_Agents_Logs = "arc_agents_logs"
 Arc_Deployment_Logs = "arc_deployment_logs"
 Arc_Diagnostic_Logs = "arc_diagnostic_logs"
+Pre_Onboarding_Check_Logs = "pre_onboarding_check_logs"
+Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingChecksCharts'
+Pre_Onboarding_Helm_Charts_Release_Name = 'cluster-diagnostic-checks'
 Describe_Non_Ready_Arc_Agents = "describe_non_ready_arc_agents"
 Agent_State = "agent_state.txt"
 Arc_Agents_Events = "arc_agent_events.txt"
@@ -176,7 +181,14 @@
 K8s_Cluster_Info = "k8s_cluster_info.txt"
 Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt"
 Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt"
-
+# Connect Precheck Diagnoser constants
+Cluster_Diagnostic_Checks_Job_Registry_Path = "mcr.microsoft.com/azurearck8s/helmchart/stable/clusterdiagnosticchecks:0.1.0"
+Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release"
+Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks"
+Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release"
+Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule cluster-diagnostic-checks job'
+Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete cluster-diagnostic-checks job after scheduling'
+Pre_Onboarding_Diagnostic_Checks_Execution_Failed = 'Exception occured while trying to execute pre-onboarding diagnostic checks'
 # Diagnostic Results Name
 Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:"
 DNS_Check_Result_String = "DNS Result:"

diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py
@@ -0,0 +1,221 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+import os
+import shutil
+import subprocess
+from subprocess import Popen, PIPE
+import time
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+import json
+from kubernetes import client, config, watch, utils
+from knack.util import CLIError
+from knack.log import get_logger
+from knack.prompting import NoTTYException, prompt_y_n
+from azure.cli.core.commands.client_factory import get_subscription_id
+from azure.cli.core.util import send_raw_request
+from azure.cli.core import telemetry
+from azure.core.exceptions import ResourceNotFoundError, HttpResponseError
+from msrest.exceptions import AuthenticationError, HttpOperationError, TokenExpiredError
+from msrest.exceptions import ValidationError as MSRestValidationError
+from kubernetes.client.rest import ApiException
+from azext_connectedk8s._client_factory import _resource_client_factory, _resource_providers_client
+import azext_connectedk8s._constants as consts
+import azext_connectedk8s._utils as azext_utils
+from kubernetes import client as kube_client
+from azure.cli.core import get_default_cli
+from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError
+from argparse import Namespace
+from pydoc import cli
+from logging import exception
+import yaml
+import json
+import datetime
+from subprocess import Popen, PIPE, run, STDOUT, call, DEVNULL
+import shutil
+from knack.log import get_logger
+from azure.cli.core import telemetry
+import azext_connectedk8s._constants as consts
+logger = get_logger(__name__)
+# pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long
+# pylint: disable
+
+diagnoser_output = []
+
+
+def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, filepath_with_timestamp, storage_space_available):
+    global diagnoser_output
+    try:
+        # Setting DNS and Outbound Check as working
+        dns_check = "Starting"
+        outbound_connectivity_check = "Starting"
+        # Executing the cluster_diagnostic_checks job and fetching the logs obtained
+        cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud)
+        # If cluster_diagnostic_checks_container_log is not empty then only we will check for the results
+        if(cluster_diagnostic_checks_container_log is not None and cluster_diagnostic_checks_container_log != ""):
+            cluster_diagnostic_checks_container_log_list = cluster_diagnostic_checks_container_log.split("\n")
+            cluster_diagnostic_checks_container_log_list.pop(-1)
+            dns_check_log = ""
+            counter_container_logs = 1
+            # For retrieving only cluster_diagnostic_checks logs from the output
+            for outputs in cluster_diagnostic_checks_container_log_list:
+                if consts.Outbound_Connectivity_Check_Result_String in outputs:
+                    counter_container_logs = 1
+                elif consts.DNS_Check_Result_String in outputs:
+                    dns_check_log += outputs
+                    counter_container_logs = 0
+                elif counter_container_logs == 0:
+                    dns_check_log += "  " + outputs
+            dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output)
+            outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output)
+        else:
+            return consts.Diagnostic_Check_Incomplete, storage_space_available
+
+        # If both the check passed then we will return cluster diagnostic checks Passed
+        if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed):
+            return consts.Diagnostic_Check_Passed, storage_space_available
+        # If any of the check remain Incomplete than we will return Incomplete
+        elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete):
+            return consts.Diagnostic_Check_Incomplete, storage_space_available
+        else:
+            return consts.Diagnostic_Check_Failed, storage_space_available
+
+    # To handle any exception that may occur during the execution
+    except Exception as e:
+        logger.warning("An exception has occured while trying to execute cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n")
+        telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container")
+
+    return consts.Diagnostic_Check_Incomplete, storage_space_available
+
+
+def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud):
+    job_name = "cluster-diagnostic-checks-job"
+    # Setting the log output as Empty
+    cluster_diagnostic_checks_container_log = ""
+
+    cmd_helm_delete = [helm_client_location, "uninstall", "cluster-diagnostic-checks", "-n", "azure-arc-release"]
+    if kube_config:
+        cmd_helm_delete.extend(["--kubeconfig", kube_config])
+    if kube_context:
+        cmd_helm_delete.extend(["--kube-context", kube_context])
+
+    # To handle the user keyboard Interrupt
+    try:
+        # Executing the cluster diagnostic checks job yaml
+        config.load_kube_config(kube_config, kube_context)
+        # Attempting deletion of cluster diagnostic checks resources to handle the scenario if any stale resources are present
+        response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
+        output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate()
+        # If any error occured while execution of delete command
+        if (response_kubectl_delete_helm != 0):
+            # Converting the string of multiple errors to list
+            error_msg_list = error_kubectl_delete_helm.decode("ascii").split("\n")
+            error_msg_list.pop(-1)
+            valid_exception_list = []
+            # Checking if any exception occured or not
+            exception_occured_counter = 0
+            for ind_errors in error_msg_list:
+                if('not found' in ind_errors or 'deleted' in ind_errors):
+                    pass
+                else:
+                    valid_exception_list.append(ind_errors)
+                    exception_occured_counter = 1
+            # If any exception occured we will print the exception and return
+            if exception_occured_counter == 1:
+                logger.warning("Cleanup of previous diagnostic checks helm release failed and hence couldn't install the new helm release. Please cleanup older release using \"helm delete cluster-diagnostic-checks -n azuer-arc-release\" and try onboarding again")
+                telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job")
+                return
+
+        chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name, consts.Pre_Onboarding_Helm_Charts_Release_Name)
+
+        helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location)
+
+        # Watching for cluster diagnostic checks container to reach in completed stage
+        w = watch.Watch()
+        is_job_complete = False
+        is_job_scheduled = False
+        # To watch for changes in the pods states till it reach completed state or exit if it takes more than 180 seconds
+        for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='azure-arc-release', label_selector="", timeout_seconds=60):
+            try:
+                # Checking if job get scheduled or not
+                if event["object"].metadata.name == "cluster-diagnostic-checks-job":
+                    is_job_scheduled = True
+                # Checking if job reached completed stage or not
+                if event["object"].metadata.name == "cluster-diagnostic-checks-job" and event["object"].status.conditions[0].type == "Complete":
+                    is_job_complete = True
+                    w.stop()
+            except Exception as e:
+                continue
+            else:
+                continue
+
+        if (is_job_scheduled is False):
+            telemetry.set_exception(exception="Couldn't schedule cluster diagnostic checks job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled,
+                                    summary="Couldn't schedule cluster diagnostic checks job in the cluster")
+            logger.warning("Unable to schedule the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n")
+            Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
+            return
+        elif (is_job_scheduled is True and is_job_complete is False):
+            telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete,
+                                    summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster")
+            logger.warning("Cluster diagnostics job didn't reach completed state in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n")
+            Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
+            return
+        else:
+            # Fetching the cluster diagnostic checks Container logs
+            all_pods = corev1_api_instance.list_namespaced_pod('azure-arc-release')
+            # Traversing through all agents
+            for each_pod in all_pods.items:
+                # Fetching the current Pod name and creating a folder with that name inside the timestamp folder
+                pod_name = each_pod.metadata.name
+                if(pod_name.startswith(job_name)):
+                    # Creating a text file with the name of the container and adding that containers logs in it
+                    cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='azure-arc-release')
+        # Clearing all the resources after fetching the cluster diagnostic checks container logs
+        Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
+
+    # To handle any exception that may occur during the execution
+    except Exception as e:
+        logger.warning("An exception has occured while trying to execute the cluster diagnostic checks in the cluster. Exception: {}".format(str(e)) + "\n")
+        Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
+        telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job")
+        return
+
+    return cluster_diagnostic_checks_container_log
+
+
+def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location, onboarding_timeout="60"):
+    cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path, "--namespace", "{}".format(consts.Release_Install_Namespace), "--create-namespace", "--output", "json"]
+    # To set some other helm parameters through file
+    cmd_helm_install.extend(["--set", "global.location={}".format(location)])
+    cmd_helm_install.extend(["--set", "global.azureCloud={}".format(azure_cloud)])
+    if https_proxy:
+        cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)])
+    if http_proxy:
+        cmd_helm_install.extend(["--set", "global.httpProxy={}".format(http_proxy)])
+    if no_proxy:
+        cmd_helm_install.extend(["--set", "global.noProxy={}".format(no_proxy)])
+    if proxy_cert:
+        cmd_helm_install.extend(["--set-file", "global.proxyCert={}".format(proxy_cert)])
+
+    if kube_config:
+        cmd_helm_install.extend(["--kubeconfig", kube_config])
+    if kube_context:
+        cmd_helm_install.extend(["--kube-context", kube_context])
+
+    # Change --timeout format for helm client to understand
+    onboarding_timeout = onboarding_timeout + "s"
+    cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)])
+
+    response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE)
+    _, error_helm_install = response_helm_install.communicate()
+    if response_helm_install.returncode != 0:
+        if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")):
+            telemetry.set_user_fault()
+        telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type,
+                                summary='Unable to install cluster diagnostic checks helm release')
+        raise CLIInternalError("Unable to install cluster diagnostic checks helm release: " + error_helm_install.decode("ascii"))