Skip to content

Commit

Permalink
Dns and outbound connect prechecks (#5665)
Browse files Browse the repository at this point in the history
  • Loading branch information
rohan-dassani authored Jan 30, 2023
1 parent c2fa9af commit a12e79b
Show file tree
Hide file tree
Showing 5 changed files with 483 additions and 193 deletions.
14 changes: 13 additions & 1 deletion src/connectedk8s/azext_connectedk8s/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@
Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the diagnoser container checks"
Cluster_DNS_Check_Fault_Type = "Error occured while performing cluster DNS check"
Outbound_Connectivity_Check_Fault_Type = "Error occured while performing outbound connectivity check in the cluster"
Outbound_Connectivity_Failed_Fault_Type = "Outbound network connectivity failed in cluster diagnostic checks"
DNS_Failed_Fault_Type = "DNS resolution failed in cluster diagnostic checks"
MSI_Cert_Check_Fault_Type = "Error occurred while trying to perform MSI ceritificate presence check"
Cluster_Security_Policy_Check_Fault_Type = "Error occured while performing cluster security policy check"
KAP_Cert_Check_Fault_Type = "Error occurred while trying to perform KAP ceritificate presence check"
Expand Down Expand Up @@ -167,6 +169,9 @@
Arc_Agents_Logs = "arc_agents_logs"
Arc_Deployment_Logs = "arc_deployment_logs"
Arc_Diagnostic_Logs = "arc_diagnostic_logs"
Pre_Onboarding_Check_Logs = "pre_onboarding_check_logs"
Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingChecksCharts'
Pre_Onboarding_Helm_Charts_Release_Name = 'cluster-diagnostic-checks'
Describe_Non_Ready_Arc_Agents = "describe_non_ready_arc_agents"
Agent_State = "agent_state.txt"
Arc_Agents_Events = "arc_agent_events.txt"
Expand All @@ -176,7 +181,14 @@
K8s_Cluster_Info = "k8s_cluster_info.txt"
Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt"
Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt"

# Connect Precheck Diagnoser constants
Cluster_Diagnostic_Checks_Job_Registry_Path = "mcr.microsoft.com/azurearck8s/helmchart/stable/clusterdiagnosticchecks:0.1.0"
Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release"
Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks"
Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release"
Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule cluster-diagnostic-checks job'
Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete cluster-diagnostic-checks job after scheduling'
Pre_Onboarding_Diagnostic_Checks_Execution_Failed = 'Exception occured while trying to execute pre-onboarding diagnostic checks'
# Diagnostic Results Name
Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:"
DNS_Check_Result_String = "DNS Result:"
Expand Down
221 changes: 221 additions & 0 deletions src/connectedk8s/azext_connectedk8s/_precheckutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import os
import shutil
import subprocess
from subprocess import Popen, PIPE
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
from kubernetes import client, config, watch, utils
from knack.util import CLIError
from knack.log import get_logger
from knack.prompting import NoTTYException, prompt_y_n
from azure.cli.core.commands.client_factory import get_subscription_id
from azure.cli.core.util import send_raw_request
from azure.cli.core import telemetry
from azure.core.exceptions import ResourceNotFoundError, HttpResponseError
from msrest.exceptions import AuthenticationError, HttpOperationError, TokenExpiredError
from msrest.exceptions import ValidationError as MSRestValidationError
from kubernetes.client.rest import ApiException
from azext_connectedk8s._client_factory import _resource_client_factory, _resource_providers_client
import azext_connectedk8s._constants as consts
import azext_connectedk8s._utils as azext_utils
from kubernetes import client as kube_client
from azure.cli.core import get_default_cli
from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError
from argparse import Namespace
from pydoc import cli
from logging import exception
import yaml
import json
import datetime
from subprocess import Popen, PIPE, run, STDOUT, call, DEVNULL
import shutil
from knack.log import get_logger
from azure.cli.core import telemetry
import azext_connectedk8s._constants as consts
logger = get_logger(__name__)
# pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long
# pylint: disable

diagnoser_output = []


def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, filepath_with_timestamp, storage_space_available):
global diagnoser_output
try:
# Setting DNS and Outbound Check as working
dns_check = "Starting"
outbound_connectivity_check = "Starting"
# Executing the cluster_diagnostic_checks job and fetching the logs obtained
cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud)
# If cluster_diagnostic_checks_container_log is not empty then only we will check for the results
if(cluster_diagnostic_checks_container_log is not None and cluster_diagnostic_checks_container_log != ""):
cluster_diagnostic_checks_container_log_list = cluster_diagnostic_checks_container_log.split("\n")
cluster_diagnostic_checks_container_log_list.pop(-1)
dns_check_log = ""
counter_container_logs = 1
# For retrieving only cluster_diagnostic_checks logs from the output
for outputs in cluster_diagnostic_checks_container_log_list:
if consts.Outbound_Connectivity_Check_Result_String in outputs:
counter_container_logs = 1
elif consts.DNS_Check_Result_String in outputs:
dns_check_log += outputs
counter_container_logs = 0
elif counter_container_logs == 0:
dns_check_log += " " + outputs
dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output)
outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output)
else:
return consts.Diagnostic_Check_Incomplete, storage_space_available

# If both the check passed then we will return cluster diagnostic checks Passed
if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed):
return consts.Diagnostic_Check_Passed, storage_space_available
# If any of the check remain Incomplete than we will return Incomplete
elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete):
return consts.Diagnostic_Check_Incomplete, storage_space_available
else:
return consts.Diagnostic_Check_Failed, storage_space_available

# To handle any exception that may occur during the execution
except Exception as e:
logger.warning("An exception has occured while trying to execute cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n")
telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container")

return consts.Diagnostic_Check_Incomplete, storage_space_available


def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud):
job_name = "cluster-diagnostic-checks-job"
# Setting the log output as Empty
cluster_diagnostic_checks_container_log = ""

cmd_helm_delete = [helm_client_location, "uninstall", "cluster-diagnostic-checks", "-n", "azure-arc-release"]
if kube_config:
cmd_helm_delete.extend(["--kubeconfig", kube_config])
if kube_context:
cmd_helm_delete.extend(["--kube-context", kube_context])

# To handle the user keyboard Interrupt
try:
# Executing the cluster diagnostic checks job yaml
config.load_kube_config(kube_config, kube_context)
# Attempting deletion of cluster diagnostic checks resources to handle the scenario if any stale resources are present
response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate()
# If any error occured while execution of delete command
if (response_kubectl_delete_helm != 0):
# Converting the string of multiple errors to list
error_msg_list = error_kubectl_delete_helm.decode("ascii").split("\n")
error_msg_list.pop(-1)
valid_exception_list = []
# Checking if any exception occured or not
exception_occured_counter = 0
for ind_errors in error_msg_list:
if('not found' in ind_errors or 'deleted' in ind_errors):
pass
else:
valid_exception_list.append(ind_errors)
exception_occured_counter = 1
# If any exception occured we will print the exception and return
if exception_occured_counter == 1:
logger.warning("Cleanup of previous diagnostic checks helm release failed and hence couldn't install the new helm release. Please cleanup older release using \"helm delete cluster-diagnostic-checks -n azuer-arc-release\" and try onboarding again")
telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job")
return

chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name, consts.Pre_Onboarding_Helm_Charts_Release_Name)

helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location)

# Watching for cluster diagnostic checks container to reach in completed stage
w = watch.Watch()
is_job_complete = False
is_job_scheduled = False
# To watch for changes in the pods states till it reach completed state or exit if it takes more than 180 seconds
for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='azure-arc-release', label_selector="", timeout_seconds=60):
try:
# Checking if job get scheduled or not
if event["object"].metadata.name == "cluster-diagnostic-checks-job":
is_job_scheduled = True
# Checking if job reached completed stage or not
if event["object"].metadata.name == "cluster-diagnostic-checks-job" and event["object"].status.conditions[0].type == "Complete":
is_job_complete = True
w.stop()
except Exception as e:
continue
else:
continue

if (is_job_scheduled is False):
telemetry.set_exception(exception="Couldn't schedule cluster diagnostic checks job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled,
summary="Couldn't schedule cluster diagnostic checks job in the cluster")
logger.warning("Unable to schedule the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n")
Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
return
elif (is_job_scheduled is True and is_job_complete is False):
telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete,
summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster")
logger.warning("Cluster diagnostics job didn't reach completed state in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n")
Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
return
else:
# Fetching the cluster diagnostic checks Container logs
all_pods = corev1_api_instance.list_namespaced_pod('azure-arc-release')
# Traversing through all agents
for each_pod in all_pods.items:
# Fetching the current Pod name and creating a folder with that name inside the timestamp folder
pod_name = each_pod.metadata.name
if(pod_name.startswith(job_name)):
# Creating a text file with the name of the container and adding that containers logs in it
cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='azure-arc-release')
# Clearing all the resources after fetching the cluster diagnostic checks container logs
Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)

# To handle any exception that may occur during the execution
except Exception as e:
logger.warning("An exception has occured while trying to execute the cluster diagnostic checks in the cluster. Exception: {}".format(str(e)) + "\n")
Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE)
telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job")
return

return cluster_diagnostic_checks_container_log


def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location, onboarding_timeout="60"):
cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path, "--namespace", "{}".format(consts.Release_Install_Namespace), "--create-namespace", "--output", "json"]
# To set some other helm parameters through file
cmd_helm_install.extend(["--set", "global.location={}".format(location)])
cmd_helm_install.extend(["--set", "global.azureCloud={}".format(azure_cloud)])
if https_proxy:
cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)])
if http_proxy:
cmd_helm_install.extend(["--set", "global.httpProxy={}".format(http_proxy)])
if no_proxy:
cmd_helm_install.extend(["--set", "global.noProxy={}".format(no_proxy)])
if proxy_cert:
cmd_helm_install.extend(["--set-file", "global.proxyCert={}".format(proxy_cert)])

if kube_config:
cmd_helm_install.extend(["--kubeconfig", kube_config])
if kube_context:
cmd_helm_install.extend(["--kube-context", kube_context])

# Change --timeout format for helm client to understand
onboarding_timeout = onboarding_timeout + "s"
cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)])

response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE)
_, error_helm_install = response_helm_install.communicate()
if response_helm_install.returncode != 0:
if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")):
telemetry.set_user_fault()
telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type,
summary='Unable to install cluster diagnostic checks helm release')
raise CLIInternalError("Unable to install cluster diagnostic checks helm release: " + error_helm_install.decode("ascii"))
Loading

0 comments on commit a12e79b

Please sign in to comment.