Feature: spark debug tool (#455)
* start implementation of cluster debug utility

* update debug program

* update debug

* fix output directory structure

* cleanup output, add error checking

* sort imports

* start untar

* extract tar

* add debug.py to pylintrc ignore, line too long

* crlf->lf

* add app logs

* call get_spark_app_logs, typos

* add docs

* remove debug.py from pylintrc ignore

* added debug.py back to pylint ignore

* change pylint ignore

* remove commented log

* update cluster_run

* refactor cluster_copy

* update debug, add spinner for run and copy

* make new sdk cluster_download endpoint
jafreck committed Apr 9, 2018
1 parent 61e7c59 commit 44a0765
Showing 10 changed files with 383 additions and 71 deletions.
44 changes: 26 additions & 18 deletions aztk/client.py
@@ -229,43 +229,48 @@ def __delete_user_on_pool(self, username, pool_id, nodes):
         concurrent.futures.wait(futures)


-    def __cluster_run(self, cluster_id, container_name, command, internal):
+    def __cluster_run(self, cluster_id, command, internal, container_name=None):
         pool, nodes = self.__get_pool_details(cluster_id)
         nodes = [node for node in nodes]
         if internal:
-            cluster_nodes = [models.RemoteLogin(ip_address=node.ip_address, port="22") for node in nodes]
+            cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes]
         else:
-            cluster_nodes = [self.__get_remote_login_settings(pool.id, node.id) for node in nodes]
+            cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes]
         try:
             ssh_key = self.__create_user_on_pool('aztk', pool.id, nodes)
-            asyncio.get_event_loop().run_until_complete(ssh_lib.clus_exec_command(command,
-                                                                                  container_name,
-                                                                                  'aztk',
-                                                                                  cluster_nodes,
-                                                                                  ssh_key=ssh_key.exportKey().decode('utf-8')))
+            output = asyncio.get_event_loop().run_until_complete(ssh_lib.clus_exec_command(command,
+                                                                                           'aztk',
+                                                                                           cluster_nodes,
+                                                                                           ssh_key=ssh_key.exportKey().decode('utf-8'),
+                                                                                           container_name=container_name))
+            return output
         except OSError as exc:
             raise exc
         finally:
             self.__delete_user_on_pool('aztk', pool.id, nodes)

-    def __cluster_copy(self, cluster_id, container_name, source_path, destination_path, internal):
+    def __cluster_copy(self, cluster_id, source_path, destination_path, container_name=None, internal=False, get=False):
         pool, nodes = self.__get_pool_details(cluster_id)
         nodes = [node for node in nodes]
         if internal:
-            cluster_nodes = [models.RemoteLogin(ip_address=node.ip_address, port="22") for node in nodes]
+            cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes]
         else:
-            cluster_nodes = [self.__get_remote_login_settings(pool.id, node.id) for node in nodes]
+            cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes]
         try:
             ssh_key = self.__create_user_on_pool('aztk', pool.id, nodes)
-            asyncio.get_event_loop().run_until_complete(ssh_lib.clus_copy(container_name=container_name,
-                                                                          username='aztk',
-                                                                          nodes=cluster_nodes,
-                                                                          source_path=source_path,
-                                                                          destination_path=destination_path,
-                                                                          ssh_key=ssh_key.exportKey().decode('utf-8')))
-            self.__delete_user_on_pool('aztk', pool.id, nodes)
+            output = asyncio.get_event_loop().run_until_complete(
+                ssh_lib.clus_copy(container_name=container_name,
+                                  username='aztk',
+                                  nodes=cluster_nodes,
+                                  source_path=source_path,
+                                  destination_path=destination_path,
+                                  ssh_key=ssh_key.exportKey().decode('utf-8'),
+                                  get=get))
+            return output
         except (OSError, batch_error.BatchErrorException) as exc:
             raise exc
+        finally:
+            self.__delete_user_on_pool('aztk', pool.id, nodes)

     def __submit_job(self,
                      job_configuration,
@@ -388,5 +393,8 @@ def cluster_run(self, cluster_id, command):
     def cluster_copy(self, cluster_id, source_path, destination_path):
         raise NotImplementedError()

+    def cluster_download(self, cluster_id, source_path, destination_path):
+        raise NotImplementedError()
+
     def submit_job(self, job):
         raise NotImplementedError()
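The substance of the base-client refactor above is that __cluster_run and __cluster_copy now pair each node with its remote-login settings and return the gathered SSH output instead of discarding it, which is what lets the new debug tool collect per-node results. A short, self-contained sketch of consuming that per-node output shape; the (node_id, lines) pairing is implied by cluster_diagnostic_helper further down, and the sample data here is made up:

# Sketch only: hypothetical sample of what a cluster-wide run now returns, one entry per node.
run_output = [
    ("node-1", ["Filesystem  Size  Used", "/dev/sda1   29G   12G"]),
    ("node-2", ["Filesystem  Size  Used", "/dev/sda1   29G   9G"]),
]

for node_id, lines in run_output:
    print("==== {} ====".format(node_id))
    for line in lines:
        print(line)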
24 changes: 20 additions & 4 deletions aztk/spark/client.py
@@ -9,6 +9,7 @@
 from aztk.spark.helpers import submit as cluster_submit_helper
 from aztk.spark.helpers import job_submission as job_submit_helper
 from aztk.spark.helpers import get_log as get_log_helper
+from aztk.spark.helpers import cluster_diagnostic_helper
 from aztk.spark.utils import util
 from aztk.internal.cluster_data import NodeData
 import yaml
@@ -146,15 +147,23 @@ def get_application_status(self, cluster_id: str, app_name: str):
         except batch_error.BatchErrorException as e:
             raise error.AztkError(helpers.format_batch_exception(e))

-    def cluster_run(self, cluster_id: str, command: str, internal: bool = False):
+    def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False):
         try:
-            return self.__cluster_run(cluster_id, 'spark', command, internal)
+            return self.__cluster_run(cluster_id, command, internal, container_name='spark' if not host else None)
         except batch_error.BatchErrorException as e:
             raise error.AztkError(helpers.format_batch_exception(e))

-    def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, internal: bool = False):
+    def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False):
         try:
-            return self.__cluster_copy(cluster_id, 'spark', source_path, destination_path, internal)
+            container_name = None if host else 'spark'
+            return self.__cluster_copy(cluster_id, source_path, destination_path, container_name=container_name, get=False, internal=internal)
         except batch_error.BatchErrorException as e:
             raise error.AztkError(helpers.format_batch_exception(e))

+    def cluster_download(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False):
+        try:
+            container_name = None if host else 'spark'
+            return self.__cluster_copy(cluster_id, source_path, destination_path, container_name=container_name, get=True, internal=internal)
+        except batch_error.BatchErrorException as e:
+            raise error.AztkError(helpers.format_batch_exception(e))
+
@@ -272,3 +281,10 @@ def wait_until_job_finished(self, job_id):
     def wait_until_all_jobs_finished(self, jobs):
         for job in jobs:
             self.wait_until_job_finished(job)
+
+    def run_cluster_diagnostics(self, cluster_id, output_directory):
+        try:
+            output = cluster_diagnostic_helper.run(self, cluster_id, output_directory)
+            return output
+        except batch_error.BatchErrorException as e:
+            raise error.AztkError(helpers.format_batch_exception(e))
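Taken together, the spark Client now exposes a host flag on cluster_run and cluster_copy (host=True targets the node's host VM instead of the spark container), a new cluster_download endpoint, and run_cluster_diagnostics. A minimal usage sketch, assuming an already-configured aztk.spark.Client instance named spark_client and a placeholder cluster id, neither of which is part of this commit:

# Sketch only: spark_client construction and the cluster id are assumed.
cluster_id = "my-cluster"

# Run a command on every node's host VM.
run_output = spark_client.cluster_run(cluster_id, "df -h", host=True)

# Pull a file from every node with the new download endpoint.
spark_client.cluster_download(cluster_id, "/tmp/debug.zip", "./debug/debug.zip", host=True)

# Or let the debug tool do the whole copy/run/download sequence.
spark_client.run_cluster_diagnostics(cluster_id, output_directory="./debug")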
26 changes: 26 additions & 0 deletions aztk/spark/helpers/cluster_diagnostic_helper.py
@@ -0,0 +1,26 @@
import os
from aztk.utils import ssh
from aztk.utils.command_builder import CommandBuilder
from aztk import models as aztk_models
import azure.batch.models as batch_models

def run(spark_client, cluster_id, output_directory):
    # copy debug program to each node
    spark_client.cluster_copy(cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True)
    ssh_cmd = _build_diagnostic_ssh_command()
    run_output = spark_client.cluster_run(cluster_id, ssh_cmd, host=True)
    local_path = os.path.join(os.path.abspath(output_directory), "debug", "debug.zip")
    remote_path = "/tmp/debug.zip"
    output = spark_client.cluster_download(cluster_id, remote_path, local_path, host=True)
    # write run output to debug/ directory
    with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f:
        [f.write(line + '\n') for node_id, result in run_output for line in result]
    return output


def _build_diagnostic_ssh_command():
    return "sudo rm -rf /tmp/debug.zip; "\
           "sudo apt-get install -y python3-pip; "\
           "sudo -H pip3 install --upgrade pip; "\
           "sudo -H pip3 install docker; "\
           "sudo python3 /tmp/debug.py"
160 changes: 160 additions & 0 deletions aztk/spark/utils/debug.py
@@ -0,0 +1,160 @@
"""
Diagnostic program that runs on each node in the cluster
This program must be run with sudo
"""
import io
import json
import os
import socket
import tarfile
from subprocess import STDOUT, CalledProcessError, check_output
from zipfile import ZIP_DEFLATED, ZipFile

import docker # pylint: disable=import-error


def main():
    zipf = create_zip_archive()

    # general node diagnostics
    zipf.writestr("hostname.txt", data=get_hostname())
    zipf.writestr("df.txt", data=get_disk_free())

    # docker container diagnostics
    docker_client = docker.from_env()
    for filename, data in get_docker_diagnostics(docker_client):
        zipf.writestr(filename, data=data)

    zipf.close()


def create_zip_archive():
    zip_file_path = "/tmp/debug.zip"
    return ZipFile(zip_file_path, "w", ZIP_DEFLATED)


def get_hostname():
    return socket.gethostname()


def cmd_check_output(cmd):
    try:
        output = check_output(cmd, shell=True, stderr=STDOUT)
    except CalledProcessError as e:
        return "CMD: {0}\n"\
               "returncode: {1}"\
               "output: {2}".format(e.cmd, e.returncode, e.output)
    else:
        return output


def get_disk_free():
    return cmd_check_output("df -h")


def get_docker_diagnostics(docker_client):
    '''
    returns list of tuples (filename, data) to be written in the zip
    '''
    output = []
    output.append(get_docker_images(docker_client))
    logs = get_docker_containers(docker_client)
    for item in logs:
        output.append(item)

    return output


def get_docker_images(docker_client):
    output = ""
    try:
        images = docker_client.images.list()
        for image in images:
            output += json.dumps(image.attrs, sort_keys=True, indent=4)
        return ("docker-images.txt", output)
    except docker.errors.APIError as e:
        return ("docker-images.err", e.__str__())


def get_docker_containers(docker_client):
    container_attrs = ""
    logs = []
    try:
        containers = docker_client.containers.list()
        for container in containers:
            container_attrs += json.dumps(container.attrs, sort_keys=True, indent=4)
            # get docker container logs
            logs.append((container.name + "/docker.log", container.logs()))
            logs.append(get_docker_process_status(container))
            if container.name == "spark": #TODO: find a more robust way to get specific info off specific containers
                logs.extend(get_container_aztk_script(container))
                logs.extend(get_spark_logs(container))
                logs.extend(get_spark_app_logs(container))

        logs.append(("docker-containers.txt", container_attrs))
        return logs
    except docker.errors.APIError as e:
        return [("docker-containers.err", e.__str__())]


def get_docker_process_status(container):
    try:
        exit_code, output = container.exec_run("ps -auxw", tty=True, privileged=True)
        out_file_name = container.name + "/ps_aux.txt"
        if exit_code == 0:
            return (out_file_name, output)
        else:
            return (out_file_name, "exit_code: {0}\n{1}".format(exit_code, output))
    except docker.errors.APIError as e:
        return (container.name + "ps_aux.err", e.__str__())


def get_container_aztk_script(container):
    aztk_path = "/mnt/batch/tasks/startup/wd"
    try:
        stream, _ = container.get_archive(aztk_path) # second item is stat info
        return extract_tar_in_memory(container, stream)
    except docker.errors.APIError as e:
        return (container.name + "/" + "aztk-scripts.err", e.__str__())


def get_spark_logs(container):
    spark_logs_path = "/home/spark-current/logs"
    try:
        stream, _ = container.get_archive(spark_logs_path) # second item is stat info
        return extract_tar_in_memory(container, stream)
    except docker.errors.APIError as e:
        return [(container.name + "/" + "spark-logs.err", e.__str__())]


def get_spark_app_logs(container):
    spark_app_logs_path = "/home/spark-current/work"
    try:
        stream, _ = container.get_archive(spark_app_logs_path)
        return extract_tar_in_memory(container, stream)
    except docker.errors.APIError as e:
        return [(container.name + "/" + "spark-work-logs.err", e.__str__())]


def filter_members(members):
    skip_files = ["id_rsa", "id_rsa.pub", "docker.log"]
    skip_extensions = [".pyc", ".zip"]
    for tarinfo in members:
        if (os.path.splitext(tarinfo.name)[1] not in skip_extensions and
                os.path.basename(tarinfo.name) not in skip_files):
            yield tarinfo


def extract_tar_in_memory(container, data):
    data = io.BytesIO(b''.join([item for item in data]))
    tarf = tarfile.open(fileobj=data)
    logs = []
    for member in filter_members(tarf):
        file_bytes = tarf.extractfile(member)
        if file_bytes is not None:
            logs.append((container.name + "/" + member.name, b''.join(file_bytes.readlines())))
    return logs


if __name__ == "__main__":
    main()
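debug.py leaves its archive at /tmp/debug.zip on each node (the helper invokes it with "sudo python3 /tmp/debug.py"); once run_cluster_diagnostics has downloaded it, the archive can be inspected with the standard zipfile module. A quick sketch for listing its contents; the local path is illustrative, and the expected entry names follow from main() and get_docker_diagnostics() above:

from zipfile import ZipFile

# Illustrative local path: wherever cluster_download placed the node's archive.
with ZipFile("./debug/debug.zip") as zipf:
    for name in zipf.namelist():
        # Expect hostname.txt, df.txt, docker-images.txt, docker-containers.txt,
        # and per-container entries such as spark/docker.log and spark/ps_aux.txt.
        print(name)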