From 7d7a814c5018ae337731936f6738f32b20350a61 Mon Sep 17 00:00:00 2001 From: mmduyzend Date: Thu, 7 Jun 2018 17:57:43 +0100 Subject: [PATCH] Fix: fix typos (#595) --- CHANGELOG.md | 6 +++--- README.md | 6 +++--- account_setup.py | 8 ++++---- aztk/__init__.py | 2 +- aztk/client.py | 10 +++++----- aztk/core/models/fields.py | 4 ++-- aztk/core/models/validators.py | 2 +- aztk/models/scheduling_target.py | 2 +- aztk/models/toolkit.py | 4 ++-- aztk/node_scripts/install/node_scheduling.py | 4 ++-- aztk/node_scripts/install/pick_master.py | 2 +- aztk/node_scripts/install/spark.py | 4 ++-- aztk/node_scripts/setup_host.sh | 2 +- aztk/node_scripts/submit.py | 6 +++--- aztk/node_scripts/wait_until_setup_complete.py | 2 +- aztk/spark/models/plugins/resource_monitor/readme.md | 8 ++++---- aztk/utils/deprecation.py | 2 +- aztk/utils/helpers.py | 2 +- aztk_cli/config.py | 2 +- aztk_cli/config/cluster.yaml | 10 +++++----- aztk_cli/config/job.yaml | 4 ++-- aztk_cli/config/secrets.yaml.template | 2 +- aztk_cli/config/ssh.yaml | 2 +- aztk_cli/spark/endpoints/cluster/cluster_add_user.py | 2 +- aztk_cli/spark/endpoints/init.py | 6 +++--- aztk_cli/utils.py | 4 ++-- docs/00-getting-started.md | 10 +++++----- docs/10-clusters.md | 10 +++++----- docs/11-custom-scripts.md | 4 ++-- docs/12-docker-image.md | 2 +- docs/13-configuration.md | 10 +++++----- docs/15-plugins.md | 8 ++++---- docs/20-spark-submit.md | 4 ++-- docs/30-cloud-storage.md | 6 +++--- docs/51-define-plugin.md | 4 ++-- docs/70-jobs.md | 6 +++--- docs/dev/docs.md | 2 +- docs/index.rst | 2 +- .../integration_tests/sdk/cluster/test_cluster.py | 2 +- tests/spark/integration_tests/sdk/job/test_job.py | 2 +- 40 files changed, 90 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bd368b9..4e0fbe0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ This release includes a number of breaking changes. [Please follow the migration - Docker images have been refactored and moved to a different Dockerhub repository. The new supported images are not backwards compatible. See [the documentation on configuration files.](https://aztk.readthedocs.io/en/v0.7.0/13-configuration.html#cluster-yaml) **Deprecated Features** -- Custom scripts have been removed in favor of Plugins, which are more robust. See, [the documenation on Plugins.](https://aztk.readthedocs.io/en/v0.7.0/15-plugins.html) +- Custom scripts have been removed in favor of Plugins, which are more robust. See, [the documentation on Plugins.](https://aztk.readthedocs.io/en/v0.7.0/15-plugins.html) **Added Features** * add internal flag to node commands (#482) ([1eaa1b6](https://github.com/Azure/aztk/commit/1eaa1b6)), closes [#482](https://github.com/Azure/aztk/issues/482) @@ -33,7 +33,7 @@ This release includes a number of breaking changes. 
[Please follow the migration * match cluster submit exit code in cli (#478) ([8889059](https://github.com/Azure/aztk/commit/8889059)), closes [#478](https://github.com/Azure/aztk/issues/478) * Plugin V2: Running plugin on host (#461) ([de78983](https://github.com/Azure/aztk/commit/de78983)), closes [#461](https://github.com/Azure/aztk/issues/461) * Plugins (#387) ([c724d94](https://github.com/Azure/aztk/commit/c724d94)), closes [#387](https://github.com/Azure/aztk/issues/387) -* Pypi auto deployement (#428) ([c237501](https://github.com/Azure/aztk/commit/c237501)), closes [#428](https://github.com/Azure/aztk/issues/428) +* Pypi auto deployment (#428) ([c237501](https://github.com/Azure/aztk/commit/c237501)), closes [#428](https://github.com/Azure/aztk/issues/428) * Readthedocs support (#497) ([e361c3b](https://github.com/Azure/aztk/commit/e361c3b)), closes [#497](https://github.com/Azure/aztk/issues/497) * refactor docker images (#510) ([779bffb](https://github.com/Azure/aztk/commit/779bffb)), closes [#510](https://github.com/Azure/aztk/issues/510) * Spark add output logs flag (#468) ([32de752](https://github.com/Azure/aztk/commit/32de752)), closes [#468](https://github.com/Azure/aztk/issues/468) @@ -93,7 +93,7 @@ This release includes a number of breaking changes. [Please follow the migration **Bug Fixes:** - load jars in `.aztk/jars/` in job submission mode - replace outdated error in cluster_create -- fix type error crash if not jars are specificed in job submission +- fix type error crash if no jars are specified in job submission - stop using mutable default parameters - print job application code if exit_code is 0 - job submission crash if executor or driver cores specified diff --git a/README.md b/README.md index 6072463d..a97cbc7e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ chmod 755 account_setup.sh && /bin/bash account_setup.sh ``` -4. Follow the on screen prompts to create the necessary Azure resources and copy the output into your `.aztk/secrets.yaml` file. For more infomration see [Getting Started Scripts](./01-Getting-Started-Script). +4. Follow the on screen prompts to create the necessary Azure resources and copy the output into your `.aztk/secrets.yaml` file. For more information see [Getting Started Scripts](./01-Getting-Started-Script). 
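For reference, a minimal sketch of the `service_principal` block that the script's output maps onto is shown below. The field names follow the `format_secrets` docstring in `account_setup.py`; every value here is a placeholder, and your generated file may contain additional entries (such as a storage account resource id).

```yaml
# .aztk/secrets.yaml (sketch only -- all values are placeholders)
service_principal:
    tenant_id: <AAD tenant (directory) id>
    client_id: <AAD application id>
    credential: <AAD application credential>
    batch_account_resource_id: </subscriptions/.../batchAccounts/your-batch-account>
```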
## Quickstart Guide @@ -98,8 +98,8 @@ aztk spark cluster submit \ path\to\pi.py 1000 ``` - The `aztk spark cluster submit` command takes the same parameters as the standard [`spark-submit` command](https://spark.apache.org/docs/latest/submitting-applications.html), except instead of specifying `--master`, AZTK requires that you specify your cluster `--id` and a unique job `--name` -- The job name, `--name`, argument must be atleast 3 characters long - - It can only contain alphanumeric characters including hypens but excluding underscores +- The job name, `--name`, argument must be at least 3 characters long + - It can only contain alphanumeric characters including hyphens but excluding underscores - It cannot contain uppercase letters - Each job you submit **must** have a unique name - Use the `--no-wait` option for your command to return immediately diff --git a/account_setup.py b/account_setup.py index 0ecf5454..821d2b83 100644 --- a/account_setup.py +++ b/account_setup.py @@ -273,7 +273,7 @@ def format_secrets(**kwargs): The following form is returned: service_principal: - tenant_id: + tenant_id: client_id: credential: batch_account_resource_id: @@ -409,16 +409,16 @@ def stop(self): # create AAD application and service principal with Spinner(): profile = credentials.get_cli_profile() - aad_cred, subscirption_id, tenant_id = profile.get_login_credentials( + aad_cred, subscription_id, tenant_id = profile.get_login_credentials( resource=AZURE_PUBLIC_CLOUD.endpoints.active_directory_graph_resource_id ) application_id, service_principal_object_id, application_credential = create_aad_user(aad_cred, tenant_id, **kwargs) - + print("Created Azure Active Directory service principal.") with Spinner(): create_role_assignment(creds, subscription_id, resource_group_id, service_principal_object_id) - print("Configured permsisions.") + print("Configured permissions.") secrets = format_secrets( **{ diff --git a/aztk/__init__.py b/aztk/__init__.py index fb1355f9..ca211f3f 100644 --- a/aztk/__init__.py +++ b/aztk/__init__.py @@ -3,5 +3,5 @@ # Azure storage is logging error in the console which make the CLI quite confusing logging.getLogger("azure.storage").setLevel(logging.CRITICAL) -# msrestazure logs warnring for keyring +# msrestazure logs warning for keyring logging.getLogger("msrestazure").setLevel(logging.CRITICAL) diff --git a/aztk/client.py b/aztk/client.py index 2c9513d1..1253db54 100644 --- a/aztk/client.py +++ b/aztk/client.py @@ -94,7 +94,7 @@ def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, softw auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format( cluster_conf.size, cluster_conf.size_low_priority) - # Confiure the pool + # Configure the pool pool = batch_models.PoolAddParameter( id=pool_id, virtual_machine_configuration=batch_models.VirtualMachineConfiguration( @@ -225,7 +225,7 @@ def __generate_user_on_pool(self, pool_id, nodes): node.id, ssh_pub_key): node for node in nodes} concurrent.futures.wait(futures) - + return generated_username, ssh_key def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): @@ -239,8 +239,8 @@ def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, pass concurrent.futures.wait(futures) def __delete_user_on_pool(self, username, pool_id, nodes): - with concurrent.futures.ThreadPoolExecutor() as exector: - futures = [exector.submit(self.__delete_user, pool_id, node.id, username) for node in nodes] + with concurrent.futures.ThreadPoolExecutor() as 
executor: + futures = [executor.submit(self.__delete_user, pool_id, node.id, username) for node in nodes] concurrent.futures.wait(futures) def __node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): @@ -355,7 +355,7 @@ def __submit_job(self, :param job_configuration -> aztk_sdk.spark.models.JobConfiguration :param start_task -> batch_models.StartTask :param job_manager_task -> batch_models.TaskAddParameter - :param autoscale forumula -> str + :param autoscale_formula -> str :param software_metadata_key -> str :param vm_image_model -> aztk_sdk.models.VmImage :returns None diff --git a/aztk/core/models/fields.py b/aztk/core/models/fields.py index 20cad87a..12d1719e 100644 --- a/aztk/core/models/fields.py +++ b/aztk/core/models/fields.py @@ -61,8 +61,8 @@ def __set__(self, instance, value): def merge(self, instance, value): """ - Method called when merging 2 model together. - This is overriden in some of the fields where merge can be handled differently + Method called when merging 2 models together. + This is overridden in some of the fields where merge can be handled differently """ if value is not None: instance._data[self] = value diff --git a/aztk/core/models/validators.py b/aztk/core/models/validators.py index e4935307..6ed9d8e9 100644 --- a/aztk/core/models/validators.py +++ b/aztk/core/models/validators.py @@ -19,7 +19,7 @@ def validate(self, value): class Required(Validator): """ - Validate the field valiue is not `None` + Validate the field value is not `None` """ def validate(self, value): diff --git a/aztk/models/scheduling_target.py b/aztk/models/scheduling_target.py index 809644c1..7f5886aa 100644 --- a/aztk/models/scheduling_target.py +++ b/aztk/models/scheduling_target.py @@ -18,5 +18,5 @@ class SchedulingTarget(Enum): Any = "any" """ - Any node(Not reconmmended if using low pri) + Any node(Not recommended if using low pri) """ diff --git a/aztk/models/toolkit.py b/aztk/models/toolkit.py index 0442d8b3..c20bcc86 100644 --- a/aztk/models/toolkit.py +++ b/aztk/models/toolkit.py @@ -83,7 +83,7 @@ def get_docker_repo(self, gpu: bool): def _get_docker_tag(self, gpu: bool): environment = self.environment or "base" - environment_def = self._get_environent_definition() + environment_def = self._get_environment_definition() environment_version = self.environment_version or (environment_def and environment_def.default) array = [ @@ -98,7 +98,7 @@ def _get_docker_tag(self, gpu: bool): return '-'.join(array) - def _get_environent_definition(self) -> ToolkitEnvironmentDefinition: + def _get_environment_definition(self) -> ToolkitEnvironmentDefinition: toolkit = TOOLKIT_MAP.get(self.software) if toolkit: diff --git a/aztk/node_scripts/install/node_scheduling.py b/aztk/node_scripts/install/node_scheduling.py index 3dd3fdb9..12237eeb 100644 --- a/aztk/node_scripts/install/node_scheduling.py +++ b/aztk/node_scripts/install/node_scheduling.py @@ -51,8 +51,8 @@ def setup_node_scheduling( enable = True if enable: - log.info("Scheduling will be enabled on this node as it satifies the right conditions") + log.info("Scheduling will be enabled on this node as it satisfies the right conditions") enable_scheduling(batch_client) else: - log.info("Scheduling will be disabled on this node as it does NOT satifies the right conditions") + log.info("Scheduling will be disabled on this node as it does NOT satisfy the right conditions") disable_scheduling(batch_client) diff --git a/aztk/node_scripts/install/pick_master.py b/aztk/node_scripts/install/pick_master.py index 
542c00e3..66cb8909 100644 --- a/aztk/node_scripts/install/pick_master.py +++ b/aztk/node_scripts/install/pick_master.py @@ -67,7 +67,7 @@ def find_master(client: batch.BatchServiceClient) -> bool: result = try_assign_self_as_master(client, pool) if result: - print("Assignment was successfull! Node {0} is the new master.".format(config.node_id)) + print("Assignment was successful! Node {0} is the new master.".format(config.node_id)) return True raise CannotAllocateMasterError("Unable to assign node as a master in 5 tries") diff --git a/aztk/node_scripts/install/spark.py b/aztk/node_scripts/install/spark.py index 1a354228..37d912ab 100644 --- a/aztk/node_scripts/install/spark.py +++ b/aztk/node_scripts/install/spark.py @@ -122,7 +122,7 @@ def copyfile(src, dest): def setup_conf(): """ - Copy spark conf files to spark_home if they were uplaoded + Copy spark conf files to spark_home if they were uploaded """ copy_spark_env() copy_core_site() @@ -220,7 +220,7 @@ def configure_history_server_log_path(path_to_log_file): if os.path.exists(directory): print('Skipping. Directory {} already exists.'.format(directory)) else: - print('Create direcotory {}.'.format(directory)) + print('Create directory {}.'.format(directory)) os.makedirs(directory) # Make sure the directory can be accessed by all users diff --git a/aztk/node_scripts/setup_host.sh b/aztk/node_scripts/setup_host.sh index 4cfbe282..c23ff0dd 100644 --- a/aztk/node_scripts/setup_host.sh +++ b/aztk/node_scripts/setup_host.sh @@ -40,7 +40,7 @@ install_prerequisites () { } install_docker_compose () { - echo "Installing Docker-Componse" + echo "Installing Docker-Compose" sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose echo "Finished installing Docker-Compose" diff --git a/aztk/node_scripts/submit.py b/aztk/node_scripts/submit.py index b239c859..d877cb74 100644 --- a/aztk/node_scripts/submit.py +++ b/aztk/node_scripts/submit.py @@ -114,7 +114,7 @@ def __app_submit_cmd( spark_submit_cmd.add_option('--executor-cores', str(executor_cores)) spark_submit_cmd.add_argument( - os.path.expandvars(app) + ' ' + + os.path.expandvars(app) + ' ' + ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])])) with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream: @@ -145,7 +145,7 @@ def upload_log(blob_client, application): use_full_path=False) -def recieve_submit_request(application_file_path): +def receive_submit_request(application_file_path): ''' Handle the request to submit a task @@ -195,7 +195,7 @@ def upload_error_log(error, application_file_path): if __name__ == "__main__": return_code = 1 try: - return_code = recieve_submit_request(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml')) + return_code = receive_submit_request(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml')) except Exception as e: upload_error_log(str(e), os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml')) diff --git a/aztk/node_scripts/wait_until_setup_complete.py b/aztk/node_scripts/wait_until_setup_complete.py index 3eb9349d..9bfd9e72 100644 --- a/aztk/node_scripts/wait_until_setup_complete.py +++ b/aztk/node_scripts/wait_until_setup_complete.py @@ -5,5 +5,5 @@ while not os.path.exists('/tmp/setup_complete'): time.sleep(1) -print("SETUP FINSIHED") +print("SETUP FINISHED") os.remove('/tmp/setup_complete') diff --git 
a/aztk/spark/models/plugins/resource_monitor/readme.md b/aztk/spark/models/plugins/resource_monitor/readme.md index 99945475..c35681fe 100644 --- a/aztk/spark/models/plugins/resource_monitor/readme.md +++ b/aztk/spark/models/plugins/resource_monitor/readme.md @@ -1,8 +1,8 @@ -# Using the Resrouce Monitor Plugin +# Using the Resource Monitor Plugin The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as Percent CPU used per core, Disk Read, Disk Write, Network In, Network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics. -This plugin takes advanage of the TICK monitoring stack. For more information please visit the [influx data](https://www.influxdata.com/time-series-platform/) web page. +This plugin takes advantage of the TICK monitoring stack. For more information please visit the [influx data](https://www.influxdata.com/time-series-platform/) web page. > **IMPORTANT** All of the data is collected on the cluster's master node and will be lost once the cluster is thrown away. To persist data we recommend pushing to an off-cluster InfluxDB instance. Currently there is no supported way to persist the data from this plugin. @@ -21,14 +21,14 @@ plugins: ``` -Once the cluster is created simply the cluster ssh command and all of the ports will automatically get forwareded. +Once the cluster is created simply the cluster ssh command and all of the ports will automatically get forwarded. ```sh aztk spark cluster ssh --id ``` ### Ports -url | desciption +url | description --- | --- http://localhost:8890 | Cronograf UI diff --git a/aztk/utils/deprecation.py b/aztk/utils/deprecation.py index 07a15614..ba216e51 100644 --- a/aztk/utils/deprecation.py +++ b/aztk/utils/deprecation.py @@ -46,7 +46,7 @@ def deprecate(message: str): def _get_deprecated_version(): """ - Returns the next version where the deprecated funtionality will be removed + Returns the next version where the deprecated functionality will be removed """ if version.major == 0: return "0.{minor}.0".format(minor=version.minor + 1) diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py index 5796baf1..972f2f9f 100644 --- a/aztk/utils/helpers.py +++ b/aztk/utils/helpers.py @@ -404,7 +404,7 @@ def read_cluster_config(cluster_id: str, blob_client: blob.BlockBlobService): def bool_env(value: bool): """ - Takes a boolean value(or None) and return the serialized version to be used as an environemnt variable + Takes a boolean value(or None) and return the serialized version to be used as an environment variable Examples: >>> bool_env(True) diff --git a/aztk_cli/config.py b/aztk_cli/config.py index 5206b55c..b617b329 100644 --- a/aztk_cli/config.py +++ b/aztk_cli/config.py @@ -23,7 +23,7 @@ def load_aztk_secrets() -> SecretsConfiguration: if not global_config and not local_config: raise aztk.error.AztkError("There is no secrets.yaml in either ./.aztk/secrets.yaml or .aztk/secrets.yaml") - if global_config: # GLobal config is optional + if global_config: # Global config is optional _merge_secrets_dict(secrets, global_config) if local_config: _merge_secrets_dict(secrets, local_config) diff --git a/aztk_cli/config/cluster.yaml b/aztk_cli/config/cluster.yaml index ac17a698..e31a9c08 100644 --- a/aztk_cli/config/cluster.yaml +++ b/aztk_cli/config/cluster.yaml @@ -1,17 +1,17 @@ ## cluster settings -# id: +# id: -# Toolkit configuration [Required] You can use `aztk toolkit` command to find which are 
the available tookits +# Toolkit configuration [Required] You can use `aztk toolkit` command to find which toolkits are available toolkit: software: spark version: 2.3.0 - # Which environemnt is needed for spark anaconda, r, miniconda + # Which environment is needed for spark anaconda, r, miniconda environment: {environment} # Optional version for the environment # environment_version: - # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environemnt if using default images) + # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: @@ -34,7 +34,7 @@ username: spark # - script: <./relative/path/to/other/script.sh or ./relative/path/to/other/script/directory/> # runOn: -# To add your cluster to a virtual network provide the full arm resoruce id below +# To add your cluster to a virtual network provide the full arm resource id below # subnet_id: /subscriptions/********-****-****-****-************/resourceGroups/********/providers/Microsoft.Network/virtualNetworks/*******/subnets/****** # Enable plugins diff --git a/aztk_cli/config/job.yaml b/aztk_cli/config/job.yaml index 14214122..f9900ff6 100644 --- a/aztk_cli/config/job.yaml +++ b/aztk_cli/config/job.yaml @@ -14,12 +14,12 @@ job: toolkit: software: spark version: 2.2.0 - # Which environemnt is needed for spark anaconda, r, miniconda + # Which environment is needed for spark anaconda, r, miniconda environment: {environment} # Optional version for the environment # environment_version: - # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environemnt if using default images) + # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: # Where do you want to run the driver (Default: dedicated if at least one dedicated node or any otherwise) diff --git a/aztk_cli/config/secrets.yaml.template b/aztk_cli/config/secrets.yaml.template index d303b8f0..a8eaca85 100644 --- a/aztk_cli/config/secrets.yaml.template +++ b/aztk_cli/config/secrets.yaml.template @@ -18,7 +18,7 @@ service_principal: # storage_account_suffix: core.windows.net -# Configuration for private docker repositories. If using public containers you do not need to provide authentification +# Configuration for private docker repositories. 
If using public containers you do not need to provide authentication docker: # username: # password: diff --git a/aztk_cli/config/ssh.yaml b/aztk_cli/config/ssh.yaml index 1e05f0f5..160bf595 100644 --- a/aztk_cli/config/ssh.yaml +++ b/aztk_cli/config/ssh.yaml @@ -1,6 +1,6 @@ # ssh configuration -# cluster_id: +# cluster_id: # username: username: spark diff --git a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py index 436d28a3..04dedf7a 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py @@ -9,7 +9,7 @@ def setup_parser(parser: argparse.ArgumentParser): parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') parser.add_argument('-u', '--username', - help='The usernameto access your spark cluster\'s head node') + help='The username to access your spark cluster\'s head node') auth_group = parser.add_mutually_exclusive_group() auth_group.add_argument('-p', '--password', diff --git a/aztk_cli/spark/endpoints/init.py b/aztk_cli/spark/endpoints/init.py index 851f6593..f93b9908 100644 --- a/aztk_cli/spark/endpoints/init.py +++ b/aztk_cli/spark/endpoints/init.py @@ -11,7 +11,7 @@ def setup_parser(parser: argparse.ArgumentParser): help="Create a .aztk/ folder in your home directory for global configurations.") software_parser = parser.add_mutually_exclusive_group() software_parser.add_argument('--miniconda', action="store_true", required=False) - software_parser.add_argument('--annaconda', action="store_true", required=False) + software_parser.add_argument('--anaconda', action="store_true", required=False) software_parser.add_argument('--r', '--R', action="store_true", required=False) software_parser.add_argument('--java', action="store_true", required=False) software_parser.add_argument('--scala', action="store_true", required=False) @@ -21,8 +21,8 @@ def execute(args: typing.NamedTuple): # software_specific init if args.miniconda: environment = "miniconda" - elif args.annaconda: - environment = "annaconda" + elif args.anaconda: + environment = "anaconda" elif args.r: environment = "r" else: diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index 635d3c72..480eb69f 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -21,7 +21,7 @@ def get_ssh_key_or_prompt(ssh_key, username, password, secrets_config): ssh_key = get_ssh_key.get_user_public_key(ssh_key, secrets_config) if username is not None and password is None and ssh_key is None: - log.warning("It is reccomended to use an SSH key for user creation instead of a password.") + log.warning("It is recommended to use an SSH key for user creation instead of a password.") for i in range(3): if i > 0: log.error("Please try again.") @@ -34,7 +34,7 @@ def get_ssh_key_or_prompt(ssh_key, username, password, secrets_config): else: break else: - raise error.AztkError("Failed to get valid password, cannot add user to cluster. It is recommended that you provide a ssh public key in .aztk/secrets.yaml. Or provide an ssh-key or password with commnad line parameters (--ssh-key or --password). You may also run the 'aztk spark cluster add-user' command to add a user to this cluster.") + raise error.AztkError("Failed to get valid password, cannot add user to cluster. It is recommended that you provide a ssh public key in .aztk/secrets.yaml. Or provide an ssh-key or password with command line parameters (--ssh-key or --password). 
You may also run the 'aztk spark cluster add-user' command to add a user to this cluster.") return ssh_key, password def print_cluster(client, cluster: models.Cluster, internal: bool = False): diff --git a/docs/00-getting-started.md b/docs/00-getting-started.md index 1778c41d..4d08d472 100644 --- a/docs/00-getting-started.md +++ b/docs/00-getting-started.md @@ -34,7 +34,7 @@ The minimum requirements to get started with this package are: ```bash aztk spark init --global ``` - This will put default configuration files in your home directory, *~/*. Please note that configuration files in your current working directory will take precident over global configuration files in your home directory. + This will put default configuration files in your home directory, *~/*. Please note that configuration files in your current working directory will take precedence over global configuration files in your home directory. ## Setting up your accounts @@ -65,7 +65,7 @@ To get the required keys for your Azure Active Directory (AAD) Service Principal 1. Register an Azure Active Directory (AAD) Application -- Navigate to Azure Active Direcotry by searching in "All Services". Click "Properties" and record the value in the "Directory ID" field. This is your __tenant ID__. +- Navigate to Azure Active Directory by searching in "All Services". Click "Properties" and record the value in the "Directory ID" field. This is your __tenant ID__. ![](./misc/AAD_1.png) @@ -99,7 +99,7 @@ To get the required keys for your Azure Active Directory (AAD) Service Principal ![](./misc/Storage_4.png) -3. Create a Batch Acccount +3. Create a Batch Account - Click the '+' button at the top left of the screen and search for 'Compute'. Select 'Batch' and click 'Create' @@ -121,7 +121,7 @@ To get the required keys for your Azure Active Directory (AAD) Service Principal - Open the secrets.yaml file in the *.aztk* folder in your current working directory (if *.aztk* doesn't exist, run `aztk spark init`). Fill in all of the fields as described below. -- Fill in the Service princripal block with your recorded values as shown below: +- Fill in the service_principal block with your recorded values as shown below: ``` service_principal: tenant_id: @@ -160,7 +160,7 @@ To get the required keys for Azure Batch and Azure Storage, please follow the be - Open the secrets.yaml file in the *.aztk* folder in your current working directory (if *.aztk* doesn't exist, run `aztk spark init`). Fill in all of the fields as described below. -- Go to the accounts in the Azure portal and copy pase the account names, keys and other information needed into the +- Go to the accounts in the Azure portal and copy paste the account names, keys and other information needed into the secrets file. ### Storage account diff --git a/docs/10-clusters.md b/docs/10-clusters.md index 994045ce..8c1c423e 100644 --- a/docs/10-clusters.md +++ b/docs/10-clusters.md @@ -1,5 +1,5 @@ # Clusters -In the Azure Distributed Data Engineering Toolkit, a cluster is primarily designed to run Spark jobs. This document describes how to create a cluster to use for Spark jobs. Alternitavely for getting started and debugging you can also use the cluster in _interactive mode_ which will allow you to log into the master node and interact with the cluster from there. +In the Azure Distributed Data Engineering Toolkit, a cluster is primarily designed to run Spark jobs. This document describes how to create a cluster to use for Spark jobs. 
Alternatively for getting started and debugging you can also use the cluster in _interactive mode_ which will allow you to log into the master node and interact with the cluster from there. ## Creating a Cluster Creating a Spark cluster only takes a few simple steps after which you will be able to SSH into the master node of the cluster and interact with Spark. You will be able to view the Spark Web UI, Spark Jobs UI, submit Spark jobs (with *spark-submit*), and even interact with Spark in a Jupyter notebook. @@ -28,7 +28,7 @@ By default, you cannot create clusters of more than 20 cores in total. Visit [th You can create your cluster with [low-priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) VMs at an 80% discount by using `--size-low-pri` instead of `--size`. Note that these are great for experimental use, but can be taken away at any time. We recommend against this option when doing long running jobs or for critical workloads. #### Mixed Mode -You can create clusters with a mixed of [low-priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) and dedicated VMs to reach the optimal balance of price and availability. In Mixed Mode, your cluster will have both dedicated instances and low priority instances. To mimize the potential impact on your Spark workloads, the Spark master node will always be provisioned on one of the dedicated nodes while each of the low priority nodes will be Spark workers. +You can create clusters with a mixed of [low-priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) and dedicated VMs to reach the optimal balance of price and availability. In Mixed Mode, your cluster will have both dedicated instances and low priority instances. To minimize the potential impact on your Spark workloads, the Spark master node will always be provisioned on one of the dedicated nodes while each of the low priority nodes will be Spark workers. Please note, to use Mixed Mode clusters, you need to authenticate using Azure Active Directory (AAD) by configuring the Service Principal in `.aztk/secrets.yaml`. You also need to create a [Virtual Network \(VNET\)](https://azure.microsoft.com/en-us/services/virtual-network/), and provide the resource ID to a Subnet within the VNET in your ./aztk/cluster.yaml` configuration file. @@ -51,7 +51,7 @@ aztk spark cluster get --id Note that the cluster is not fully usable until a master node has been selected and it's state is `idle`. -For example here cluster 'spark' has 2 nodes and node `tvm-257509324_2-20170820t200959z` is the mastesr and ready to run a job. +For example here cluster 'spark' has 2 nodes and node `tvm-257509324_2-20170820t200959z` is the master and ready to run a job. ```sh Cluster spark @@ -164,7 +164,7 @@ Note that an SSH tunnel and shell will be opened with the default SSH client if ### Debugging your Spark Cluster -If your cluster is in an unknown or unusbale state, you can debug by running: +If your cluster is in an unknown or unusable state, you can debug by running: ```sh aztk spark cluster debug --id --output @@ -184,7 +184,7 @@ __Please be careful sharing the output of the `debug` command as secrets and app ### Interact with your Spark cluster -By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *locahost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml). 
+By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml). ## Next Steps - [Run a Spark job](20-spark-submit.html) diff --git a/docs/11-custom-scripts.md b/docs/11-custom-scripts.md index 9e40559b..474d4101 100644 --- a/docs/11-custom-scripts.md +++ b/docs/11-custom-scripts.md @@ -29,7 +29,7 @@ custom_scripts: runOn: all-nodes ``` -The above configuration takes the absolute path `/custom-scripts/` and uploads every file within it. These files will all be executed, although order of exection is not guarenteed. If your custom scripts have dependencies, specify the order by providing the full path to the file as seen in the first example. +The above configuration takes the absolute path `/custom-scripts/` and uploads every file within it. These files will all be executed, although order of execution is not guaranteed. If your custom scripts have dependencies, specify the order by providing the full path to the file as seen in the first example. ## Scripting considerations @@ -39,7 +39,7 @@ The above configuration takes the absolute path `/custom-scripts/` and uploads e - The default OS is Ubuntu 16.04. - The scripts run on the specified nodes in the cluster _after_ Spark has been installed. - The scripts execute in the order provided -- If a script directory is provided, order of execution is not guarenteed +- If a script directory is provided, order of execution is not guaranteed - The environment variable $SPARK_HOME points to the root Spark directory. - The environment variable $IS\_MASTER identifies if this is the node running the master role. The node running the master role _also_ runs a worker role on it. - The Spark cluster is set up using Standalone Mode diff --git a/docs/12-docker-image.md b/docs/12-docker-image.md index df921ed5..2cadfea1 100644 --- a/docs/12-docker-image.md +++ b/docs/12-docker-image.md @@ -69,7 +69,7 @@ FROM my_username/my_repo:latest Please note that for this method to work, your Docker image must have been built on Ubuntu. -## Custom Docker Image Rquirements +## Custom Docker Image Requirements If you are building your own custom image and __not__ building on top of a supported image, the following requirements are necessary. Please make sure that the following environment variables are set: diff --git a/docs/13-configuration.md b/docs/13-configuration.md index b5dd7436..a701ec23 100644 --- a/docs/13-configuration.md +++ b/docs/13-configuration.md @@ -11,7 +11,7 @@ This is the default cluster configuration: # id: id: spark_cluster -# Toolkit configuration [Required] You can use `aztk toolkit` command to find which are the available tookits +# Toolkit configuration [Required] You can use `aztk toolkit` command to find which toolkits are available toolkit: software: spark version: 2.2 @@ -19,7 +19,7 @@ toolkit: # Optional version for the environment # environment_version: - # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environemnt if using default images) + # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: @@ -91,11 +91,11 @@ connect: true Running the command `aztk spark cluster ssh --id ` will ssh into the master node of the Spark cluster. 
It will also forward the Spark Job UI to localhost:4040, the Spark master's web UI to localhost:8080, and Jupyter to localhost:8888. -Note that all of the settings in ssh.yaml will be overrided by parameters passed on the command line. +Note that all of the settings in ssh.yaml will be overridden by parameters passed on the command line. ## Spark Configuration -The repository comes with default Spark configuration files which provision your Spark cluster just the same as you would locally. After running `aztk spark init` to initialize your working environment, you can view and edit these files at `.aztk/spark-defaults.conf`, `.aztk/spark-env.sh` and `.aztk/core-site.xml`. Please note that you can bring your own Spark configuration files by copying your `spark-defaults.conf`, `spark-env.sh` and `core-site.xml` into your `.aztk/` direcotry. +The repository comes with default Spark configuration files which provision your Spark cluster just the same as you would locally. After running `aztk spark init` to initialize your working environment, you can view and edit these files at `.aztk/spark-defaults.conf`, `.aztk/spark-env.sh` and `.aztk/core-site.xml`. Please note that you can bring your own Spark configuration files by copying your `spark-defaults.conf`, `spark-env.sh` and `core-site.xml` into your `.aztk/` directory. If using `aztk` job submission, please note that both `spark.shuffle.service.enabled` and `spark.dynamicAllocation.enabled` must be set to true so that the number of executors registered with an application can scale as nodes in the job's cluster come online. @@ -128,7 +128,7 @@ If using WASB, ADL or other cloud storage services, be sure to set your keys in ## Configuring Spark Storage -The Spark cluster can be configured to use different cloud supported storage offerrings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.html) documentation. +The Spark cluster can be configured to use different cloud supported storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.html) documentation. ## Placing JARS diff --git a/docs/15-plugins.md b/docs/15-plugins.md index c6d9573c..89f8288a 100644 --- a/docs/15-plugins.md +++ b/docs/15-plugins.md @@ -1,11 +1,11 @@ # Plugins -Plugins are a successor to [custom scripts](11-custom-scripts.html) and are the reconmmended way of running custom code on the cluster. +Plugins are a successor to [custom scripts](11-custom-scripts.html) and are the recommended way of running custom code on the cluster. Plugins can either be one of the Aztk [supported plugins](#supported-plugins) or the path to a [local file](#custom-script-plugin). ## Supported Plugins -AZTK ships with a library of default plugins that enable auxillary services to use with your Spark cluster. +AZTK ships with a library of default plugins that enable auxiliary services to use with your Spark cluster. Currently the following plugins are supported: @@ -18,7 +18,7 @@ Currently the following plugins are supported: - mvBLAS ### Enable a plugin using the CLI -If you are uing the `aztk` CLI and wish to enable a supported plugin, you need to update you `.aztk/cluster.yaml` configuration file. +If you are using the `aztk` CLI and wish to enable a supported plugin, you need to update you `.aztk/cluster.yaml` configuration file. 
Add or uncomment the `plugins` section and set the plugins you desire to enable as follows: ```yaml @@ -33,7 +33,7 @@ plugins: ``` ### Enable a plugin using the SDK -If you are uing the `aztk` SDK and wish to enable a supported plugin, you need to import the necessary plugins from the `aztk.spark.models.plugin` module and add them to your ClusterConfiguration object's plugin list: +If you are using the `aztk` SDK and wish to enable a supported plugin, you need to import the necessary plugins from the `aztk.spark.models.plugin` module and add them to your ClusterConfiguration object's plugin list: ```python from aztk.spark.models.plugins import RStudioServerPlugin, HDFSPlugin cluster_config = ClusterConfiguration( diff --git a/docs/20-spark-submit.md b/docs/20-spark-submit.md index 022c172e..c0c8887a 100644 --- a/docs/20-spark-submit.md +++ b/docs/20-spark-submit.md @@ -19,10 +19,10 @@ To run a remotely hosted pi.py file on a Spark cluster, specify the remote path aztk spark cluster submit --id spark --name pipy --remote wasbs://path@remote/pi.py 100 ``` -NOTE: The job name (--name) must be atleast 3 characters long, can only contain alphanumeric characters including hyphens but excluding underscores, and cannot contain uppercase letters. Each job you submit **must** have a unique name. +NOTE: The job name (--name) must be at least 3 characters long, can only contain alphanumeric characters including hyphens but excluding underscores, and cannot contain uppercase letters. Each job you submit **must** have a unique name. ## Monitoring job -If you have set up a [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port fowarding, you can naviate to http://localhost:8080 and http://localhost:4040 to view the progess of the job using the Spark UI +If you have set up a [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI ## Getting output logs diff --git a/docs/30-cloud-storage.md b/docs/30-cloud-storage.md index fdfa1517..24f6b61d 100644 --- a/docs/30-cloud-storage.md +++ b/docs/30-cloud-storage.md @@ -1,5 +1,5 @@ # Cloud storage -Cloud stoarge for spark enables you to have a persisted storage system backed by a cloud provider. Spark supports this by placing the appropriate storage jars and updating the core-site.xml file accordingly. +Cloud storage for spark enables you to have a persisted storage system backed by a cloud provider. Spark supports this by placing the appropriate storage jars and updating the core-site.xml file accordingly. ## Azure Storage Blobs (WASB) @@ -26,9 +26,9 @@ dataframe.write.csv('wasbs://MY_CONTAINER@MY_STORAGE_ACCOUNt.blob.core.windows.n Pre-built into this package is native support for connecting your Spark cluster to Azure Data Lake (aka ADL). The required ADL jars are automatically placed in the Spark cluster and the permissions are pulled from your core-site.xml file under *.aztk/core-site.xml*. -To connect to your Azure Storage account, make sure that the storage fields in your *.aztk/core-site.xml* file are properly filled out. This tool already has the the basic template for using ADL filled out inthe *.aztk/core-site.xml* file. Simply uncomment the in the "ADL (Azure Data Lake) Configuration" section and fill out the properties for MY\_AAD\_TENANT\_ID, MY\_AAD\_CLIENT\_ID and MY\_AAD\_CREDENTIAL. 
+To connect to your Azure Storage account, make sure that the storage fields in your *.aztk/core-site.xml* file are properly filled out. This tool already has the the basic template for using ADL filled out in the *.aztk/core-site.xml* file. Simply uncomment the in the "ADL (Azure Data Lake) Configuration" section and fill out the properties for MY\_AAD\_TENANT\_ID, MY\_AAD\_CLIENT\_ID and MY\_AAD\_CREDENTIAL. -Once you have correctly filled out the *.aztk/core-site.xml* with your Azure Data Lake credentials, you will be able to access your ADL stroage repositories from your Spark job. +Once you have correctly filled out the *.aztk/core-site.xml* with your Azure Data Lake credentials, you will be able to access your ADL storage repositories from your Spark job. Reading and writing to and from Azure Data Lake Storage is easily achieved by using the `adl` syntax. For example, reading a csv file using Pyspark would be: diff --git a/docs/51-define-plugin.md b/docs/51-define-plugin.md index d889101c..5fd5859a 100644 --- a/docs/51-define-plugin.md +++ b/docs/51-define-plugin.md @@ -84,7 +84,7 @@ Path to the local file you want to upload(Could form the plugins parameters) #### public | `optional` | `bool` If the port should be open publicly(Default: `False`) -## Environment variables availables in the plugin +## Environment variables available in the plugin AZTK provide a few environment variables that can be used in your plugin script @@ -93,7 +93,7 @@ AZTK provide a few environment variables that can be used in your plugin script * `AZTK_MASTER_IP`: Internal ip of the master ## Debug your plugin -When your plugin is not working as expected there is a few things you do to invesigate issues +When your plugin is not working as expected there is a few things you do to investigate issues Check the logs, you can either use the debug tool or [BatchLabs](https://github.com/Azure/BatchLabs) Navigate to `startup/wd/logs/plugins` diff --git a/docs/70-jobs.md b/docs/70-jobs.md index c9b91e29..1c0ba1bf 100644 --- a/docs/70-jobs.md +++ b/docs/70-jobs.md @@ -35,7 +35,7 @@ Each Job has one or more applications given as a List in Job.yaml. Applications ``` _Please note: the only required fields are name and application. All other fields may be removed or left blank._ -NOTE: The Applcaition name can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each application **must** have a unique name. +NOTE: The Application name can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each application **must** have a unique name. Jobs also require a definition of the cluster on which the Applications will run. The following properties define a cluster: ```yaml @@ -54,7 +54,7 @@ Jobs also require a definition of the cluster on which the Applications will run - custom - scripts ``` -_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-iamge.html)._ +_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). 
And for more information about Docker repositories see [Docker](./12-docker-image.html)._ _The only required fields are vm_size and either size or size_low_priority, all other fields can be left blank or removed._ @@ -141,7 +141,7 @@ aztk spark job delete --id ``` Deleting a Job also permanently deletes any data or logs associated with that cluster. If you wish to persist this data, use the `--keep-logs` flag. -__You are only charged for the job while it is active, Jobs handle provisioning and destorying infrastructure, so you are only charged for the time that your applications are running.__ +__You are only charged for the job while it is active, Jobs handle provisioning and destroying infrastructure, so you are only charged for the time that your applications are running.__ ### Stopping a Job diff --git a/docs/dev/docs.md b/docs/dev/docs.md index 0ef51ebc..8f57e022 100644 --- a/docs/dev/docs.md +++ b/docs/dev/docs.md @@ -1,4 +1,4 @@ -# Writting docs +# Writing docs Docs are located in the docs folder. We are using `sphinx` to generate the docs and then hosting them on `readthedocs`. diff --git a/docs/index.rst b/docs/index.rst index 68a6d51b..4bfc81df 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,7 +38,7 @@ This toolkit is built on top of Azure Batch but does not require any Azure Batch .. toctree:: :maxdepth: 2 - :caption: Developper documentation: + :caption: Developer documentation: dev/docs dev/writing-models diff --git a/tests/spark/integration_tests/sdk/cluster/test_cluster.py b/tests/spark/integration_tests/sdk/cluster/test_cluster.py index 69708f7d..37c53339 100644 --- a/tests/spark/integration_tests/sdk/cluster/test_cluster.py +++ b/tests/spark/integration_tests/sdk/cluster/test_cluster.py @@ -45,7 +45,7 @@ ) ) else: - # fallback to local secrets if envrionment variables don't exist + # fallback to local secrets if environment variables don't exist spark_client = aztk.spark.Client(config.load_aztk_secrets()) diff --git a/tests/spark/integration_tests/sdk/job/test_job.py b/tests/spark/integration_tests/sdk/job/test_job.py index 27458f60..dba40771 100644 --- a/tests/spark/integration_tests/sdk/job/test_job.py +++ b/tests/spark/integration_tests/sdk/job/test_job.py @@ -38,7 +38,7 @@ ) ) else: - # fallback to local secrets if envrionment variables don't exist + # fallback to local secrets if environment variables don't exist spark_client = aztk.spark.Client(config.load_aztk_secrets())
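The integration tests above build their client through the same path the CLI uses. A minimal sketch of that fallback branch is shown below; it assumes `aztk_cli` is importable alongside the `aztk` SDK and that `config` refers to the `aztk_cli.config` module touched earlier in the patch, so treat the import path as an assumption rather than a documented API.

```python
# Sketch only: mirrors the "fallback to local secrets" branch in the tests.
import aztk.spark
from aztk_cli import config  # import path assumed; the tests reference a `config` module

# load_aztk_secrets() merges the optional global secrets.yaml (the .aztk folder in
# your home directory) with the local ./.aztk/secrets.yaml, with local values
# taking precedence, and returns a SecretsConfiguration.
secrets = config.load_aztk_secrets()

# The tests pass the resulting secrets object straight to the Spark client.
spark_client = aztk.spark.Client(secrets)
```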