From d6b9f41a1d82bc6241e35b620130f6c366c768b9 Mon Sep 17 00:00:00 2001
From: Tim Liu
Date: Thu, 30 Nov 2023 15:33:32 +0800
Subject: [PATCH] Params for build and test CI scripts on Databricks

To fix https://github.com/NVIDIA/spark-rapids/issues/9903

Add params to support specifying build and test args for the CI scripts on Databricks, e.g.

`python jenkins/databricks/run-build.py -e SKIP_DEP_INSTALL=1` :
    Run the Maven build on Databricks and skip installing dependencies into the Maven repo

`python jenkins/databricks/run-tests.py -e TEST_MODE=DELTA_LAKE_ONLY` :
    Run only the Delta Lake integration tests on Databricks

Signed-off-by: Tim Liu
---
 jenkins/databricks/build.sh       |  4 ++++
 jenkins/databricks/common_vars.sh |  5 +++++
 jenkins/databricks/params.py      | 21 ++++++++++++++-------
 jenkins/databricks/run-build.py   | 15 ++++++++-------
 jenkins/databricks/run-tests.py   | 18 +++++++++---------
 5 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index a68b272257b9..a1c7f2c2e392 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -130,6 +130,10 @@ install_dependencies()
 ##########################
 # Main script starts here
 ##########################
+## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
+if [ -n "$EXTRA_ENVS" ]; then
+    export ${EXTRA_ENVS//','/' '}
+fi
 
 initialize
 if [[ $SKIP_DEP_INSTALL == "1" ]]
diff --git a/jenkins/databricks/common_vars.sh b/jenkins/databricks/common_vars.sh
index 5f02cbd94393..805eb989c539 100644
--- a/jenkins/databricks/common_vars.sh
+++ b/jenkins/databricks/common_vars.sh
@@ -15,6 +15,11 @@
 # limitations under the License.
 #
 
+## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
+if [ -n "$EXTRA_ENVS" ]; then
+    export ${EXTRA_ENVS//','/' '}
+fi
+
 SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
 export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}
 
diff --git a/jenkins/databricks/params.py b/jenkins/databricks/params.py
index 22a36fdf7c89..dce2436a6e6f 100644
--- a/jenkins/databricks/params.py
+++ b/jenkins/databricks/params.py
@@ -26,11 +26,13 @@
 base_spark_pom_version = '3.2.1'
 base_spark_version_to_install_databricks_jars = base_spark_pom_version
 clusterid = ''
-# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,...
+# can take comma separated maven options, e.g., -Pfoo=1,-Dbar=2,...
 mvn_opt = ''
 jar_path = ''
-# `spark_conf` can take comma seperated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
+# can take comma separated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
 spark_conf = ''
+# can take comma separated environments, e.g., foo=abc,bar=123,...'
+extra_envs = ''
 
 
 def usage():
@@ -48,11 +50,12 @@ def usage():
           ' -j '
           ' -n '
           ' -f '
-          ' -i ')
+          ' -i '
+          ' -e ')
 
 
 try:
-    opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:',
+    opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:',
                                       ['workspace=',
                                        'token=',
                                        'clusterid=',
@@ -62,9 +65,10 @@ def usage():
                                        'sparktgz=',
                                        'basesparkpomversion=',
                                        'mvnoptions=',
-                                       'jarpath',
-                                       'sparkconf',
-                                       'sparkinstallver='])
+                                       'jarpath=',
+                                       'sparkconf=',
+                                       'sparkinstallver=',
+                                       'extraenvs='])
 except getopt.GetoptError:
     usage()
     sys.exit(2)
@@ -97,6 +101,8 @@ def usage():
         spark_conf = arg
     elif opt in ('-i', '--sparkinstallver'):
         base_spark_version_to_install_databricks_jars = arg
+    elif opt in ('-e', '--extraenvs'):
+        extra_envs = arg
 
 print('-w is ' + workspace)
 print('-c is ' + clusterid)
@@ -109,3 +115,4 @@ def usage():
 print('-j is ' + jar_path)
 print('-f is ' + spark_conf)
 print('-i is ' + base_spark_version_to_install_databricks_jars)
+print('-e is ' + extra_envs)
diff --git a/jenkins/databricks/run-build.py b/jenkins/databricks/run-build.py
index 38c349237aae..277c4f7024c8 100644
--- a/jenkins/databricks/run-build.py
+++ b/jenkins/databricks/run-build.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,24 +29,25 @@ def main():
     print("Master node address is: %s" % master_addr)
 
     print("Copying script")
-    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
+    ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file
+    rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.local_script, master_addr, params.script_dest)
     print("rsync command: %s" % rsync_command)
     subprocess.check_call(rsync_command, shell = True)
 
     print("Copying source")
-    rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest)
+    rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.source_tgz, master_addr, params.tgz_dest)
     print("rsync command: %s" % rsync_command)
     subprocess.check_call(rsync_command, shell = True)
 
-    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
-        "'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \
+    ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
+        "'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s EXTRA_ENVS=%s \
         bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \
-        (master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args))
+        (params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.extra_envs, params.script_dest, ' '.join(params.script_args))
     print("ssh command: %s" % ssh_command)
     subprocess.check_call(ssh_command, shell = True)
 
print("Copying built tarball back") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr) + rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr) print("rsync command to get built tarball: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True) diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py index 19710f9bb28d..cd0f8f0e04c3 100644 --- a/jenkins/databricks/run-tests.py +++ b/jenkins/databricks/run-tests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,15 +30,16 @@ def main(): print("Master node address is: %s" % master_addr) print("Copying script") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \ - " %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest) + ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file + rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % \ + (ssh_args, params.local_script, master_addr, params.script_dest) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell=True) - ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \ - "'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s bash %s %s 2>&1 | tee testout; " \ + ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \ + "'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \ "if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \ - (master_addr, params.private_key_file, params.jar_path, params.spark_conf, params.base_spark_pom_version, + (params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs, params.script_dest, ' '.join(params.script_args)) print("ssh command: %s" % ssh_command) try: @@ -46,9 +47,8 @@ def main(): finally: print("Copying test report tarball back") report_path_prefix = params.jar_path if params.jar_path else "/home/ubuntu/spark-rapids" - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \ - " ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \ - (params.private_key_file, master_addr, report_path_prefix) + rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \ + (ssh_args, master_addr, report_path_prefix) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True)