Params for build and test CI scripts on Databricks
To fix NVIDIA#9903

Add params to support specifying build and test args for the CI scripts on Databricks, e.g. (a combined invocation is shown below these two examples):

    `python jenkins/databricks/run-build.py -e SKIP_DEP_INSTALL=1` : run the Maven build on Databricks and skip installing dependencies into the local Maven repo

    `python jenkins/databricks/run-tests.py -e TEST_MODE=DELTA_LAKE_ONLY` : run only the Delta Lake integration tests on Databricks
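
Since `-e` takes a comma-separated list (see `extra_envs` in jenkins/databricks/params.py), multiple variables can be passed in one flag; a hypothetical combined invocation, reusing the `foo=abc,bar=123` format from the code comments:

    `python jenkins/databricks/run-build.py -e 'foo=abc,bar=123'` : export both `foo` and `bar` into the remote build environment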

Signed-off-by: Tim Liu <timl@nvidia.com>
NvTimLiu committed Nov 30, 2023
1 parent 7c653bf commit d6b9f41
Showing 5 changed files with 40 additions and 23 deletions.
4 changes: 4 additions & 0 deletions jenkins/databricks/build.sh
@@ -130,6 +130,10 @@ install_dependencies()
##########################
# Main script starts here
##########################
## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
if [ -n "$EXTRA_ENVS" ]; then
export ${EXTRA_ENVS//','/' '}
fi

initialize
if [[ $SKIP_DEP_INSTALL == "1" ]]
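
A minimal sketch (not part of the diff) of what the new `EXTRA_ENVS` handling above does, using the `foo=abc,bar=123` value from the comment purely as an illustration:

    EXTRA_ENVS='foo=abc,bar=123'
    # ${EXTRA_ENVS//','/' '} replaces every comma with a space, so the export
    # below is equivalent to: export foo=abc bar=123
    if [ -n "$EXTRA_ENVS" ]; then
        export ${EXTRA_ENVS//','/' '}
    fi
    echo "$foo $bar"   # prints: abc 123
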
5 changes: 5 additions & 0 deletions jenkins/databricks/common_vars.sh
@@ -15,6 +15,11 @@
# limitations under the License.
#

## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
if [ -n "$EXTRA_ENVS" ]; then
export ${EXTRA_ENVS//','/' '}
fi

SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}

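
As an aside, the `SPARK_SHIM_VER` default above strips the dots from `SPARK_VER`; a quick illustration with a hypothetical version value:

    SPARK_VER=3.2.1
    echo "spark${SPARK_VER//.}db"   # prints: spark321db
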
21 changes: 14 additions & 7 deletions jenkins/databricks/params.py
@@ -26,11 +26,13 @@
base_spark_pom_version = '3.2.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,...
# can take comma separated maven options, e.g., -Pfoo=1,-Dbar=2,...
mvn_opt = ''
jar_path = ''
# `spark_conf` can take comma seperated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
# can take comma separated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
spark_conf = ''
# can take comma separated environments, e.g., foo=abc,bar=123,...'
extra_envs = ''


def usage():
@@ -48,11 +50,12 @@ def usage():
' -j <jarpath>'
' -n <skipstartingcluster>'
' -f <sparkconf>'
' -i <sparkinstallver>')
' -i <sparkinstallver>'
' -e <extraenvs>')


try:
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:',
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:',
['workspace=',
'token=',
'clusterid=',
@@ -62,9 +65,10 @@ def usage():
'sparktgz=',
'basesparkpomversion=',
'mvnoptions=',
'jarpath',
'sparkconf',
'sparkinstallver='])
'jarpath=',
'sparkconf=',
'sparkinstallver=',
'extraenvs='])
except getopt.GetoptError:
usage()
sys.exit(2)
Expand Down Expand Up @@ -97,6 +101,8 @@ def usage():
spark_conf = arg
elif opt in ('-i', '--sparkinstallver'):
base_spark_version_to_install_databricks_jars = arg
elif opt in ('-e', '--extraenvs'):
extra_envs = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
@@ -109,3 +115,4 @@ def usage():
print('-j is ' + jar_path)
print('-f is ' + spark_conf)
print('-i is ' + base_spark_version_to_install_databricks_jars)
print('-e is ' + extra_envs)
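
The new option can be passed in short or long form; two illustrative invocations (values reuse the ones from the commit message):

    # short form
    python jenkins/databricks/run-build.py -e SKIP_DEP_INSTALL=1
    # long form; the value format follows the foo=abc,bar=123 comment above
    python jenkins/databricks/run-tests.py --extraenvs 'TEST_MODE=DELTA_LAKE_ONLY'
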
15 changes: 8 additions & 7 deletions jenkins/databricks/run-build.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,24 +29,25 @@ def main():
print("Master node address is: %s" % master_addr)

print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

print("Copying source")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest)
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.source_tgz, master_addr, params.tgz_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \
ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s EXTRA_ENVS=%s \
bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args))
(params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.extra_envs, params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr)
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

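
A small sketch (not repo code) of how the `EXTRA_ENVS=... bash ...` prefix in the remote command behaves: env assignments placed before a command apply only to that command's environment, which is how build.sh receives the value it then re-exports:

    EXTRA_ENVS='foo=abc,bar=123' bash -c 'echo "$EXTRA_ENVS"'   # prints: foo=abc,bar=123
    echo "$EXTRA_ENVS"   # prints an empty line (assuming it was not already set in the outer shell)
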
18 changes: 9 additions & 9 deletions jenkins/databricks/run-tests.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -30,25 +30,25 @@ def main():
print("Master node address is: %s" % master_addr)

print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \
" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % \
(ssh_args, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell=True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s bash %s %s 2>&1 | tee testout; " \
ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \
"if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.jar_path, params.spark_conf, params.base_spark_pom_version,
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs,
params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
try:
subprocess.check_call(ssh_command, shell=True)
finally:
print("Copying test report tarball back")
report_path_prefix = params.jar_path if params.jar_path else "/home/ubuntu/spark-rapids"
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \
" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \
(params.private_key_file, master_addr, report_path_prefix)
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \
(ssh_args, master_addr, report_path_prefix)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

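
A minimal illustration of why the remote command checks `${PIPESTATUS[0]}` instead of `$?` after piping through `tee`:

    bash -c 'exit 3' | tee testout
    echo $?                   # prints 0: $? is tee's exit status, which masks the script failure
    bash -c 'exit 3' | tee testout
    echo "${PIPESTATUS[0]}"   # prints 3: the exit status of the left-hand command
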
