Params for build and test CI scripts on Databricks
To fix NVIDIA#9903

Add params to support specifying build and test args for the CI scripts on Databricks, e.g. (a combined invocation is shown below these two examples):

    `python jenkins/databricks/run-build.py -e SKIP_DEP_INSTALL=1` : run the Maven build on Databricks and skip installing dependencies into the local Maven repo

    `python jenkins/databricks/run-tests.py -e TEST_MODE=DELTA_LAKE_ONLY` : run only the Delta Lake integration tests on Databricks
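
Since `-e` takes a comma-separated list (see `extra_envs` in jenkins/databricks/params.py), multiple variables can be passed in one flag; a hypothetical combined invocation, reusing the `foo=abc,bar=123` format from the code comments:

    `python jenkins/databricks/run-build.py -e 'foo=abc,bar=123'` : export both `foo` and `bar` into the remote build environment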

Signed-off-by: Tim Liu <timl@nvidia.com>
NvTimLiu committed Nov 30, 2023
1 parent 7c653bf commit d6b9f41
Showing 5 changed files with 40 additions and 23 deletions.
4 changes: 4 additions & 0 deletions jenkins/databricks/build.sh
@@ -130,6 +130,10 @@ install_dependencies()
##########################
# Main script starts here
##########################
## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
if [ -n "$EXTRA_ENVS" ]; then
export ${EXTRA_ENVS//','/' '}
fi

initialize
if [[ $SKIP_DEP_INSTALL == "1" ]]
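
A minimal sketch (not part of the diff) of what the new `EXTRA_ENVS` handling above does, using the `foo=abc,bar=123` value from the comment purely as an illustration:

    EXTRA_ENVS='foo=abc,bar=123'
    # ${EXTRA_ENVS//','/' '} replaces every comma with a space, so the export
    # below is equivalent to: export foo=abc bar=123
    if [ -n "$EXTRA_ENVS" ]; then
        export ${EXTRA_ENVS//','/' '}
    fi
    echo "$foo $bar"   # prints: abc 123
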
5 changes: 5 additions & 0 deletions jenkins/databricks/common_vars.sh
@@ -15,6 +15,11 @@
# limitations under the License.
#

## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
if [ -n "$EXTRA_ENVS" ]; then
export ${EXTRA_ENVS//','/' '}
fi

SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}

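
As an aside, the `SPARK_SHIM_VER` default above strips the dots from `SPARK_VER`; a quick illustration with a hypothetical version value:

    SPARK_VER=3.2.1
    echo "spark${SPARK_VER//.}db"   # prints: spark321db
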
21 changes: 14 additions & 7 deletions jenkins/databricks/params.py
@@ -26,11 +26,13 @@
base_spark_pom_version = '3.2.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,...
# can take comma separated maven options, e.g., -Pfoo=1,-Dbar=2,...
mvn_opt = ''
jar_path = ''
# `spark_conf` can take comma seperated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
# can take comma separated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...'
spark_conf = ''
# can take comma separated environments, e.g., foo=abc,bar=123,...'
extra_envs = ''


def usage():
@@ -48,11 +50,12 @@ def usage():
' -j <jarpath>'
' -n <skipstartingcluster>'
' -f <sparkconf>'
' -i <sparkinstallver>')
' -i <sparkinstallver>'
' -e <extraenvs>')


try:
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:',
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:',
['workspace=',
'token=',
'clusterid=',
@@ -62,9 +65,10 @@ def usage():
'sparktgz=',
'basesparkpomversion=',
'mvnoptions=',
'jarpath',
'sparkconf',
'sparkinstallver='])
'jarpath=',
'sparkconf=',
'sparkinstallver=',
'extraenvs='])
except getopt.GetoptError:
usage()
sys.exit(2)
Expand Down Expand Up @@ -97,6 +101,8 @@ def usage():
spark_conf = arg
elif opt in ('-i', '--sparkinstallver'):
base_spark_version_to_install_databricks_jars = arg
elif opt in ('-e', '--extraenvs'):
extra_envs = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
@@ -109,3 +115,4 @@ def usage():
print('-j is ' + jar_path)
print('-f is ' + spark_conf)
print('-i is ' + base_spark_version_to_install_databricks_jars)
print('-e is ' + extra_envs)
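
The new option can be passed in short or long form; two illustrative invocations (values reuse the ones from the commit message):

    # short form
    python jenkins/databricks/run-build.py -e SKIP_DEP_INSTALL=1
    # long form; the value format follows the foo=abc,bar=123 comment above
    python jenkins/databricks/run-tests.py --extraenvs 'TEST_MODE=DELTA_LAKE_ONLY'
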
15 changes: 8 additions & 7 deletions jenkins/databricks/run-build.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,24 +29,25 @@ def main():
print("Master node address is: %s" % master_addr)

print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

print("Copying source")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest)
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.source_tgz, master_addr, params.tgz_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \
ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s EXTRA_ENVS=%s \
bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args))
(params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.extra_envs, params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr)
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

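
A small sketch (not repo code) of how the `EXTRA_ENVS=... bash ...` prefix in the remote command behaves: env assignments placed before a command apply only to that command's environment, which is how build.sh receives the value it then re-exports:

    EXTRA_ENVS='foo=abc,bar=123' bash -c 'echo "$EXTRA_ENVS"'   # prints: foo=abc,bar=123
    echo "$EXTRA_ENVS"   # prints an empty line (assuming it was not already set in the outer shell)
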
18 changes: 9 additions & 9 deletions jenkins/databricks/run-tests.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -30,25 +30,25 @@ def main():
print("Master node address is: %s" % master_addr)

print("Copying script")
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \
" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest)
ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file
rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % \
(ssh_args, params.local_script, master_addr, params.script_dest)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell=True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s bash %s %s 2>&1 | tee testout; " \
ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \
"if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.jar_path, params.spark_conf, params.base_spark_pom_version,
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs,
params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
try:
subprocess.check_call(ssh_command, shell=True)
finally:
print("Copying test report tarball back")
report_path_prefix = params.jar_path if params.jar_path else "/home/ubuntu/spark-rapids"
rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \
" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \
(params.private_key_file, master_addr, report_path_prefix)
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \
(ssh_args, master_addr, report_path_prefix)
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

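
A minimal illustration of why the remote command checks `${PIPESTATUS[0]}` instead of `$?` after piping through `tee`:

    bash -c 'exit 3' | tee testout
    echo $?                   # prints 0: $? is tee's exit status, which masks the script failure
    bash -c 'exit 3' | tee testout
    echo "${PIPESTATUS[0]}"   # prints 3: the exit status of the left-hand command
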
