Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Databricks build scripts [databricks] #5588

Merged
merged 3 commits into from
May 24, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions jenkins/Jenkinsfile-blossom.premerge
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,18 @@ PARAM_MAP = [
ID_RUNTIME = 0
ID_SPARK = 1
ID_INITSCRIPTS = 2
ID_PROFILES = 3
ID_INSTALL = 4
ID_INSTALL = 3
RUNTIME_MAP = [
'7.3': [
'7.3.x-gpu-ml-scala2.12',
'3.0.1',
'init_cudf_udf.sh,init_cuda11_runtime.sh',
'databricks301,!snapshot-shims',
'3.0.1'
],
'9.1': [
'9.1.x-gpu-ml-scala2.12',
'3.1.2',
'init_cudf_udf.sh',
'databricks312,!snapshot-shims',
'3.1.2'
],
'10.4': [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the below databricks321,!snapshot-shims should be removed

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, my fault — the change on line 321 was accidentally reverted in my local branch. Fixed.

Expand Down Expand Up @@ -393,7 +390,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down Expand Up @@ -428,7 +424,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down Expand Up @@ -463,7 +458,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down
19 changes: 10 additions & 9 deletions jenkins/databricks/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,16 @@

set -ex

SPARKSRCTGZ=$1
# version of Apache Spark we are building against
BASE_SPARK_VERSION=$2
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=$3
## Environments SPARKSRCTGZ, BASE_SPARK_VERSION, BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS, MVN_OPT
## can be overwritten by shell variables, e.g. "BASE_SPARK_VERSION=3.1.2 MVN_OPT=-DskipTests bash build.sh"

# Move MVN_OPT to last, as it is empty in most cases
MVN_OPT=$4
MVN_OPT=${MVN_OPT:-''}
SPARKSRCTGZ=${SPARKSRCTGZ:-''}
# version of Apache Spark we are building against
BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-'3.1.2'}
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION}
## '-Pfoo=1,-Dbar=2,...' to '-Pfoo=1 -Dbar=2 ...'
MVN_OPT=${MVN_OPT//','/' '}

BUILDVER=$(echo ${BASE_SPARK_VERSION} | sed 's/\.//g')db
# the version of Spark used when we install the Databricks jars in .m2
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION}
Expand All @@ -36,7 +37,7 @@ SPARK_MAJOR_VERSION_STRING=spark_${SPARK_MAJOR_VERSION_NUM_STRING}

echo "tgz is $SPARKSRCTGZ"
echo "Base Spark version is $BASE_SPARK_VERSION"
echo "build profiles $MVN_OPT"
echo "maven options is $MVN_OPT"
echo "BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS is $BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS"

sudo apt install -y maven rsync
Expand Down Expand Up @@ -444,7 +445,7 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests
mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT

cd /home/ubuntu
tar -zcf spark-rapids-built.tgz spark-rapids
9 changes: 5 additions & 4 deletions jenkins/databricks/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
base_spark_pom_version = '3.1.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
build_profiles = ''
# can take comma-separated maven options, e.g., -Pfoo=1,-Dbar=2,...
mvn_opt = ''
jar_path = ''
# `spark_conf` can take multiple comma-separated spark configurations, e.g., spark.foo=1,spark.bar=2,...'
spark_conf = ''
Expand All @@ -43,7 +44,7 @@ def usage():
' -d <scriptdestination>'
' -z <sparktgz>'
' -v <basesparkpomversion>'
' -b <buildprofiles>'
' -b <mvnoptions>'
' -j <jarpath>'
' -n <skipstartingcluster>'
' -f <sparkconf>'
Expand All @@ -60,7 +61,7 @@ def usage():
'dest=',
'sparktgz=',
'basesparkpomversion=',
'buildprofiles=',
'mvnoptions=',
'jarpath',
'sparkconf',
'sparkinstallver='])
Expand Down Expand Up @@ -89,7 +90,7 @@ def usage():
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg
elif opt in ('-b', '--bulidprofiles'):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we change --bulidprofiles to --mvnoptions?

build_profiles = arg
mvn_opt = arg
elif opt in ('-j', '--jarpath'):
jar_path = arg
elif opt in ('-f', '--sparkconf'):
Expand Down
5 changes: 4 additions & 1 deletion jenkins/databricks/run-build.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def main():
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "bash -c 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % (master_addr, params.private_key_file, params.script_dest, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.build_profiles)
ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid this mismatching, pass parameters through shell environment for the script file build.sh instead, e.g.,
SPARK_VER=3.1.2 MVN_OPT=' ' SOURCE_FILE=spark-rapids.tgz bash build.sh

"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \
bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

Expand Down