Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Databricks build scripts [databricks] #5588

Merged
merged 3 commits into from
May 24, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions jenkins/Jenkinsfile-blossom.premerge
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,18 @@ PARAM_MAP = [
ID_RUNTIME = 0
ID_SPARK = 1
ID_INITSCRIPTS = 2
ID_PROFILES = 3
ID_INSTALL = 4
ID_INSTALL = 3
RUNTIME_MAP = [
'7.3': [
'7.3.x-gpu-ml-scala2.12',
'3.0.1',
'init_cudf_udf.sh,init_cuda11_runtime.sh',
'databricks301,!snapshot-shims',
'3.0.1'
],
'9.1': [
'9.1.x-gpu-ml-scala2.12',
'3.1.2',
'init_cudf_udf.sh',
'databricks312,!snapshot-shims',
'3.1.2'
],
'10.4': [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the below databricks321,!snapshot-shims should be removed

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, my fault — the change on line 321 was accidentally reverted in my local branch. Fixed.

Expand Down Expand Up @@ -393,7 +390,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down Expand Up @@ -428,7 +424,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down Expand Up @@ -463,7 +458,6 @@ pipeline {
DATABRICKS_RUNTIME = "${RUNTIME_MAP["$DB_RUNTIME"][ID_RUNTIME]}"
BASE_SPARK_VERSION = "${RUNTIME_MAP["$DB_RUNTIME"][ID_SPARK]}"
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS = "${RUNTIME_MAP["$DB_RUNTIME"][ID_INSTALL]}"
BUILD_PROFILES = "${RUNTIME_MAP["$DB_RUNTIME"][ID_PROFILES]}"
INIT_SCRIPTS = getInitScripts("$INIT_SCRIPTS_DIR",
"${RUNTIME_MAP["$DB_RUNTIME"][ID_INITSCRIPTS]}")
}
Expand Down
19 changes: 10 additions & 9 deletions jenkins/databricks/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,16 @@

set -ex

SPARKSRCTGZ=$1
# version of Apache Spark we are building against
BASE_SPARK_VERSION=$2
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=$3
## Environments SPARKSRCTGZ, BASE_SPARK_VERSION, BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS, MVN_OPT
## can be overwritten by shell variables, e.g. "BASE_SPARK_VERSION=3.1.2 MVN_OPT=-DskipTests bash build.sh"

# Move MVN_OPT to last, as it is empty in most cases
MVN_OPT=$4
MVN_OPT=${MVN_OPT:-''}
SPARKSRCTGZ=${SPARKSRCTGZ:-''}
# version of Apache Spark we are building against
BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-'3.1.2'}
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION}
## '-Pfoo=1,-Dbar=2,...' to '-Pfoo=1 -Dbar=2 ...'
MVN_OPT=${MVN_OPT//','/' '}

BUILDVER=$(echo ${BASE_SPARK_VERSION} | sed 's/\.//g')db
# the version of Spark used when we install the Databricks jars in .m2
BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION}
Expand All @@ -36,7 +37,7 @@ SPARK_MAJOR_VERSION_STRING=spark_${SPARK_MAJOR_VERSION_NUM_STRING}

echo "tgz is $SPARKSRCTGZ"
echo "Base Spark version is $BASE_SPARK_VERSION"
echo "build profiles $MVN_OPT"
echo "maven options is $MVN_OPT"
echo "BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS is $BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS"

sudo apt install -y maven rsync
Expand Down Expand Up @@ -444,7 +445,7 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests
mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT

cd /home/ubuntu
tar -zcf spark-rapids-built.tgz spark-rapids
9 changes: 5 additions & 4 deletions jenkins/databricks/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
base_spark_pom_version = '3.1.1'
base_spark_version_to_install_databricks_jars = base_spark_pom_version
clusterid = ''
build_profiles = ''
# can take comma-separated maven options, e.g., -Pfoo=1,-Dbar=2,...
mvn_opt = ''
jar_path = ''
# `spark_conf` can take multiple comma-separated spark configurations, e.g., spark.foo=1,spark.bar=2,...'
spark_conf = ''
Expand All @@ -43,7 +44,7 @@ def usage():
' -d <scriptdestination>'
' -z <sparktgz>'
' -v <basesparkpomversion>'
' -b <buildprofiles>'
' -b <mvnoptions>'
' -j <jarpath>'
' -n <skipstartingcluster>'
' -f <sparkconf>'
Expand All @@ -60,7 +61,7 @@ def usage():
'dest=',
'sparktgz=',
'basesparkpomversion=',
'buildprofiles=',
'mvnoptions=',
'jarpath',
'sparkconf',
'sparkinstallver='])
Expand Down Expand Up @@ -89,7 +90,7 @@ def usage():
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg
elif opt in ('-b', '--bulidprofiles'):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we change --bulidprofiles to --mvnoptions?

build_profiles = arg
mvn_opt = arg
elif opt in ('-j', '--jarpath'):
jar_path = arg
elif opt in ('-f', '--sparkconf'):
Expand Down
5 changes: 4 additions & 1 deletion jenkins/databricks/run-build.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def main():
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "bash -c 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % (master_addr, params.private_key_file, params.script_dest, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.build_profiles)
ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid this mismatching, pass parameters through shell environment for the script file build.sh instead, e.g.,
SPARK_VER=3.1.2 MVN_OPT=' ' SOURCE_FILE=spark-rapids.tgz bash build.sh

"'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \
bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \
(master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

Expand Down