Shim layer support for Spark 3.0.0 Databricks #442

Merged: 16 commits, Jul 28, 2020
8 changes: 7 additions & 1 deletion integration_tests/pom.xml
@@ -32,7 +32,13 @@
<spark.test.version>3.0.0</spark.test.version>
</properties>
<profiles>
<profile>
<id>spark300dbtests</id>
<properties>
<spark.test.version>3.0.0-databricks</spark.test.version>
</properties>
</profile>
<profile>
<id>spark301tests</id>
<properties>
<spark.test.version>3.0.1-SNAPSHOT</spark.test.version>
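The new spark300dbtests profile points the integration tests at the Databricks build of Spark 3.0.0 instead of the default 3.0.0 test dependency. A plausible local invocation, assuming standard Maven profile activation (the exact goals and extra flags CI uses are not shown in this diff):

    mvn -B -Pspark300dbtests -pl integration_tests -am test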
2 changes: 2 additions & 0 deletions integration_tests/src/main/python/join_test.py
@@ -97,6 +97,8 @@ def do_join(spark):

# local sort because of https://github.com/NVIDIA/spark-rapids/issues/84
@ignore_order(local=True)
@pytest.mark.xfail(condition=is_databricks_runtime(),
reason='https://github.com/NVIDIA/spark-rapids/issues/441')
@pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
def test_broadcast_nested_loop_join_special_case(data_gen):
def do_join(spark):
7 changes: 3 additions & 4 deletions jenkins/Jenkinsfile.databricksnightly
@@ -44,7 +44,7 @@ pipeline {
choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
description: 'Where to deploy artifacts to')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
defaultValue: '0.2.0-SNAPSHOT', description: 'Version to set')
string(name: 'CUDF_VERSION',
defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
@@ -61,7 +61,7 @@ pipeline {
URM_CREDS = credentials("svcngcc_artifactory")
DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.0.0'
SPARK_VERSION = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
URM_URL = "${urmUrl}"
@@ -76,8 +76,7 @@
steps {
script {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "mvn -B versions:set -DnewVersion=$DATABRICKS_VERSION && git clean -d -f"
sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
sh "rm spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz *"
sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
sh "./jenkins/databricks/deploy.sh"
7 changes: 3 additions & 4 deletions jenkins/Jenkinsfile.databricksrelease
@@ -45,7 +45,7 @@ pipeline {
string(name: 'DEPLOY_TO', defaultValue: 'https://oss.sonatype.org/service/local/staging/deploy/maven2',
description: 'The repo URL where to deploy the artifacts')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
defaultValue: '0.2.0-SNAPSHOT', description: 'Version to set')
string(name: 'CUDF_VERSION',
defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
@@ -62,7 +62,7 @@
DIST_PL='dist'
SQL_PL='sql-plugin'
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.0.0'
SPARK_VERSION = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
LOCAL_URL = "${localUrl}"
@@ -73,8 +73,7 @@
steps {
script {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "mvn versions:set -DnewVersion=0.2.0-databricks && git clean -d -f"
sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
sh "rm spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz * || true"
sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
}
9 changes: 7 additions & 2 deletions jenkins/databricks/build.sh
@@ -25,6 +25,7 @@ SPARK_VERSION=$5
CUDF_VERSION=$6
CUDA_VERSION=$7
CI_CUDF_JAR=$8
BASE_SPARK_POM_VERSION=$9

echo "Spark version is $SPARK_VERSION"
echo "scala version is: $SCALA_VERSION"
@@ -40,7 +41,7 @@ rm -rf spark-rapids
mkdir spark-rapids
tar -zxvf $SPARKTGZ -C spark-rapids
cd spark-rapids
mvn -B clean package || true
mvn -B -Pdatabricks clean package -DskipTests || true
M2DIR=/home/ubuntu/.m2/repository
CUDF_JAR=${M2DIR}/ai/rapids/cudf/${CUDF_VERSION}/cudf-${CUDF_VERSION}-${CUDA_VERSION}.jar

@@ -50,13 +51,17 @@ SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
# install the 3.0.0 pom file so we get dependencies
COREPOM=spark-core_${SCALA_VERSION}-${BASE_SPARK_POM_VERSION}.pom
COREPOMPATH=$M2DIR/org/apache/spark/spark-core_${SCALA_VERSION}/${BASE_SPARK_POM_VERSION}
mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$COREJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-core_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dpackaging=jar
-Dpackaging=jar \
-DpomFile=$COREPOMPATH/$COREPOM

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
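build.sh now takes the base Apache Spark version as a ninth positional argument and uses it to locate the stock spark-core pom in the local Maven repository, so the Databricks core jar is installed together with the dependency metadata the plugin build needs. An illustrative invocation with placeholder values; the first four positional arguments are inferred from the ssh command in run-tests.py and are not shown in this hunk:

    # $1=spark-rapids tarball  $2=plugin version  $3=scala version  $4=ci rapids jar
    # $5=spark version  $6=cudf version  $7=cuda classifier  $8=ci cudf jar  $9=base spark pom version
    bash ./jenkins/databricks/build.sh ./spark-rapids-ci.tgz 0.2.0-SNAPSHOT 2.12 \
        rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar 3.0.0-databricks 0.15-SNAPSHOT \
        cuda10-1 cudf-0.14-cuda10-1.jar 3.0.0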
4 changes: 2 additions & 2 deletions jenkins/databricks/deploy.sh
@@ -24,6 +24,6 @@ cd spark-rapids
echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL="$URM_URL-local"
FPATH=./dist/target/rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
DBJARFPATH=./shims/spark300db/target/rapids-4-spark-shims-spark300-databricks_$SCALA_VERSION-$DATABRICKS_VERSION.jar
mvn -B deploy:deploy-file $MVN_URM_MIRROR -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
-Dfile=$FPATH -DpomFile=dist/pom.xml
-Dfile=$DBJARFPATH -DpomFile=shims/spark300db/pom.xml
14 changes: 9 additions & 5 deletions jenkins/databricks/run-tests.py
@@ -52,19 +52,20 @@ def main():
cudf_version = '0.15-SNAPSHOT'
cuda_version = 'cuda10-1'
ci_cudf_jar = 'cudf-0.14-cuda10-1.jar'
base_spark_pom_version = '3.0.0'

try:
opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:',
['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar='])
opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:v:',
['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar=', 'basesparkpomversion='])
except getopt.GetoptError:
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar>')
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar> -v <basesparkpomversion>')
sys.exit(2)

for opt, arg in opts:
if opt == '-h':
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar>')
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar> -v <basesparkpomversion>')
sys.exit()
elif opt in ('-s', '--workspace'):
workspace = arg
@@ -96,6 +97,8 @@ def main():
cuda_version = arg
elif opt in ('-m', '--cicudfjar'):
ci_cudf_jar = arg
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg

print('-s is ' + workspace)
print('-c is ' + clusterid)
@@ -114,6 +117,7 @@
print('-f is ' + cudf_version)
print('-u is ' + cuda_version)
print('-m is ' + ci_cudf_jar)
print('-v is ' + base_spark_pom_version)

if skip_start is None:
jsonout = cluster_state(workspace, clusterid, token)
@@ -161,7 +165,7 @@ def main():
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar)
ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar, base_spark_pom_version)
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

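run-tests.py gains a -v/--basesparkpomversion option (defaulting to 3.0.0) that is forwarded to build.sh as the extra positional argument above. A sketch of the flag in use, with placeholders for the cluster, token, and key paths; the remaining flags mirror the Jenkinsfile invocations:

    python3.6 ./jenkins/databricks/run-tests.py -c <clusterid> -z ./spark-rapids-ci.tgz \
        -t <token> -p <private-key-file> -l ./jenkins/databricks/build.sh \
        -j rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar -b 0.2.0-SNAPSHOT \
        -k 3.0.0-databricks -a 2.12 -f 0.15-SNAPSHOT -u cuda10-1 \
        -m cudf-0.14-cuda10-1.jar -v 3.0.0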
4 changes: 2 additions & 2 deletions jenkins/deploy.sh
@@ -60,9 +60,9 @@ if [ "$SIGN_FILE" == true ]; then
SQL_ART_VER=`mvn exec:exec -q -pl $SQL_PL -Dexec.executable=echo -Dexec.args='${project.version}'`
JS_FPATH="${SQL_PL}/target/${SQL_ART_ID}-${SQL_ART_VER}"
SRC_DOC_JARS="-Dsources=${JS_FPATH}-sources.jar -Djavadoc=${JS_FPATH}-javadoc.jar"
DEPLOY_CMD="mvn -B gpg:sign-and-deploy-file -s jenkins/settings.xml -Dgpg.passphrase=$GPG_PASSPHRASE"
DEPLOY_CMD="mvn -B -Pinclude-databricks gpg:sign-and-deploy-file -s jenkins/settings.xml -Dgpg.passphrase=$GPG_PASSPHRASE"
else
DEPLOY_CMD="mvn -B deploy:deploy-file -s jenkins/settings.xml"
DEPLOY_CMD="mvn -B -Pinclude-databricks deploy:deploy-file -s jenkins/settings.xml"
fi

echo "Deploy CMD: $DEPLOY_CMD"
2 changes: 1 addition & 1 deletion jenkins/spark-nightly-build.sh
@@ -19,7 +19,7 @@ set -ex

. jenkins/version-def.sh

mvn -U -B clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
mvn -U -B -Pinclude-databricks clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
# Run unit tests against other spark versions
mvn -U -B -Pspark301tests test $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
# spark310 unit tests fail - https://github.com/NVIDIA/spark-rapids/issues/382
2 changes: 1 addition & 1 deletion jenkins/spark-premerge-build.sh
@@ -37,7 +37,7 @@ export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR clean verify -Dpytest.TEST_TAGS=''
mvn -U -B $MVN_URM_MIRROR -Pinclude-databricks clean verify -Dpytest.TEST_TAGS=''
# Run the unit tests for other Spark versions but dont run full python integration tests
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark301tests test -Dpytest.TEST_TAGS=''
# spark310 unit tests fail - https://github.com/NVIDIA/spark-rapids/issues/382
26 changes: 26 additions & 0 deletions shims/aggregator/pom.xml
@@ -32,6 +32,32 @@
<description>The RAPIDS SQL plugin for Apache Spark Shim Aggregator</description>
<version>0.2.0-SNAPSHOT</version>

<profiles>
<profile>
<id>databricks</id>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</profile>
<profile>
<!-- use a separate profile to just pull databricks from maven repository without building it -->
<id>include-databricks</id>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</profile>
</profiles>

<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
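The aggregator adds two profiles that pull in the same Databricks shim dependency: databricks is for builds that also compile the shim (on a node where the Databricks Spark jars are installed locally), while include-databricks only consumes the already-published artifact from the Maven repository. Roughly, based on the commands elsewhere in this change set:

    # On a Databricks build node, after the Databricks Spark jars are installed into the local repo:
    mvn -B -Pdatabricks clean package -DskipTests
    # On a regular CI machine, pulling the previously deployed shim instead of building it:
    mvn -U -B -Pinclude-databricks clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2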
9 changes: 9 additions & 0 deletions shims/pom.xml
@@ -32,6 +32,15 @@
<description>The RAPIDS SQL plugin for Apache Spark Shims</description>
<version>0.2.0-SNAPSHOT</version>

<profiles>
<profile>
<id>databricks</id>
<modules>
<module>spark300db</module>
</modules>
</profile>
</profiles>

<modules>
<module>spark300</module>
<module>spark301</module>
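The shims parent only includes the spark300db module when the databricks profile is active, so default builds are unaffected. A minimal sketch of building just that shim, assuming the 3.0.0-databricks Spark artifacts are already in the local Maven repository:

    mvn -B -Pdatabricks -pl shims/spark300db -am clean package -DskipTests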
64 changes: 64 additions & 0 deletions shims/spark300db/pom.xml
@@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims_2.12</artifactId>
<version>0.2.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark SQL Plugin Spark 3.0.0 Databricks Shim</name>
<description>The RAPIDS SQL plugin for Apache Spark 3.0.0 Databricks Shim</description>
<version>0.2.0-SNAPSHOT</version>

<properties>
<spark30db.version>3.0.0-databricks</spark30db.version>
</properties>

<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

</project>
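The shim pom declares spark-sql, spark-catalyst, and spark-core at 3.0.0-databricks as provided dependencies. Those artifacts are not in any public repository, so they must first be installed into the local Maven repository by the install-file calls in jenkins/databricks/build.sh. A sketch of that pattern for the sql jar, following the spark-core example above (the remaining install-file invocations are truncated in this diff, so the exact flags are an assumption):

    mvn -B install:install-file \
        -Dmaven.repo.local=$M2DIR \
        -Dfile=$JARDIR/$SQLJAR \
        -DgroupId=org.apache.spark \
        -DartifactId=spark-sql_$SCALA_VERSION \
        -Dversion=$SPARK_VERSION \
        -Dpackaging=jar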
@@ -0,0 +1 @@
com.nvidia.spark.rapids.shims.spark300db.SparkShimServiceProvider