Shim layer support for Spark 3.0.0 Databricks #442

Merged: 16 commits, Jul 28, 2020
8 changes: 7 additions & 1 deletion integration_tests/pom.xml
@@ -32,7 +32,13 @@
<spark.test.version>3.0.0</spark.test.version>
</properties>
<profiles>
<profile>
<id>spark300dbtests</id>
<properties>
<spark.test.version>3.0.0-databricks</spark.test.version>
</properties>
</profile>
<profile>
<id>spark301tests</id>
<properties>
<spark.test.version>3.0.1-SNAPSHOT</spark.test.version>
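The new spark300dbtests profile points the integration tests at the Databricks build of Spark 3.0.0 instead of the default 3.0.0 test dependency. A plausible local invocation, assuming standard Maven profile activation (the exact goals and extra flags CI uses are not shown in this diff):

    mvn -B -Pspark300dbtests -pl integration_tests -am test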
2 changes: 2 additions & 0 deletions integration_tests/src/main/python/join_test.py
@@ -97,6 +97,8 @@ def do_join(spark):

# local sort because of https://github.com/NVIDIA/spark-rapids/issues/84
@ignore_order(local=True)
@pytest.mark.xfail(condition=is_databricks_runtime(),
reason='https://github.com/NVIDIA/spark-rapids/issues/441')
@pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
def test_broadcast_nested_loop_join_special_case(data_gen):
def do_join(spark):
7 changes: 3 additions & 4 deletions jenkins/Jenkinsfile.databricksnightly
@@ -44,7 +44,7 @@ pipeline {
choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
description: 'Where to deploy artifacts to')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
defaultValue: '0.2.0-SNAPSHOT', description: 'Version to set')
string(name: 'CUDF_VERSION',
defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
@@ -61,7 +61,7 @@ pipeline {
URM_CREDS = credentials("svcngcc_artifactory")
DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.0.0'
SPARK_VERSION = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
URM_URL = "${urmUrl}"
@@ -76,8 +76,7 @@
steps {
script {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "mvn -B versions:set -DnewVersion=$DATABRICKS_VERSION && git clean -d -f"
sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
sh "rm spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz *"
sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
sh "./jenkins/databricks/deploy.sh"
7 changes: 3 additions & 4 deletions jenkins/Jenkinsfile.databricksrelease
@@ -45,7 +45,7 @@ pipeline {
string(name: 'DEPLOY_TO', defaultValue: 'https://oss.sonatype.org/service/local/staging/deploy/maven2',
description: 'The repo URL where to deploy the artifacts')
string(name: 'DATABRICKS_VERSION',
defaultValue: '0.2-databricks-SNAPSHOT', description: 'Version to set')
defaultValue: '0.2.0-SNAPSHOT', description: 'Version to set')
string(name: 'CUDF_VERSION',
defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
string(name: 'CUDA_VERSION',
@@ -62,7 +62,7 @@
DIST_PL='dist'
SQL_PL='sql-plugin'
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.0.0'
SPARK_VERSION = '3.0.0-databricks'
CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
LOCAL_URL = "${localUrl}"
@@ -73,8 +73,7 @@
steps {
script {
sshagent(credentials : ['svcngcc_pubpriv']) {
sh "mvn versions:set -DnewVersion=0.2.0-databricks && git clean -d -f"
sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
sh "rm spark-rapids-ci.tgz"
sh "tar -zcvf spark-rapids-ci.tgz * || true"
sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
}
9 changes: 7 additions & 2 deletions jenkins/databricks/build.sh
@@ -25,6 +25,7 @@ SPARK_VERSION=$5
CUDF_VERSION=$6
CUDA_VERSION=$7
CI_CUDF_JAR=$8
BASE_SPARK_POM_VERSION=$9

echo "Spark version is $SPARK_VERSION"
echo "scala version is: $SCALA_VERSION"
@@ -40,7 +41,7 @@ rm -rf spark-rapids
mkdir spark-rapids
tar -zxvf $SPARKTGZ -C spark-rapids
cd spark-rapids
mvn -B clean package || true
mvn -B -Pdatabricks clean package -DskipTests || true
M2DIR=/home/ubuntu/.m2/repository
CUDF_JAR=${M2DIR}/ai/rapids/cudf/${CUDF_VERSION}/cudf-${CUDF_VERSION}-${CUDA_VERSION}.jar

@@ -50,13 +51,17 @@ SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_${SCALA_VERSION}_deploy.jar
# install the 3.0.0 pom file so we get dependencies
COREPOM=spark-core_${SCALA_VERSION}-${BASE_SPARK_POM_VERSION}.pom
COREPOMPATH=$M2DIR/org/apache/spark/spark-core_${SCALA_VERSION}/${BASE_SPARK_POM_VERSION}
mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$COREJAR \
-DgroupId=org.apache.spark \
-DartifactId=spark-core_$SCALA_VERSION \
-Dversion=$SPARK_VERSION \
-Dpackaging=jar
-Dpackaging=jar \
-DpomFile=$COREPOMPATH/$COREPOM

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
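build.sh now takes the base Apache Spark version as a ninth positional argument and uses it to locate the stock spark-core pom in the local Maven repository, so the Databricks core jar is installed together with the dependency metadata the plugin build needs. An illustrative invocation with placeholder values; the first four positional arguments are inferred from the ssh command in run-tests.py and are not shown in this hunk:

    # $1=spark-rapids tarball  $2=plugin version  $3=scala version  $4=ci rapids jar
    # $5=spark version  $6=cudf version  $7=cuda classifier  $8=ci cudf jar  $9=base spark pom version
    bash ./jenkins/databricks/build.sh ./spark-rapids-ci.tgz 0.2.0-SNAPSHOT 2.12 \
        rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar 3.0.0-databricks 0.15-SNAPSHOT \
        cuda10-1 cudf-0.14-cuda10-1.jar 3.0.0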
4 changes: 2 additions & 2 deletions jenkins/databricks/deploy.sh
@@ -24,6 +24,6 @@ cd spark-rapids
echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL="$URM_URL-local"
FPATH=./dist/target/rapids-4-spark_$SCALA_VERSION-$DATABRICKS_VERSION.jar
DBJARFPATH=./shims/spark300db/target/rapids-4-spark-shims-spark300-databricks_$SCALA_VERSION-$DATABRICKS_VERSION.jar
mvn -B deploy:deploy-file $MVN_URM_MIRROR -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
-Dfile=$FPATH -DpomFile=dist/pom.xml
-Dfile=$DBJARFPATH -DpomFile=shims/spark300db/pom.xml
14 changes: 9 additions & 5 deletions jenkins/databricks/run-tests.py
@@ -52,19 +52,20 @@ def main():
cudf_version = '0.15-SNAPSHOT'
cuda_version = 'cuda10-1'
ci_cudf_jar = 'cudf-0.14-cuda10-1.jar'
base_spark_pom_version = '3.0.0'

try:
opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:',
['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar='])
opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:p:l:nd:z:j:b:k:a:f:u:m:v:',
['workspace=', 'token=', 'clusterid=', 'private=', 'nostart=', 'localscript=', 'dest=', 'sparktgz=', 'cirapidsjar=', 'databricksversion=', 'sparkversion=', 'scalaversion=', 'cudfversion=', 'cudaversion=', 'cicudfjar=', 'basesparkpomversion='])
except getopt.GetoptError:
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar>')
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino> -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar> -v <basesparkpomversion>')
sys.exit(2)

for opt, arg in opts:
if opt == '-h':
print(
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar>')
'run-tests.py -s <workspace> -t <token> -c <clusterid> -p <privatekeyfile> -n <skipstartingcluster> -l <localscript> -d <scriptdestinatino>, -z <sparktgz> -j <cirapidsjar> -b <databricksversion> -k <sparkversion> -a <scalaversion> -f <cudfversion> -u <cudaversion> -m <cicudfjar> -v <basesparkpomversion>')
sys.exit()
elif opt in ('-s', '--workspace'):
workspace = arg
@@ -96,6 +97,8 @@ def main():
cuda_version = arg
elif opt in ('-m', '--cicudfjar'):
ci_cudf_jar = arg
elif opt in ('-v', '--basesparkpomversion'):
base_spark_pom_version = arg

print('-s is ' + workspace)
print('-c is ' + clusterid)
@@ -114,6 +117,7 @@
print('-f is ' + cudf_version)
print('-u is ' + cuda_version)
print('-m is ' + ci_cudf_jar)
print('-v is ' + base_spark_pom_version)

if skip_start is None:
jsonout = cluster_state(workspace, clusterid, token)
@@ -161,7 +165,7 @@ def main():
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar)
ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s %s %s %s %s %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, private_key_file, script_dest, tgz_dest, db_version, scala_version, ci_rapids_jar, spark_version, cudf_version, cuda_version, ci_cudf_jar, base_spark_pom_version)
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

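run-tests.py gains a -v/--basesparkpomversion option (defaulting to 3.0.0) that is forwarded to build.sh as the extra positional argument above. A sketch of the flag in use, with placeholders for the cluster, token, and key paths; the remaining flags mirror the Jenkinsfile invocations:

    python3.6 ./jenkins/databricks/run-tests.py -c <clusterid> -z ./spark-rapids-ci.tgz \
        -t <token> -p <private-key-file> -l ./jenkins/databricks/build.sh \
        -j rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar -b 0.2.0-SNAPSHOT \
        -k 3.0.0-databricks -a 2.12 -f 0.15-SNAPSHOT -u cuda10-1 \
        -m cudf-0.14-cuda10-1.jar -v 3.0.0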
4 changes: 2 additions & 2 deletions jenkins/deploy.sh
@@ -60,9 +60,9 @@ if [ "$SIGN_FILE" == true ]; then
SQL_ART_VER=`mvn exec:exec -q -pl $SQL_PL -Dexec.executable=echo -Dexec.args='${project.version}'`
JS_FPATH="${SQL_PL}/target/${SQL_ART_ID}-${SQL_ART_VER}"
SRC_DOC_JARS="-Dsources=${JS_FPATH}-sources.jar -Djavadoc=${JS_FPATH}-javadoc.jar"
DEPLOY_CMD="mvn -B gpg:sign-and-deploy-file -s jenkins/settings.xml -Dgpg.passphrase=$GPG_PASSPHRASE"
DEPLOY_CMD="mvn -B -Pinclude-databricks gpg:sign-and-deploy-file -s jenkins/settings.xml -Dgpg.passphrase=$GPG_PASSPHRASE"
else
DEPLOY_CMD="mvn -B deploy:deploy-file -s jenkins/settings.xml"
DEPLOY_CMD="mvn -B -Pinclude-databricks deploy:deploy-file -s jenkins/settings.xml"
fi

echo "Deploy CMD: $DEPLOY_CMD"
2 changes: 1 addition & 1 deletion jenkins/spark-nightly-build.sh
@@ -19,7 +19,7 @@ set -ex

. jenkins/version-def.sh

mvn -U -B clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
mvn -U -B -Pinclude-databricks clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
# Run unit tests against other spark versions
mvn -U -B -Pspark301tests test $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2
# spark310 unit tests fail - https://github.com/NVIDIA/spark-rapids/issues/382
2 changes: 1 addition & 1 deletion jenkins/spark-premerge-build.sh
@@ -37,7 +37,7 @@ export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR clean verify -Dpytest.TEST_TAGS=''
mvn -U -B $MVN_URM_MIRROR -Pinclude-databricks clean verify -Dpytest.TEST_TAGS=''
# Run the unit tests for other Spark versions but dont run full python integration tests
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark301tests test -Dpytest.TEST_TAGS=''
# spark310 unit tests fail - https://github.com/NVIDIA/spark-rapids/issues/382
26 changes: 26 additions & 0 deletions shims/aggregator/pom.xml
@@ -32,6 +32,32 @@
<description>The RAPIDS SQL plugin for Apache Spark Shim Aggregator</description>
<version>0.2.0-SNAPSHOT</version>

<profiles>
<profile>
<id>databricks</id>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</profile>
<profile>
<!-- use a separate profile to just pull databricks from maven repository without building it -->
<id>include-databricks</id>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</profile>
</profiles>

<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
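The aggregator adds two profiles that pull in the same Databricks shim dependency: databricks is for builds that also compile the shim (on a node where the Databricks Spark jars are installed locally), while include-databricks only consumes the already-published artifact from the Maven repository. Roughly, based on the commands elsewhere in this change set:

    # On a Databricks build node, after the Databricks Spark jars are installed into the local repo:
    mvn -B -Pdatabricks clean package -DskipTests
    # On a regular CI machine, pulling the previously deployed shim instead of building it:
    mvn -U -B -Pinclude-databricks clean deploy $MVN_URM_MIRROR -Dmaven.repo.local=$WORKSPACE/.m2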
9 changes: 9 additions & 0 deletions shims/pom.xml
@@ -32,6 +32,15 @@
<description>The RAPIDS SQL plugin for Apache Spark Shims</description>
<version>0.2.0-SNAPSHOT</version>

<profiles>
<profile>
<id>databricks</id>
<modules>
<module>spark300db</module>
</modules>
</profile>
</profiles>

<modules>
<module>spark300</module>
<module>spark301</module>
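The shims parent only includes the spark300db module when the databricks profile is active, so default builds are unaffected. A minimal sketch of building just that shim, assuming the 3.0.0-databricks Spark artifacts are already in the local Maven repository:

    mvn -B -Pdatabricks -pl shims/spark300db -am clean package -DskipTests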
64 changes: 64 additions & 0 deletions shims/spark300db/pom.xml
@@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims_2.12</artifactId>
<version>0.2.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300-databricks_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark SQL Plugin Spark 3.0.0 Databricks Shim</name>
<description>The RAPIDS SQL plugin for Apache Spark 3.0.0 Databricks Shim</description>
<version>0.2.0-SNAPSHOT</version>

<properties>
<spark30db.version>3.0.0-databricks</spark30db.version>
</properties>

<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark30db.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

</project>
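The shim pom declares spark-sql, spark-catalyst, and spark-core at 3.0.0-databricks as provided dependencies. Those artifacts are not in any public repository, so they must first be installed into the local Maven repository by the install-file calls in jenkins/databricks/build.sh. A sketch of that pattern for the sql jar, following the spark-core example above (the remaining install-file invocations are truncated in this diff, so the exact flags are an assumption):

    mvn -B install:install-file \
        -Dmaven.repo.local=$M2DIR \
        -Dfile=$JARDIR/$SQLJAR \
        -DgroupId=org.apache.spark \
        -DartifactId=spark-sql_$SCALA_VERSION \
        -Dversion=$SPARK_VERSION \
        -Dpackaging=jar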
@@ -0,0 +1 @@
com.nvidia.spark.rapids.shims.spark300db.SparkShimServiceProvider