Add udf jar to nightly integration tests (#1420)
* Add udf jar to nightly integration tests

Signed-off-by: Jason Lowe <jlowe@nvidia.com>

* Update integration test docs for udf-examples jar

Signed-off-by: Jason Lowe <jlowe@nvidia.com>

* Skip RAPIDS UDF tests if UDF fails to load

Signed-off-by: Jason Lowe <jlowe@nvidia.com>
jlowe authored Dec 17, 2020
1 parent 42dd326 commit 2136cbe
Showing 3 changed files with 21 additions and 11 deletions.
11 changes: 6 additions & 5 deletions integration_tests/README.md
@@ -99,14 +99,15 @@ The test files are everything under `./integration_tests/src/test/resources/` B
 where you placed them because you will need to tell the tests where they are.
 
 When running these tests you will need to include the test jar, the integration test jar,
-scala-test and scalactic. You can find scala-test and scalactic under `~/.m2/repository`.
+the udf-examples jar, scala-test and scalactic. You can find scala-test and scalactic under
+`~/.m2/repository`.
 
 It is recommended that you use `spark-shell` and the scalatest shell to run each test
 individually, so you don't risk running unit tests along with the integration tests.
 http://www.scalatest.org/user_guide/using_the_scalatest_shell
 
 ```shell
-spark-shell --jars rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-0.4.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT-tests.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,rapids-4-spark-integration-tests_2.12-0.4.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
 ```
 
 First you import the `scalatest_shell` and tell the tests where they can find the test files you
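Not part of this diff: the scalatest shell step referenced above follows the pattern from the linked user guide. A minimal sketch, run inside the `spark-shell` session started above; the suite name is hypothetical, not taken from this commit:

```scala
// Any suite packaged in the tests jar works the same way; JoinsSuite is
// illustrative only.
import org.scalatest._
run(new com.nvidia.spark.rapids.JoinsSuite)
```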
@@ -130,7 +131,7 @@ If you just want to verify the SQL replacement is working you will need to add t
 example assumes CUDA 10.1 is being used.
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar" ./runtests.py
 ```
 
 You don't have to enable the plugin for this to work; the test framework will do that for you.
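Not part of this diff: assuming `runtests.py` forwards its arguments to pytest (an assumption about the wrapper, not stated in this commit), the UDF tests touched here could be targeted directly, for example:

```shell
# Hypothetical invocation: '-k rapids_udf' is a pytest keyword filter intended
# to select the tests in rapids_udf_test.py; adjust if the wrapper differs.
$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar" ./runtests.py -k rapids_udf
```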
@@ -182,7 +183,7 @@ The TPCxBB, TPCH, TPCDS, and Mortgage tests in this framework can be enabled by
 As an example, here is the `spark-submit` command with the TPCxBB parameters on CUDA 10.1:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
 ```
 
 Be aware that running these tests with real data requires at least an entire GPU, and preferably several GPUs/executors
@@ -211,7 +212,7 @@ To run the cudf_udf tests, the following configuration changes are needed:
 As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 10.1:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
 ```
 
 ## Writing tests
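Not part of this diff: for orientation, the cudf_udf suite above exercises Pandas UDFs that the plugin can accelerate on the GPU. A minimal sketch of that kind of UDF using only the standard PySpark API; the names are illustrative, not taken from the test suite:

```python
# Illustrative Pandas UDF; not code from this commit.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()

@pandas_udf("long")
def plus_one(v: pd.Series) -> pd.Series:
    # Vectorized: operates on a whole pandas Series per batch.
    return v + 1

spark.range(10).select(plus_one("id")).show()
```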
14 changes: 10 additions & 4 deletions integration_tests/src/main/python/rapids_udf_test.py
@@ -17,17 +17,24 @@
 from asserts import assert_gpu_and_cpu_are_equal_sql
 from data_gen import *
 from spark_session import with_spark_session
+from pyspark.sql.utils import AnalysisException
 
 def skip_if_no_hive(spark):
     if spark.conf.get("spark.sql.catalogImplementation") != "hive":
         pytest.skip("The Spark session does not have Hive support")
 
+def load_udf_or_skip_test(spark, udfname, udfclass):
+    spark.sql("DROP TEMPORARY FUNCTION IF EXISTS {}".format(udfname))
+    try:
+        spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format(udfname, udfclass))
+    except AnalysisException:
+        pytest.skip("UDF {} failed to load, udf-examples jar is probably missing".format(udfname))
+
 def test_hive_simple_udf():
     with_spark_session(skip_if_no_hive)
     data_gens = [["i", int_gen], ["s", StringGen('([^%]{0,1}(%[0-9A-F][0-9A-F]){0,1}){0,30}')]]
     def evalfn(spark):
-        spark.sql("DROP TEMPORARY FUNCTION IF EXISTS urldecode")
-        spark.sql("CREATE TEMPORARY FUNCTION urldecode AS 'com.nvidia.spark.rapids.udf.URLDecode'")
+        load_udf_or_skip_test(spark, "urldecode", "com.nvidia.spark.rapids.udf.URLDecode")
         return gen_df(spark, data_gens)
     assert_gpu_and_cpu_are_equal_sql(
         evalfn,
@@ -38,8 +45,7 @@ def test_hive_generic_udf():
     with_spark_session(skip_if_no_hive)
     data_gens = [["s", StringGen('.{0,30}')]]
     def evalfn(spark):
-        spark.sql("DROP TEMPORARY FUNCTION IF EXISTS urlencode")
-        spark.sql("CREATE TEMPORARY FUNCTION urlencode AS 'com.nvidia.spark.rapids.udf.URLEncode'")
+        load_udf_or_skip_test(spark, "urlencode", "com.nvidia.spark.rapids.udf.URLEncode")
         return gen_df(spark, data_gens)
     assert_gpu_and_cpu_are_equal_sql(
         evalfn,
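Not part of this diff: with the new helper in place, a future RAPIDS UDF test in this file would presumably follow the same pattern. A sketch that assumes this module's imports and helpers; the UDF name and class are hypothetical placeholders:

```python
# Hypothetical test; 'myudf' and com.example.udf.MyUdf are placeholders.
def test_hive_my_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["s", StringGen('.{0,30}')]]
    def evalfn(spark):
        # Skips (rather than fails) when the udf-examples jar is missing.
        load_udf_or_skip_test(spark, "myudf", "com.example.udf.MyUdf")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "udf_test_table",
        "SELECT myudf(s) FROM udf_test_table")
```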
7 changes: 5 additions & 2 deletions jenkins/spark-tests.sh
@@ -33,6 +33,8 @@ $MVN_GET_CMD \
     -DgroupId=ai.rapids -DartifactId=cudf -Dversion=$CUDF_VER -Dclassifier=$CUDA_CLASSIFIER
 $MVN_GET_CMD \
     -DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
+$MVN_GET_CMD \
+    -DgroupId=com.nvidia -DartifactId=rapids-4-spark-udf-examples -Dversion=$PROJECT_VER
 $MVN_GET_CMD \
     -DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
 if [ "$CUDA_CLASSIFIER"x == x ];then
@@ -41,6 +43,7 @@ else
     CUDF_JAR="$ARTF_ROOT/cudf-$CUDF_VER-$CUDA_CLASSIFIER.jar"
 fi
 RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT_VER.jar"
+RAPIDS_UDF_JAR="$ARTF_ROOT/rapids-4-spark-udf-examples-$PROJECT_VER.jar"
 RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_VER.jar"
 
 $MVN_GET_CMD \
@@ -69,8 +72,8 @@ BASE_SPARK_SUBMIT_ARGS="--master spark://$HOSTNAME:7077 \
 --executor-memory 12G \
 --total-executor-cores 6 \
 --conf spark.sql.shuffle.partitions=12 \
---conf spark.driver.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR} \
---conf spark.executor.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR} \
+--conf spark.driver.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR}:${RAPIDS_UDF_JAR} \
+--conf spark.executor.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR}:${RAPIDS_UDF_JAR} \
 --conf spark.driver.extraJavaOptions=-Duser.timezone=UTC \
 --conf spark.executor.extraJavaOptions=-Duser.timezone=UTC \
 --conf spark.sql.session.timeZone=UTC"
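Not part of this diff: a quick way to sanity-check that the udf-examples jar is actually visible to Spark, reusing the `URLDecode` class named in the test changes above (a sketch; assumes `$SPARK_HOME` and `$RAPIDS_UDF_JAR` as set in this script):

```shell
# Loading the class on the driver proves the jar is on the classpath;
# spark-shell reads the piped expression from stdin.
echo 'println(Class.forName("com.nvidia.spark.rapids.udf.URLDecode"))' | \
  $SPARK_HOME/bin/spark-shell --conf spark.driver.extraClassPath=$RAPIDS_UDF_JAR
```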
