From 2136cbe9423b4798417d9e2f694714efbb5f758f Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Thu, 17 Dec 2020 16:08:18 -0600
Subject: [PATCH] Add udf jar to nightly integration tests (#1420)

* Add udf jar to nightly integration tests

Signed-off-by: Jason Lowe

* Update integration test docs for udf-examples jar

Signed-off-by: Jason Lowe

* Skip RAPIDS UDF tests if UDF fails to load

Signed-off-by: Jason Lowe
---
 integration_tests/README.md                  | 11 ++++++-----
 .../src/main/python/rapids_udf_test.py       | 14 ++++++++++----
 jenkins/spark-tests.sh                       |  7 +++++--
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index 9c04174ccef..62dbcdabc2a 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -99,14 +99,15 @@ The test files are everything under `./integration_tests/src/test/resources/` B
 where you placed them because you will need to tell the tests where they are.
 
 When running these tests you will need to include the test jar, the integration test jar,
-scala-test and scalactic. You can find scala-test and scalactic under `~/.m2/repository`.
+the udf-examples jar, scala-test and scalactic. You can find scala-test and scalactic under
+`~/.m2/repository`.
 
 It is recommended that you use `spark-shell` and the scalatest shell to run each test
 individually, so you don't risk running unit tests along with the integration tests.
 http://www.scalatest.org/user_guide/using_the_scalatest_shell
 
 ```shell
-spark-shell --jars rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-0.4.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT-tests.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,rapids-4-spark-integration-tests_2.12-0.4.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
 ```
 
 First you import the `scalatest_shell` and tell the tests where they can find the test files you
@@ -130,7 +131,7 @@ If you just want to verify the SQL replacement is working you will need to add t
 example assumes CUDA 10.1 is being used.
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar" ./runtests.py
 ```
 
 You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -182,7 +183,7 @@ The TPCxBB, TPCH, TPCDS, and Mortgage tests in this framework can be enabled by
 As an example, here is the `spark-submit` command with the TPCxBB parameters on CUDA 10.1:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
 ```
 
 Be aware that running these tests with read data requires at least an entire GPU, and preferable several GPUs/executors
@@ -211,7 +212,7 @@ To run cudf_udf tests, need following configuration changes:
 As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 10.1:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar,rapids-4-spark-udf-examples-0.4.0-SNAPSHOT.jar,cudf-0.18-SNAPSHOT-cuda10-1.jar,rapids-4-spark-tests_2.12-0.4.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-0.4.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
 ```
 
 ## Writing tests
diff --git a/integration_tests/src/main/python/rapids_udf_test.py b/integration_tests/src/main/python/rapids_udf_test.py
index b9b4ff6180c..f0bad69eeca 100644
--- a/integration_tests/src/main/python/rapids_udf_test.py
+++ b/integration_tests/src/main/python/rapids_udf_test.py
@@ -17,17 +17,24 @@
 from asserts import assert_gpu_and_cpu_are_equal_sql
 from data_gen import *
 from spark_session import with_spark_session
+from pyspark.sql.utils import AnalysisException
 
 def skip_if_no_hive(spark):
     if spark.conf.get("spark.sql.catalogImplementation") != "hive":
         pytest.skip("The Spark session does not have Hive support")
 
+def load_udf_or_skip_test(spark, udfname, udfclass):
+    spark.sql("DROP TEMPORARY FUNCTION IF EXISTS {}".format(udfname))
+    try:
+        spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format(udfname, udfclass))
+    except AnalysisException:
+        pytest.skip("UDF {} failed to load, udf-examples jar is probably missing".format(udfname))
+
 def test_hive_simple_udf():
     with_spark_session(skip_if_no_hive)
     data_gens = [["i", int_gen], ["s", StringGen('([^%]{0,1}(%[0-9A-F][0-9A-F]){0,1}){0,30}')]]
     def evalfn(spark):
-        spark.sql("DROP TEMPORARY FUNCTION IF EXISTS urldecode")
-        spark.sql("CREATE TEMPORARY FUNCTION urldecode AS 'com.nvidia.spark.rapids.udf.URLDecode'")
+        load_udf_or_skip_test(spark, "urldecode", "com.nvidia.spark.rapids.udf.URLDecode")
         return gen_df(spark, data_gens)
     assert_gpu_and_cpu_are_equal_sql(
         evalfn,
@@ -38,8 +45,7 @@ def test_hive_generic_udf():
     with_spark_session(skip_if_no_hive)
     data_gens = [["s", StringGen('.{0,30}')]]
     def evalfn(spark):
-        spark.sql("DROP TEMPORARY FUNCTION IF EXISTS urlencode")
-        spark.sql("CREATE TEMPORARY FUNCTION urlencode AS 'com.nvidia.spark.rapids.udf.URLEncode'")
+        load_udf_or_skip_test(spark, "urlencode", "com.nvidia.spark.rapids.udf.URLEncode")
         return gen_df(spark, data_gens)
     assert_gpu_and_cpu_are_equal_sql(
         evalfn,
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 8dca6a1e9bd..65b41a6155b 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -33,6 +33,8 @@ $MVN_GET_CMD \
     -DgroupId=ai.rapids -DartifactId=cudf -Dversion=$CUDF_VER -Dclassifier=$CUDA_CLASSIFIER
 $MVN_GET_CMD \
     -DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
+$MVN_GET_CMD \
+    -DgroupId=com.nvidia -DartifactId=rapids-4-spark-udf-examples -Dversion=$PROJECT_VER
 $MVN_GET_CMD \
     -DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
 if [ "$CUDA_CLASSIFIER"x == x ];then
@@ -41,6 +43,7 @@ else
     CUDF_JAR="$ARTF_ROOT/cudf-$CUDF_VER-$CUDA_CLASSIFIER.jar"
 fi
 RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT_VER.jar"
+RAPIDS_UDF_JAR="$ARTF_ROOT/rapids-4-spark-udf-examples-$PROJECT_VER.jar"
 RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_VER.jar"
 
 $MVN_GET_CMD \
@@ -69,8 +72,8 @@ BASE_SPARK_SUBMIT_ARGS="--master spark://$HOSTNAME:7077 \
     --executor-memory 12G \
     --total-executor-cores 6 \
     --conf spark.sql.shuffle.partitions=12 \
-    --conf spark.driver.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR} \
-    --conf spark.executor.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR} \
+    --conf spark.driver.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR}:${RAPIDS_UDF_JAR} \
+    --conf spark.executor.extraClassPath=${CUDF_JAR}:${RAPIDS_PLUGIN_JAR}:${RAPIDS_UDF_JAR} \
     --conf spark.driver.extraJavaOptions=-Duser.timezone=UTC \
     --conf spark.executor.extraJavaOptions=-Duser.timezone=UTC \
     --conf spark.sql.session.timeZone=UTC"
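
For manual verification that the udf-examples jar is wired up correctly, below is a
minimal PySpark sketch of the pattern the new `load_udf_or_skip_test` helper
automates. Only the `CREATE TEMPORARY FUNCTION` statement and the
`com.nvidia.spark.rapids.udf.URLDecode` class name come from this patch; the session
settings, app name, and sample input string are illustrative assumptions.

```python
# Hypothetical smoke test, assuming Spark was launched with the udf-examples jar
# on the driver and executor classpath (e.g. via --jars, as in the README above).
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

spark = (SparkSession.builder
         .appName("udf-examples-smoke")  # assumed app name
         .enableHiveSupport()            # temporary Hive UDFs need the Hive catalog
         .getOrCreate())

try:
    # The same registration statement the tests issue via load_udf_or_skip_test
    spark.sql("CREATE TEMPORARY FUNCTION urldecode "
              "AS 'com.nvidia.spark.rapids.udf.URLDecode'")
except AnalysisException:
    # Mirrors the patch's skip path: the UDF class was not found on the classpath
    print("udf-examples jar is probably missing from the classpath")
else:
    # 'hello%20world' is an illustrative input; URLDecode reverses URL encoding
    spark.sql("SELECT urldecode('hello%20world') AS decoded").show()
```

The helper's leading `DROP TEMPORARY FUNCTION IF EXISTS` keeps registration
idempotent, so a test can re-run in an already-used Spark session without tripping
over a function left behind by a previous run.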