Merge branch 'branch-22.04' into str_to_map
Signed-off-by: Nghia Truong <nghiatruong.vn@gmail.com>

# Conflicts:
#	integration_tests/src/main/python/map_test.py
ttnghia committed Feb 23, 2022
2 parents 4ef4a8e + f986670 commit 7c05021
Showing 96 changed files with 827 additions and 2,927 deletions.
29 changes: 24 additions & 5 deletions build/buildall
@@ -15,15 +15,23 @@
# limitations under the License.
#

-set -ex
+set -e

shopt -s extglob

+BLOOP_VERSION=${BLOOP_VERSION:-"1.4.13"}
+BLOOP_SCALA_VERSION=${BLOOP_SCALA_VERSION:-"2.13"}
+SKIP_CLEAN=1

function print_usage() {
echo "Usage: buildall [OPTION]"
echo "Options:"
echo " -h, --help"
echo " print this help message"
+echo " --debug"
+echo " enable bash -x tracing right after this option is parsed"
+echo " --clean"
+echo " include Maven clean phase"
echo " -gb, --generate-bloop"
echo " generate projects for Bloop clients: IDE (Scala Metals, IntelliJ) or Bloop CLI"
echo " -p=DIST_PROFILE, --profile=DIST_PROFILE"
@@ -50,7 +58,7 @@ function bloopInstall() {
mkdir -p "$bloop_config_dir"
rm -f "$bloop_config_dir"/*

-mvn install ch.epfl.scala:maven-bloop_2.13:1.4.9:bloopInstall -pl dist -am \
+mvn install ch.epfl.scala:maven-bloop_${BLOOP_SCALA_VERSION}:${BLOOP_VERSION}:bloopInstall -pl dist -am \
-Dbloop.configDirectory="$bloop_config_dir" \
-DdownloadSources=true \
-Dbuildver="$bv" \
@@ -102,6 +110,14 @@ case "$1" in
BUILD_PARALLEL="${1#*=}"
;;

+--debug)
+set -x
+;;
+
+--clean)
+SKIP_CLEAN="0"
+;;
+
*)
echo >&2 "Unknown arg: $1"
print_usage
@@ -145,6 +161,7 @@ case $DIST_PROFILE in
311
311cdh
312
+313
320
321
)
@@ -183,8 +200,10 @@ export BASE_VER=${SPARK_SHIM_VERSIONS[0]}
export NUM_SHIMS=${#SPARK_SHIM_VERSIONS[@]}
export BUILD_PARALLEL=${BUILD_PARALLEL:-4}

-echo Clean once across all modules
-mvn -q clean
+if [[ "$SKIP_CLEAN" != "1" ]]; then
+echo Clean once across all modules
+mvn -q clean
+fi

echo "Building a combined dist jar with Shims for ${SPARK_SHIM_VERSIONS[@]} ..."

@@ -219,7 +238,7 @@ function build_single_shim() {
-Dbuildver="$BUILD_VER" \
-Drat.skip="$SKIP_CHECKS" \
-Dmaven.javadoc.skip="$SKIP_CHECKS" \
--Dskip="$SKIP_CHECKS" \
+-Dskip \
-Dmaven.scalastyle.skip="$SKIP_CHECKS" \
-pl aggregator -am > "$LOG_FILE" 2>&1 || {
[[ "$LOG_FILE" != "/dev/tty" ]] && tail -20 "$LOG_FILE" || true
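The new flags flip two defaults: the Maven clean phase is now opt-in, and `-x` tracing is opt-in instead of always on. A hedged usage sketch, using only the option names and environment defaults visible in this diff:

```shell
# Sketch only: flags and variables come from the hunks above; DIST_PROFILE
# values depend on parts of the script not shown here.
./build/buildall --clean                                # opt back in to `mvn clean`
./build/buildall --debug                                # bash -x tracing once the flag is parsed
BLOOP_VERSION=1.4.13 ./build/buildall --generate-bloop  # env vars override the new :- defaults
```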
2 changes: 1 addition & 1 deletion dist/pom.xml
@@ -58,12 +58,12 @@
311,
311cdh,
312,
+313,
320,
321
</noSnapshot.buildvers>
<snapshot.buildvers>
304,
-313,
322,
330
</snapshot.buildvers>
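With 3.1.3 promoted from `snapshot.buildvers` to `noSnapshot.buildvers`, it can be targeted like any other released shim. A hedged sketch that reuses the `-Dbuildver` / `-pl aggregator -am` pattern from the buildall diff above:

```shell
# Assumption: single-shim builds follow the same convention buildall uses
# internally; this is not a documented command.
mvn -B package -pl aggregator -am -Dbuildver=313
```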
1 change: 1 addition & 0 deletions docs/additional-functionality/rapids-shuffle.md
@@ -293,6 +293,7 @@ In this section, we are using a docker container built using the sample dockerfile
| 3.1.1 | com.nvidia.spark.rapids.spark311.RapidsShuffleManager |
| 3.1.1 CDH | com.nvidia.spark.rapids.spark311cdh.RapidsShuffleManager |
| 3.1.2 | com.nvidia.spark.rapids.spark312.RapidsShuffleManager |
+| 3.1.3 | com.nvidia.spark.rapids.spark313.RapidsShuffleManager |
| 3.2.0 | com.nvidia.spark.rapids.spark320.RapidsShuffleManager |
| 3.2.1 | com.nvidia.spark.rapids.spark321.RapidsShuffleManager |
| Databricks 7.3| com.nvidia.spark.rapids.spark301db.RapidsShuffleManager |
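The table maps each Spark version to its shuffle manager class, which this page wires in through Spark's standard `spark.shuffle.manager` setting. A hedged example for the new 3.1.3 row:

```shell
# Sketch: the class name is from the table above; the page's other required
# RAPIDS shuffle settings are omitted for brevity.
spark-shell \
  --conf spark.shuffle.manager=com.nvidia.spark.rapids.spark313.RapidsShuffleManager
```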
40 changes: 2 additions & 38 deletions docs/additional-functionality/rapids-udfs.md
@@ -134,44 +134,8 @@ type `DECIMAL64(scale=-2)`.

## RAPIDS Accelerated UDF Examples

-Source code for examples of RAPIDS accelerated Hive UDFs is provided
-in the [udf-examples](../../udf-examples) project.
-
-### Spark Scala UDF Examples
-
-- [URLDecode](../../udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala)
-decodes URL-encoded strings using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-- [URLEncode](../../udf-examples/src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala)
-URL-encodes strings using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-
-### Spark Java UDF Examples
-
-- [URLDecode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java)
-decodes URL-encoded strings using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-- [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java)
-URL-encodes strings using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-- [CosineSimilarity](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java)
-computes the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
-between two float vectors using [native code](../../udf-examples/src/main/cpp/src)
-
-### Hive UDF Examples
-
-- [URLDecode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java)
-implements a Hive simple UDF using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-to decode URL-encoded strings
-- [URLEncode](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java)
-implements a Hive generic UDF using the
-[Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
-to URL-encode strings
-- [StringWordCount](../../udf-examples/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java)
-implements a Hive simple UDF using
-[native code](../../udf-examples/src/main/cpp/src) to count words in strings
-
+<!-- Note: should update the branch name to tag when releasing-->
+Source code for examples of RAPIDS accelerated UDFs is provided in the [udf-examples](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04/examples/RAPIDS-accelerated-UDFs) project.

## GPU Support for Pandas UDF

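Since the examples now live in a separate repository, fetching them is a clone away. A hedged sketch using the branch and path from the new link:

```shell
# URL, branch, and path are taken from the link above; per the note, the
# branch name should become a release tag eventually.
git clone --branch branch-22.04 https://github.com/NVIDIA/spark-rapids-examples.git
ls spark-rapids-examples/examples/RAPIDS-accelerated-UDFs
```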
4 changes: 2 additions & 2 deletions docs/supported_ops.md
@@ -1194,13 +1194,13 @@ Accelerator supports are described below.
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for TIMESTAMP</em></td>
<td>S</td>
-<td><em>PS<br/>max DECIMAL precision of 18</em></td>
+<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
-<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, ARRAY, MAP, STRUCT, UDT</em></td>
+<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, ARRAY, MAP, STRUCT, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
8 changes: 4 additions & 4 deletions integration_tests/README.md
@@ -245,15 +245,15 @@ The test files are everything under `./integration_tests/src/test/resources/`
where you placed them because you will need to tell the tests where they are.
When running these tests you will need to include the test jar, the integration test jar,
-the udf-examples jar, scala-test and scalactic. You can find scala-test and scalactic under
+the scala-test and scalactic. You can find scala-test and scalactic under
`~/.m2/repository`.
It is recommended that you use `spark-shell` and the scalatest shell to run each test
individually, so you don't risk running unit tests along with the integration tests.
http://www.scalatest.org/user_guide/using_the_scalatest_shell

```shell
-spark-shell --jars rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT-tests.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-integration-tests_2.12-22.04.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-22.04.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
```

First you import the `scalatest_shell` and tell the tests where they can find the test files you
@@ -276,7 +276,7 @@ If you just want to verify the SQL replacement is working you will need to add the
example assumes CUDA 11.0 is being used.

```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar" ./runtests.py
```

You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -375,7 +375,7 @@ To run cudf_udf tests, need following configuration changes:
As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0:

```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,rapids-4-spark-udf-examples_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar,cudf-22.04.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-22.04.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-22.04.0-SNAPSHOT.jar" ./runtests.py --cudf_udf
```

## Writing tests
6 changes: 1 addition & 5 deletions integration_tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -35,10 +35,6 @@ def pytest_addoption(parser):
parser.addoption(
"--cudf_udf", action='store_true', default=False, help="if true enable cudf_udf test"
)
-parser.addoption(
-"--rapids_udf_example_native", action='store_true', default=False,
-help="if true enable tests for RAPIDS UDF examples with native code"
-)
parser.addoption(
"--test_type", action='store', default="developer",
help="the type of tests that are being run to help check all the correct tests are run - developer, pre-commit, or nightly"
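With `--rapids_udf_example_native` gone, runs should pass only the options still registered in `pytest_addoption`. A hedged sketch, with jars and plugin configuration omitted as in the README examples above:

```shell
# Option names come from the conftest.py hunk above; passing the removed
# --rapids_udf_example_native flag would now be rejected.
$SPARK_HOME/bin/spark-submit ./runtests.py --test_type=nightly
```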
46 changes: 40 additions & 6 deletions integration_tests/pom.xml
@@ -60,17 +60,16 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
-<dependency>
-<groupId>com.nvidia</groupId>
-<artifactId>rapids-4-spark-udf-examples_${scala.binary.version}</artifactId>
-<version>${project.version}</version>
-<scope>test</scope>
-</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.test.version}</version>
</dependency>
+<dependency>
+<!-- for hive udf test cases -->
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-hive_${scala.binary.version}</artifactId>
+</dependency>
</dependencies>

<profiles>
@@ -108,6 +107,17 @@
<artifactId>curator-recipes</artifactId>
<version>4.3.0.7.2.7.0-184</version>
</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-hive_${scala.binary.version}</artifactId>
+<version>${spark311cdh.version}</version>
+<exclusions>
+<exclusion>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-core_${scala.binary.version}</artifactId>
+</exclusion>
+</exclusions>
+</dependency>
</dependencies>
</profile>
<profile>
@@ -178,6 +188,30 @@
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
+<dependency>
+<groupId>org.apache.hive</groupId>
+<artifactId>hive-exec</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.hive</groupId>
+<artifactId>hive-serde</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.commons</groupId>
+<artifactId>commons-io</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.hadoop</groupId>
+<artifactId>hadoop-common</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
</dependencies>
</profile>
</profiles>
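The new Hive and Hadoop artifacts are `provided`, so they must resolve from the Spark version of the shim under test. A hedged way to sanity-check resolution for one shim:

```shell
# Assumption: -Dbuildver selects the shim profile as elsewhere in this repo;
# dependency:tree is the standard Maven goal.
mvn dependency:tree -pl integration_tests -am -Dbuildver=321 | grep -E 'hive-(exec|serde)'
```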
3 changes: 1 addition & 2 deletions integration_tests/pytest.ini
@@ -1,4 +1,4 @@
-; Copyright (c) 2020-2021, NVIDIA CORPORATION.
+; Copyright (c) 2020-2022, NVIDIA CORPORATION.
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ markers =
limit(num_rows): Limit the number of rows that will be check in a result
qarun: Mark qa test
cudf_udf: Mark udf cudf test
-rapids_udf_example_native: test UDFs that require custom cuda compilation
validate_execs_in_gpu_plan([execs]): Exec class names to validate they exist in the GPU plan.
shuffle_test: Mark to include test in the RAPIDS Shuffle Manager
premerge_ci_1: Mark test that will run in first k8s pod in case of parallel build premerge job
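Registering markers in `pytest.ini` is what lets `-m` expressions select these groups, and the removed marker would now count as unknown. A hedged selection sketch; these tests are normally launched through run_pyspark_from_build.sh:

```shell
# Marker name is from the list above; the path matches the layout referenced
# in this commit (integration_tests/src/main/python).
python -m pytest -m shuffle_test integration_tests/src/main/python
```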
4 changes: 1 addition & 3 deletions integration_tests/run_pyspark_from_build.sh
@@ -42,14 +42,12 @@ else
CUDF_JARS=$(echo "$LOCAL_JAR_PATH"/cudf-*.jar)
PLUGIN_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark_*.jar)
TEST_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark-integration-tests*-$SPARK_SHIM_VER*.jar)
-UDF_EXAMPLE_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark-udf-examples*.jar)
else
CUDF_JARS=$(echo "$SCRIPTPATH"/target/dependency/cudf-*.jar)
PLUGIN_JARS=$(echo "$SCRIPTPATH"/../dist/target/rapids-4-spark_*.jar)
TEST_JARS=$(echo "$SCRIPTPATH"/target/rapids-4-spark-integration-tests*-$SPARK_SHIM_VER*.jar)
-UDF_EXAMPLE_JARS=$(echo "$SCRIPTPATH"/../udf-examples/target/rapids-4-spark-udf-examples*.jar)
fi
-ALL_JARS="$CUDF_JARS $PLUGIN_JARS $TEST_JARS $UDF_EXAMPLE_JARS"
+ALL_JARS="$CUDF_JARS $PLUGIN_JARS $TEST_JARS"
echo "AND PLUGIN JARS: $ALL_JARS"
if [[ "${TEST}" != "" ]];
then
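After this change the script globs only the cudf, plugin, and integration-test jars, so a local-jar run no longer needs a udf-examples jar. A hedged invocation using variables the script already reads:

```shell
# LOCAL_JAR_PATH and TEST appear in the script above; the directory must
# contain the three jar families that remain in ALL_JARS.
LOCAL_JAR_PATH=/path/to/jars TEST=map_test ./integration_tests/run_pyspark_from_build.sh
```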