Create non-shim specific version of ParquetCachedBatchSerializer #3473

Merged · 18 commits · Sep 14, 2021
1 change: 1 addition & 0 deletions dist/README.md
@@ -27,3 +27,4 @@ If you have to change the contents of the uber jar the following files control w

1. `unshimmed-base.txt` - this has classes and files that should go into the base jar with their normal package name (not shaded). This includes user visible classes (i.e. com/nvidia/spark/SQLPlugin), python files, and other files that aren't version specific. The Spark 3.0.1 built jar is used for these base classes.
2. `unshimmed-extras.txt` - This is applied to all of the individual Spark version jars to pull any files that need to go into the base of the jar rather than into a Spark specific directory.
3. `unshimmed-spark311.txt` - This is applied to the Spark 3.1.1 specific version jar to pull any files that need to go into the base of the jar rather than into the Spark specific directory.
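
Once the dist jar is built, a quick way to confirm the new layout is to list the jar and check that the serializer classes sit at the jar root instead of only under a `spark311/` prefix. A minimal sketch, assuming a locally built jar (the path and version below are placeholders):

```
# Placeholder path/version; point this at the actual dist build output.
DIST_JAR="dist/target/rapids-4-spark_2.12-21.10.0-SNAPSHOT.jar"

# Classes pulled in via unshimmed-spark311.txt should appear unshimmed at the jar root...
jar tf "$DIST_JAR" | grep '^com/nvidia/spark/ParquetCachedBatchSerializer'
jar tf "$DIST_JAR" | grep '^com/nvidia/spark/GpuCachedBatchSerializer'

# ...while the shimmed implementation classes stay under the per-Spark parallel worlds.
jar tf "$DIST_JAR" | grep '^spark311/' | head
```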
51 changes: 41 additions & 10 deletions dist/pom.xml
@@ -245,13 +245,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -516,13 +522,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -728,13 +740,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/spark312"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>

<unzip
@@ -1014,13 +1032,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1342,13 +1366,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1622,6 +1652,7 @@
<patternset id="sharedWorld">
<includesfile name="${project.basedir}/unshimmed-base.txt"/>
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
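For readers less familiar with Ant, a rough shell analogue of the `<unzip>` pairing above may help; this is only a sketch (paths and the version string are placeholders, and the real build relies on the Ant patternsets shown in `dist/pom.xml`):

```
# Keep the '*' entries in the unshimmed lists literal instead of letting the shell glob them.
set -f
AGG_JAR="dist/target/deps/rapids-4-spark-aggregator_2.12-21.10.0-SNAPSHOT-spark311.jar"
UNSHIMMED=$(cat dist/unshimmed-extras.txt dist/unshimmed-spark311.txt)

# Entries matching the unshimmed lists land at the root of the parallel world...
unzip -o "$AGG_JAR" $UNSHIMMED -d dist/target/parallel-world/

# ...everything else stays under the Spark 3.1.1 specific directory.
unzip -o "$AGG_JAR" -x $UNSHIMMED -d dist/target/parallel-world/spark311
```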
2 changes: 2 additions & 0 deletions dist/unshimmed-spark311.txt
@@ -0,0 +1,2 @@
com/nvidia/spark/ParquetCachedBatchSerializer*
com/nvidia/spark/GpuCachedBatchSerializer*
2 changes: 1 addition & 1 deletion docs/additional-functionality/cache-serializer.md
@@ -37,7 +37,7 @@ nav_order: 2

To use this serializer please run Spark with the following conf.
```
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer"
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer"
```
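
As a quick end-to-end check of the new non-shim class name, a small cached query can be pushed through spark-shell. A minimal sketch, assuming Spark 3.1.1+ with the RAPIDS Accelerator jar already on the driver and executor classpath (the dataset is made up for illustration):

```
spark-shell \
  --conf spark.plugins=com.nvidia.spark.SQLPlugin \
  --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer <<'EOF'
// Cache a small DataFrame so the serializer is actually exercised.
val df = spark.range(0, 1000).selectExpr("id", "id * 2 AS doubled")
df.cache()
df.count()                           // materializes the cache
df.filter("doubled > 100").count()   // reads back from the cached batches
EOF
```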


20 changes: 9 additions & 11 deletions jenkins/databricks/test.sh
@@ -59,17 +59,16 @@ IS_SPARK_311_OR_LATER=0
[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VER" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1

TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [ -d "$LOCAL_JAR_PATH" ]; then
## Run tests with jars in the LOCAL_JAR_PATH dir downloaded from the dependency repo
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
@@ -80,12 +79,11 @@ else
## Run tests with jars building from the spark-rapids source code
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
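The `PYSP_TEST_spark_sql_cache_serializer` lines above lean on the integration-test harness convention that environment variables prefixed with `PYSP_TEST_` are turned into Spark confs (underscores becoming dots), so the gated runs are expected to behave roughly like the sketch below; treat the exact flag plumbing as an assumption:

```
# Assumed equivalence: the PYSP_TEST_ prefix is stripped and '_' becomes '.',
# so this env var should surface as --conf spark.sql.cache.serializer=... inside the tests.
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
  bash ./integration_tests/run_pyspark_from_build.sh \
    --runtime_env="databricks" --test_type=nightly -k cache_test
```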
7 changes: 2 additions & 5 deletions jenkins/spark-tests.sh
@@ -69,9 +69,6 @@ IS_SPARK_311_OR_LATER=0
export SPARK_TASK_MAXFAILURES=1
[[ "$IS_SPARK_311_OR_LATER" -eq "0" ]] && SPARK_TASK_MAXFAILURES=4

IS_SPARK_311=0
[[ "$SPARK_VER" == "3.1.1" ]] && IS_SPARK_311=1

export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"

#stop and restart SPARK ETL
@@ -138,7 +135,7 @@ run_test() {

cache_serializer)
SPARK_SUBMIT_FLAGS="$BASE_SPARK_SUBMIT_ARGS $SEQ_CONF \
--conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer" \
--conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer" \
./run_pyspark_from_build.sh -k cache_test
;;

@@ -179,7 +176,7 @@ fi
run_test cudf_udf_test

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
if [[ "$IS_SPARK_311" -eq "1" ]]; then
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
run_test cache_serializer
fi

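Both test scripts gate the cache tests on `IS_SPARK_311_OR_LATER`, which jenkins/databricks/test.sh computes with the `sort -V` version-comparison idiom. A small self-contained sketch of the same check (the helper name and sample value are made up for illustration):

```
# Succeeds when $1 >= $2 under version ordering -- the same sort -V trick the scripts use.
version_ge() {
  [ "$(printf '%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ]
}

BASE_SPARK_VER=3.1.2   # illustrative value
if version_ge "$BASE_SPARK_VER" "3.1.1"; then
  echo "Spark 3.1.1 or later: run cache_test with ParquetCachedBatchSerializer"
fi
```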