From 96c90b9bf648fe2c10ce508a41dca38270b30dea Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Mon, 21 Mar 2022 12:43:19 -0500
Subject: [PATCH] Remove support for Spark 3.0.x [databricks] (#4988)

* Remove support for Spark 3.0.x

Signed-off-by: Jason Lowe

* Remove more references to 301 shim and checks for 311 or later

* Fix test_generic_reductions
---
 CONTRIBUTING.md | 19 +-
 api_validation/README.md | 8 +-
 api_validation/auditAllVersions.sh | 3 +-
 api_validation/pom.xml | 13 -
 .../spark/rapids/api/ApiValidation.scala | 2 +-
 build/buildall | 14 +-
 build/coverage-report | 4 +-
 dist/README.md | 10 +-
 dist/maven-antrun/build-parallel-worlds.xml | 4 +-
 dist/pom.xml | 8 +-
 ...txt => unshimmed-common-from-spark311.txt} | 0
 docs/FAQ.md | 6 +-
 .../rapids-shuffle.md | 13 -
 docs/configs.md | 4 +-
 docs/dev/README.md | 2 +-
 .../get-started/getting-started-databricks.md | 6 +-
 docs/supported_ops.md | 715 ++++++++----------
 .../src/main/python/arithmetic_ops_test.py | 3 +-
 .../src/main/python/array_test.py | 28 +-
 .../src/main/python/date_time_test.py | 5 +-
 .../src/main/python/explain_test.py | 3 +-
 .../src/main/python/hash_aggregate_test.py | 39 +-
 integration_tests/src/main/python/map_test.py | 22 +-
 .../src/main/python/parquet_write_test.py | 33 +-
 .../src/main/python/repart_test.py | 4 +-
 .../src/main/python/sort_test.py | 7 +-
 .../src/main/python/spark_session.py | 3 -
 .../src/main/python/string_test.py | 19 +-
 jenkins/databricks/build.sh | 36 +-
 jenkins/databricks/deploy.sh | 4 +-
 jenkins/databricks/params.py | 4 +-
 jenkins/deploy.sh | 2 +-
 jenkins/spark-nightly-build.sh | 2 +-
 jenkins/spark-premerge-build.sh | 11 +-
 jenkins/spark-tests.sh | 2 +-
 jenkins/version-def.sh | 4 +-
 pom.xml | 356 +--------
 .../RapidsShuffleInternalManager.scala | 79 --
 .../spark301/SparkShimServiceProvider.scala | 32 -
 .../spark301/RapidsShuffleManager.scala | 26 -
 .../com/nvidia/spark/rapids/SparkShims.scala | 24 -
 .../nvidia/spark/rapids/shims/AQEUtils.scala | 28 -
 .../rapids/shims/AggregationTagging.scala | 22 -
 .../rapids/shims/GpuHashPartitioning.scala | 45 --
 .../spark/rapids/shims/GpuJoinUtils.scala | 31 -
 .../rapids/shims/GpuRegExpReplaceExec.scala | 79 --
 .../shims/GpuRunningWindowExecMeta.scala | 39 -
 .../rapids/shims/GpuWindowInPandasExec.scala | 78 --
 .../shims/ShimBroadcastExchangeLike.scala | 38 -
 .../spark/rapids/shims/Spark30XdbShims.scala | 693 -----------------
 .../rapids/shims/Spark30XdbShimsBase.scala | 135 ----
 .../RapidsShuffleInternalManager.scala | 78 --
 .../spark301db/SparkShimServiceProvider.scala | 33 -
 .../spark301db/RapidsShuffleManager.scala | 26 -
 .../rapids/shims/GpuShuffleExchangeExec.scala | 58 --
 .../rapids/shims/ShuffledBatchRDDUtil.scala | 107 ---
 .../shims/GpuFlatMapGroupsInPandasExec.scala | 167 ----
 .../sql/rapids/shims/GpuFileScanRDD.scala | 195 -----
 .../spark/rapids/shims/SparkShims.scala | 41 -
 .../spark/rapids/shims/GpuOrcScan.scala | 69 --
 .../spark/rapids/shims/GpuParquetScan.scala | 71 --
 .../rapids/shims/GpuRowBasedScalaUDF.scala | 76 --
 .../shims/OffsetWindowFunctionMeta.scala | 67 --
 .../spark/sql/rapids/aggregate/GpuSum.scala | 21 -
 .../sql/rapids/shims/GpuSchemaUtils.scala | 31 -
 .../spark/rapids/shims/GpuJoinUtils.scala | 31 -
 .../rapids/shims/GpuRegExpReplaceMeta.scala | 79 --
 .../spark/rapids/shims/Spark30XShims.scala | 393 ----------
 .../rapids/shims/GpuShuffleExchangeExec.scala | 56 --
 .../rapids/shims/ShuffledBatchRDDUtil.scala | 107 ---
 .../rapids/shims/Spark301until320Shims.scala | 383 ----------
 .../api/python/ShimBasePythonRunner.scala | 45 --
 .../spark/rapids/shims/RapidsErrorUtils.scala | 34 -
 .../spark/sql/catalyst/csv/GpuCsvUtils.scala | 21 -
 .../sql/catalyst/json/GpuJsonUtils.scala | 21 -
 .../json/rapids/shims/FileOptionsShims.scala | 32 -
 .../RapidsShuffleInternalManager.scala | 79 --
 .../spark302/SparkShimServiceProvider.scala | 33 -
 .../spark302/RapidsShuffleManager.scala | 26 -
 .../RapidsShuffleInternalManager.scala | 80 --
 .../spark303/SparkShimServiceProvider.scala | 32 -
 .../spark303/RapidsShuffleManager.scala | 26 -
 .../com/nvidia/spark/rapids/SparkShims.scala | 44 --
 .../RapidsShuffleInternalManager.scala | 79 --
 .../spark304/SparkShimServiceProvider.scala | 32 -
 .../spark304/RapidsShuffleManager.scala | 26 -
 .../nvidia/spark/rapids/shims/AQEUtils.scala | 0
 .../rapids/shims/AggregationTagging.scala | 0
 .../rapids/shims/GpuWindowInPandasExec.scala | 0
 .../shims/ShimBroadcastExchangeLike.scala | 0
 .../shims/GpuFlatMapGroupsInPandasExec.scala | 0
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../nvidia/spark/rapids/shims/OrcShims.scala | 2 +-
 .../shims/AvoidAdaptiveTransitionToRow.scala | 0
 .../nvidia/spark/rapids/shims/HashUtils.scala | 0
 .../shims/OrcShims311until320Base.scala} | 2 +-
 .../rapids/shims/RapidsOrcScanMeta.scala | 0
 .../rapids/shims/RapidsParquetScanMeta.scala | 0
 .../rapids/shims/ShimAQEShuffleReadExec.scala | 0
 .../rapids/shims/ShimDataSourceRDD.scala | 0
 .../spark/rapids/shims/TypeSigUtil.scala | 2 +-
 .../spark/rapids/shims/YearParseUtil.scala | 0
 .../spark/rapids/shims/gpuWindows.scala | 0
 .../shims/GpuShuffleBlockResolver.scala | 0
 .../shims/storage/ShimDiskBlockManager.scala | 0
 .../execution/ShimTrampolineUtil.scala | 0
 .../rapids/shims/datetimeExpressions.scala | 0
 .../nvidia/spark/rapids/shims/OrcShims.scala | 2 +-
 .../spark/rapids/shims/Spark31XShims.scala | 349 ++++++++-
 .../api/python/ShimBasePythonRunner.scala | 0
 .../spark/rapids/shims/AnsiCheckUtil.scala | 0
 .../rapids/shims/GpuRangePartitioning.scala | 0
 .../spark/rapids/shims/GpuTypeShims.scala | 0
 .../rapids/shims/ParquetFieldIdShims.scala | 0
 .../shims/RapidsFileSourceMetaUtils.scala | 0
 .../rapids/shims/GpuHashPartitioning.scala | 0
 .../spark/rapids/shims/RapidsErrorUtils.scala | 0
 .../rapids/shims/Spark31Xuntil33XShims.scala} | 2 +-
 .../spark/sql/catalyst/csv/GpuCsvUtils.scala | 0
 .../sql/catalyst/json/GpuJsonUtils.scala | 0
 .../json/rapids/shims/FileOptionsShims.scala | 0
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../spark/rapids/shims/SparkShims.scala | 2 +-
 .../com/nvidia/spark/rapids/RapidsConf.scala | 6 +-
 .../com/nvidia/spark/rapids/ShimLoader.scala | 1 -
 .../nvidia/spark/rapids/VersionUtils.scala | 4 -
 tests/README.md | 12 +-
 .../spark/rapids/AdaptiveQueryExecSuite.scala | 9 -
 .../rapids/BroadcastNestedLoopJoinSuite.scala | 11 +-
 .../rapids/CostBasedOptimizerSuite.scala | 6 +-
 .../nvidia/spark/rapids/OrcScanSuite.scala | 3 +-
 .../rapids/SparkQueryCompareTestSuite.scala | 10 -
 136 files changed, 806 insertions(+), 5095 deletions(-)
 rename dist/{unshimmed-common-from-spark301.txt => unshimmed-common-from-spark311.txt} (100%)
 delete mode 100644 sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/RapidsShuffleInternalManager.scala
 delete mode 100644 sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/SparkShimServiceProvider.scala
 delete mode 100644
sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/spark301/RapidsShuffleManager.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/SparkShims.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceExec.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRunningWindowExecMeta.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShims.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShimsBase.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/RapidsShuffleInternalManager.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/SparkShimServiceProvider.scala delete mode 100644 sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/spark301db/RapidsShuffleManager.scala delete mode 100644 sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala delete mode 100644 sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala delete mode 100644 sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala delete mode 100644 sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/shims/GpuFileScanRDD.scala delete mode 100644 sql-plugin/src/main/301until304/scala/com/nvidia/spark/rapids/shims/SparkShims.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuOrcScan.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuParquetScan.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuRowBasedScalaUDF.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/OffsetWindowFunctionMeta.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/aggregate/GpuSum.scala delete mode 100644 sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/shims/GpuSchemaUtils.scala delete mode 100644 sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala delete mode 100644 sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceMeta.scala delete mode 100644 sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/Spark30XShims.scala delete mode 100644 sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala delete mode 100644 sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala delete mode 100644 
sql-plugin/src/main/301until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark301until320Shims.scala delete mode 100644 sql-plugin/src/main/301until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala delete mode 100644 sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala delete mode 100644 sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala delete mode 100644 sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala delete mode 100644 sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala delete mode 100644 sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/RapidsShuffleInternalManager.scala delete mode 100644 sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/SparkShimServiceProvider.scala delete mode 100644 sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/spark302/RapidsShuffleManager.scala delete mode 100644 sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/RapidsShuffleInternalManager.scala delete mode 100644 sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/SparkShimServiceProvider.scala delete mode 100644 sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/spark303/RapidsShuffleManager.scala delete mode 100644 sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/SparkShims.scala delete mode 100644 sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/RapidsShuffleInternalManager.scala delete mode 100644 sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/SparkShimServiceProvider.scala delete mode 100644 sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/spark304/RapidsShuffleManager.scala rename sql-plugin/src/main/{301+-nondb => 311+-nondb}/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala (100%) rename sql-plugin/src/main/{301+-nondb => 311+-nondb}/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala (100%) rename sql-plugin/src/main/{301+-nondb => 311+-nondb}/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala (100%) rename sql-plugin/src/main/{301+-nondb => 311+-nondb}/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala (100%) rename sql-plugin/src/main/{301+-nondb => 311+-nondb}/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/AvoidAdaptiveTransitionToRow.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/HashUtils.scala (100%) rename sql-plugin/src/main/{301until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims301until320Base.scala => 311until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims311until320Base.scala} (99%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/RapidsOrcScanMeta.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/RapidsParquetScanMeta.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/ShimAQEShuffleReadExec.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/ShimDataSourceRDD.scala (100%) rename sql-plugin/src/main/{301until320-all => 
311until320-all}/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala (98%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/YearParseUtil.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/com/nvidia/spark/rapids/shims/gpuWindows.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/org/apache/spark/rapids/shims/GpuShuffleBlockResolver.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/org/apache/spark/rapids/shims/storage/ShimDiskBlockManager.scala (100%) rename sql-plugin/src/main/{301until320-all/scala/org/apache/spark/sql => 311until320-all/scala/org/apache/spark/sql/rapids}/execution/ShimTrampolineUtil.scala (100%) rename sql-plugin/src/main/{301until320-all => 311until320-all}/scala/org/apache/spark/sql/rapids/shims/datetimeExpressions.scala (100%) rename sql-plugin/src/main/{301until320-noncdh => 311until320-noncdh}/scala/com/nvidia/spark/rapids/shims/OrcShims.scala (95%) rename sql-plugin/src/main/{301db => 311until320-nondb}/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala (100%) rename sql-plugin/src/main/{301until330-all => 311until330-all}/scala/com/nvidia/spark/rapids/shims/AnsiCheckUtil.scala (100%) rename sql-plugin/src/main/{301until330-all => 311until330-all}/scala/com/nvidia/spark/rapids/shims/GpuRangePartitioning.scala (100%) rename sql-plugin/src/main/{301until330-all => 311until330-all}/scala/com/nvidia/spark/rapids/shims/GpuTypeShims.scala (100%) rename sql-plugin/src/main/{301until330-all => 311until330-all}/scala/com/nvidia/spark/rapids/shims/ParquetFieldIdShims.scala (100%) rename sql-plugin/src/main/{301until330-all => 311until330-all}/scala/com/nvidia/spark/rapids/shims/RapidsFileSourceMetaUtils.scala (100%) rename sql-plugin/src/main/{301until330-nondb => 311until330-nondb}/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala (100%) rename sql-plugin/src/main/{301db => 311until330-nondb}/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala (100%) rename sql-plugin/src/main/{301until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark30Xuntil33XShims.scala => 311until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark31Xuntil33XShims.scala} (96%) rename sql-plugin/src/main/{301db => 311until330-nondb}/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala (100%) rename sql-plugin/src/main/{301db => 311until330-nondb}/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala (100%) rename sql-plugin/src/main/{301db => 311until330-nondb}/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala (100%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1e514b11518..22f426293ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,7 +40,7 @@ mvn verify After a successful build the RAPIDS Accelerator jar will be in the `dist/target/` directory. This will build the plugin for a single version of Spark. By default this is Apache Spark -3.0.1. To build against other versions of Spark you use the `-Dbuildver=XXX` command line option +3.1.1. To build against other versions of Spark you use the `-Dbuildver=XXX` command line option to Maven. For instance to build Spark 3.1.1 you would use: ```shell script @@ -72,12 +72,11 @@ You can also install some manually and build a combined jar. 
For instance to bui ```shell script mvn clean -mvn -Dbuildver=301 install -DskipTests -mvn -Dbuildver=302 install -Drat.skip=true -DskipTests -mvn -Dbuildver=303 install -Drat.skip=true -DskipTests mvn -Dbuildver=311 install -Drat.skip=true -DskipTests mvn -Dbuildver=312 install -Drat.skip=true -DskipTests +mvn -Dbuildver=313 install -Drat.skip=true -DskipTests mvn -Dbuildver=320 install -Drat.skip=true -DskipTests +mvn -Dbuildver=321 install -Drat.skip=true -DskipTests mvn -Dbuildver=311cdh install -Drat.skip=true -DskipTests mvn -pl dist -PnoSnapshots package -DskipTests ``` @@ -88,9 +87,9 @@ There is a build script `build/buildall` that automates the local build process. By default, it builds everything that is needed to create a distribution jar for all released (noSnapshots) Spark versions except for Databricks. Other profiles that you can pass using `--profile=` include - `snapshots` -- `minimumFeatureVersionMix` that currently includes 302, 311cdh, 312, 320 is recommended for catching incompatibilities already in the local development cycle +- `minimumFeatureVersionMix` that currently includes 311cdh, 312, 320 is recommended for catching incompatibilities already in the local development cycle -For initial quick iterations we can use `--profile=` to build a single-shim version. e.g., `--profile=301` for Spark 3.0.1. +For initial quick iterations we can use `--profile=` to build a single-shim version. e.g., `--profile=311` for Spark 3.1.1. The option `--module=` allows to limit the number of build steps. When iterating, we often don't have the need for the entire build. We may be interested in building everything necessary just to run integration tests (`--module=integration_tests`), or we may want to just rebuild the distribution jar (`--module=dist`) @@ -127,10 +126,8 @@ The version-specific directory names have one of the following forms / use cases - `src/main/312+-apache/scala`contains Scala source code for *upstream* **Apache** Spark builds, only beginning with version Spark 3.1.2, and + signifies there is no upper version boundary among the supported versions -- `src/main/302until312-all` contains code that applies to all shims between 3.0.2 *inclusive*, -3.1.2 *exclusive* -- `src/main/302to312-cdh` contains code that applies to Cloudera CDH shims between 3.0.2 *inclusive*, - 3.1.2 *inclusive* +- `src/main/311until320-all` contains code that applies to all shims between 3.1.1 *inclusive*, +3.2.0 *exclusive* - `src/main/pre320-treenode` contains shims for the Catalyst `TreeNode` class before the [children trait specialization in Apache Spark 3.2.0](https://issues.apache.org/jira/browse/SPARK-34906). - `src/main/post320-treenode` contains shims for the Catalyst `TreeNode` class after the @@ -224,7 +221,7 @@ Install [Scala Metals extension](https://scalameta.org/metals/docs/editors/vscod either locally or into a Remote-SSH extension destination depending on your target environment. When your project folder is open in VS Code, it may prompt you to import Maven project. IMPORTANT: always decline with "Don't ask again", otherwise it will overwrite the Bloop projects -generated with the default `301` profile. If you need to use a different profile, always rerun the +generated with the default `311` profile. If you need to use a different profile, always rerun the command above manually. When regenerating projects it's recommended to proceed to Metals "Build commands" View, and click: 1. 
"Restart build server" diff --git a/api_validation/README.md b/api_validation/README.md index 4f4ec5aba96..1d2d354139f 100644 --- a/api_validation/README.md +++ b/api_validation/README.md @@ -2,7 +2,7 @@ API validation script checks the compatibility of community Spark Execs and GPU Execs in the Rapids Plugin for Spark. For example: HashAggregateExec with GpuHashAggregateExec. -Script can be used to audit different versions of Spark(3.0.1 and 3.1.1) +Script can be used to audit different versions of Spark. The script prints Execs where validation fails. Validation fails when: 1) The number of parameters differ between community Spark Execs and Gpu Execs. @@ -17,11 +17,11 @@ It requires cudf, rapids-4-spark and spark jars. ``` cd api_validation -// To run validation script on all version of Spark(3.0.1 and 3.1.1) +// To run validation script on all version of Spark sh auditAllVersions.sh -// To run script on particular version we can use profile(spark301 and spark311) -mvn scala:run -P spark301 +// To run script on particular version we can use profile +mvn scala:run -P spark311 ``` # Output diff --git a/api_validation/auditAllVersions.sh b/api_validation/auditAllVersions.sh index 8b82ce5daf8..5deddacec65 100644 --- a/api_validation/auditAllVersions.sh +++ b/api_validation/auditAllVersions.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,4 @@ # limitations under the License. set -ex -mvn scala:run -P spark301 mvn scala:run -P spark311 diff --git a/api_validation/pom.xml b/api_validation/pom.xml index a5ae0de7682..d8c61704a45 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -41,19 +41,6 @@ - - spark301 - - ${spark301.version} - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - spark311 diff --git a/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala b/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala index 942e534218e..83835071bd0 100644 --- a/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala +++ b/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala @@ -70,7 +70,7 @@ object ApiValidation extends Logging { val gpuKeys = gpuExecs.keys var printNewline = false - val sparkToShimMap = Map("3.0.1" -> "spark301", "3.1.1" -> "spark311") + val sparkToShimMap = Map("3.1.1" -> "spark311") val sparkVersion = SparkShimImpl.getSparkShimVersion.toString val shimVersion = sparkToShimMap(sparkVersion) diff --git a/build/buildall b/build/buildall index b95a1270336..81e9e581f09 100755 --- a/build/buildall +++ b/build/buildall @@ -138,10 +138,6 @@ case $DIST_PROFILE in snapshots?(WithDatabricks)) SPARK_SHIM_VERSIONS=( - 301 - 302 - 303 - 304 311 311cdh 312 @@ -156,9 +152,6 @@ case $DIST_PROFILE in noSnapshots?(WithDatabricks)) SPARK_SHIM_VERSIONS=( - 301 - 302 - 303 311 311cdh 312 @@ -170,7 +163,6 @@ case $DIST_PROFILE in minimumFeatureVersionMix) SPARK_SHIM_VERSIONS=( - 302 311cdh 312 320 @@ -251,8 +243,8 @@ export -f build_single_shim # Install all the versions for DIST_PROFILE # First build the aggregator module for all SPARK_SHIM_VERSIONS in parallel skipping expensive plugins that -# - either deferred to 301 because the check is identical in all shim profiles such as scalastyle -# - or deferred to 301 because we currently 
don't require it per shim such as javadoc generation +# - either deferred to 311 because the check is identical in all shim profiles such as scalastyle +# - or deferred to 311 because we currently don't require it per shim such as javadoc generation # - or there is a dedicated step to run against a particular shim jar such as unit tests, in # the near future we will run unit tests against a combined multi-shim jar to catch classloading # regressions even before pytest-based integration_tests @@ -265,7 +257,7 @@ time ( bash -c 'build_single_shim "$@"' _ % # This used to resume from dist. However, without including aggregator in the build # the build does not properly initialize spark.version property via buildver profiles - # in the root pom, and we get a missing spark301 dependency even for --profile=312,321 + # in the root pom, and we get a missing spark311 dependency even for --profile=312,321 # where the build does not require it. Moving it to aggregator resolves this issue with # a negligible increase of the build time by ~2 seconds. joinShimBuildFrom="aggregator" diff --git a/build/coverage-report b/build/coverage-report index a4ae19aa84e..ac024b608d1 100755 --- a/build/coverage-report +++ b/build/coverage-report @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ TMP_CLASS=${TEMP_CLASS_LOC:-"./target/jacoco_classes/"} HTML_LOC=${HTML_LOCATION:="./target/jacoco-report/"} XML_LOC=${XML_LOCATION:="${HTML_LOC}"} DIST_JAR=${RAPIDS_DIST_JAR:-$(ls ./dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)} -SPK_VER=${JACOCO_SPARK_VER:-"301"} +SPK_VER=${JACOCO_SPARK_VER:-"311"} UDF_JAR=${RAPIDS_UDF_JAR:-$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-SNAPSHOT-spark${SPK_VER}.jar | grep -v test | xargs readlink -f)} SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/:./udf-compiler/src/main/scala/"} diff --git a/dist/README.md b/dist/README.md index 7b2086b139d..6afdb59cbb9 100644 --- a/dist/README.md +++ b/dist/README.md @@ -17,22 +17,22 @@ Files are: `com.nvidia.spark.rapids.SparkShimServiceProvider.sparkNonSnapshot`, The new uber jar is structured like: -1. Base common classes are user visible classes. For these we use Spark 3.0.1 versions because they are assumed to be +1. Base common classes are user visible classes. For these we use Spark 3.1.1 versions because they are assumed to be bitwise-identical to the other shims, this assumption is subject to the future automatic validation. 2. META-INF/services. This is a file that has to list all the shim versions supported by this jar. The files talked about above for each profile are put into place here for uber jars. Although we currently do not use [ServiceLoader API](https://docs.oracle.com/javase/8/docs/api/java/util/ServiceLoader.html) we use the same service provider discovery mechanism -3. META-INF base files are from 3.0.1 - maven, LICENSE, NOTICE, etc +3. META-INF base files are from 3.1.1 - maven, LICENSE, NOTICE, etc 4. 
Spark specific directory (aka Parallel World in the jargon of [ParallelWorldClassloader](https://github.com/openjdk/jdk/blob/jdk8-b120/jaxws/src/share/jaxws_classes/com/sun/istack/internal/tools/ParallelWorldClassLoader.java)) -for each version of Spark supported in the jar, i.e., spark301/, spark302/, spark311/, etc. +for each version of Spark supported in the jar, i.e., spark311/, spark312/, spark320/, etc. If you have to change the contents of the uber jar the following files control what goes into the base jar as classes that are not shaded. -1. `unshimmed-common-from-spark301.txt` - this has classes and files that should go into the base jar with their normal +1. `unshimmed-common-from-spark311.txt` - this has classes and files that should go into the base jar with their normal package name (not shaded). This includes user visible classes (i.e., com/nvidia/spark/SQLPlugin), python files, -and other files that aren't version specific. Uses Spark 3.0.1 built jar for these base classes as explained above. +and other files that aren't version specific. Uses Spark 3.1.1 built jar for these base classes as explained above. 2. `unshimmed-from-each-spark3xx.txt` - This is applied to all the individual Spark specific version jars to pull any files that need to go into the base of the jar and not into the Spark specific directory. 3. `unshimmed-spark311.txt` - This is applied to all the Spark 3.1.1 specific version jars to pull any files that need to go diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml index 145802f71e3..32665ef4e3e 100644 --- a/dist/maven-antrun/build-parallel-worlds.xml +++ b/dist/maven-antrun/build-parallel-worlds.xml @@ -1,6 +1,6 @@ - - - ${project.basedir}/src/main/301+-nondb/scala - ${project.basedir}/src/main/301/scala - ${project.basedir}/src/main/301until304/scala - ${project.basedir}/src/main/301until310-all/scala - ${project.basedir}/src/main/301until310-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala - ${project.basedir}/src/main/pre320-treenode/scala - - - - - - - - - common - dist - integration_tests - shuffle-plugin - sql-plugin - tests - udf-compiler - api_validation - tools - aggregator - - - - release302 - - - buildver - 302 - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-profile-src-30+ - add-source - none - - - ${project.basedir}/src/main/301+-nondb/scala - ${project.basedir}/src/main/302/scala - ${project.basedir}/src/main/301until304/scala - ${project.basedir}/src/main/301until310-all/scala - ${project.basedir}/src/main/301until310-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala - ${project.basedir}/src/main/pre320-treenode/scala - - - - - - - - - ${spark302.version} - ${spark302.version} - spark302 - - - common - dist - integration_tests - shuffle-plugin - sql-plugin - tests - udf-compiler - aggregator - tools - api_validation - - - - release303 - - - buildver - 303 - - - - ${spark303.version} - ${spark303.version} - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-profile-src-30+ - add-source - none - - - 
${project.basedir}/src/main/301+-nondb/scala - ${project.basedir}/src/main/303/scala - ${project.basedir}/src/main/301until304/scala - ${project.basedir}/src/main/301until310-all/scala - ${project.basedir}/src/main/301until310-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala - ${project.basedir}/src/main/pre320-treenode/scala - - - - - - - - - common - dist - integration_tests - shuffle-plugin - sql-plugin - tests - udf-compiler - api_validation - tools - aggregator - - - - release304 - - - buildver - 304 - - - - ${spark304.version} - ${spark304.version} - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-profile-src-30+ - add-source - none - - - ${project.basedir}/src/main/301+-nondb/scala - ${project.basedir}/src/main/304/scala - ${project.basedir}/src/main/301until310-all/scala - ${project.basedir}/src/main/301until310-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala - ${project.basedir}/src/main/pre320-treenode/scala - - - - - - - - - common - dist - integration_tests - shuffle-plugin - sql-plugin - tests - udf-compiler - api_validation - tools - aggregator - - release311 + true buildver 311 @@ -333,18 +113,14 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/311-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311until320-noncdh/scala ${project.basedir}/src/main/311until320-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/pre320-treenode/scala @@ -367,67 +143,6 @@ tests-spark310+ - - - release301db - - - buildver - 301db - - - - - 3.4.4 - spark301db - spark301db - - ${spark301db.version} - ${spark301db.version} - 2.7.4 - true - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-profile-src-31+ - add-source - none - - - ${project.basedir}/src/main/301db/scala - ${project.basedir}/src/main/301until310-all/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/pre320-treenode/scala - - - - - - - - - common - dist - integration_tests - shuffle-plugin - sql-plugin - tests - udf-compiler - aggregator - - release312db @@ -465,14 +180,12 @@ none - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala ${project.basedir}/src/main/312db/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-db/scala + ${project.basedir}/src/main/311until320-all/scala 
${project.basedir}/src/main/311until320-noncdh/scala + ${project.basedir}/src/main/311until330-all/scala ${project.basedir}/src/main/31xdb/scala ${project.basedir}/src/main/post320-treenode/scala @@ -517,18 +230,15 @@ none - ${project.basedir}/src/main/301+-nondb/scala + ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/312-nondb/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311until320-noncdh/scala ${project.basedir}/src/main/311until320-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/pre320-treenode/scala @@ -575,18 +285,14 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/313/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311until320-noncdh/scala ${project.basedir}/src/main/311until320-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/pre320-treenode/scala @@ -633,18 +339,14 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/314/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-noncdh/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311until320-noncdh/scala ${project.basedir}/src/main/311until320-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/pre320-treenode/scala @@ -701,12 +403,11 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/320/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/320/scala ${project.basedir}/src/main/320+/scala ${project.basedir}/src/main/320+-nondb/scala @@ -766,12 +467,11 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/321/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala 
${project.basedir}/src/main/320+/scala ${project.basedir}/src/main/320+-nondb/scala ${project.basedir}/src/main/320until330-all/scala @@ -831,12 +531,11 @@ generate-sources - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/322/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/320+/scala ${project.basedir}/src/main/320+-nondb/scala ${project.basedir}/src/main/321+/scala @@ -910,7 +609,7 @@ ${project.basedir}/src/main/321db/scala - ${project.basedir}/src/main/301until330-all/scala + ${project.basedir}/src/main/311until330-all/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-db/scala ${project.basedir}/src/main/320+/scala @@ -959,7 +658,6 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/330/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala @@ -1022,18 +720,15 @@ none - ${project.basedir}/src/main/301+-nondb/scala ${project.basedir}/src/main/311-nondb/scala ${project.basedir}/src/main/311cdh/scala - ${project.basedir}/src/main/301until320-all/scala - ${project.basedir}/src/main/301until320-nondb/scala - ${project.basedir}/src/main/301until330-all/scala - ${project.basedir}/src/main/301until330-nondb/scala ${project.basedir}/src/main/311+-all/scala ${project.basedir}/src/main/311+-nondb/scala ${project.basedir}/src/main/311cdh/scala ${project.basedir}/src/main/311until320-all/scala ${project.basedir}/src/main/311until320-nondb/scala + ${project.basedir}/src/main/311until330-all/scala + ${project.basedir}/src/main/311until330-nondb/scala ${project.basedir}/src/main/pre320-treenode/scala @@ -1106,11 +801,11 @@ - 301 + 311 1.8 1.8 8 - ${spark301.version} + ${spark311.version} ${spark.version} spark${buildver} cuda11 @@ -1136,11 +831,6 @@ If you update a dependency version so it is no longer a SNAPSHOT please update the snapshot-shims profile as well so it is accurate --> 2.4.8 - 3.0.1 - 3.0.1-databricks - 3.0.2 - 3.0.3 - 3.0.4-SNAPSHOT 3.1.1 3.1.1.3.1.7270.0-253 3.1.2 diff --git a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/RapidsShuffleInternalManager.scala b/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/RapidsShuffleInternalManager.scala deleted file mode 100644 index 47808ea22d0..00000000000 --- a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/RapidsShuffleInternalManager.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.rapids.shims.spark301 - -import org.apache.spark.{SparkConf, TaskContext} -import org.apache.spark.shuffle._ -import org.apache.spark.sql.rapids.{ProxyRapidsShuffleInternalManagerBase, RapidsShuffleInternalManagerBase} - -/** - * A shuffle manager optimized for the RAPIDS Plugin For Apache Spark. - * @note This is an internal class to obtain access to the private - * `ShuffleManager` and `SortShuffleManager` classes. - */ -class RapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends RapidsShuffleInternalManagerBase(conf, isDriver) { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, startMapIndex, endMapIndex, startPartition, endPartition, context, - metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, 0, Int.MaxValue, startPartition, endPartition, context, metrics) - } - -} - -class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends ProxyRapidsShuffleInternalManagerBase(conf, isDriver) with ShuffleManager { - - override def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): org.apache.spark.shuffle.ShuffleReader[K,C] = { - self.getReader(handle, startPartition, endPartition, context, metrics) - } - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): ShuffleReader[K,C] = { - self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, - context, metrics) - } -} \ No newline at end of file diff --git a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/SparkShimServiceProvider.scala b/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/SparkShimServiceProvider.scala deleted file mode 100644 index 3d861c0c656..00000000000 --- a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/shims/spark301/SparkShimServiceProvider.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims.spark301 - -import com.nvidia.spark.rapids.SparkShimVersion - -object SparkShimServiceProvider { - val VERSION = SparkShimVersion(3, 0, 1) - val VERSIONNAMES = Seq(s"$VERSION") -} -class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider { - - override def getShimVersion: SparkShimVersion = SparkShimServiceProvider.VERSION - - def matchesVersion(version: String): Boolean = { - SparkShimServiceProvider.VERSIONNAMES.contains(version) - } -} diff --git a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/spark301/RapidsShuffleManager.scala b/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/spark301/RapidsShuffleManager.scala deleted file mode 100644 index 8359be2eb9c..00000000000 --- a/sql-plugin/src/main/301/scala/com/nvidia/spark/rapids/spark301/RapidsShuffleManager.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.spark301 - -import org.apache.spark.SparkConf -import org.apache.spark.sql.rapids.shims.spark301.ProxyRapidsShuffleInternalManager - -/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. */ -sealed class RapidsShuffleManager( - conf: SparkConf, - isDriver: Boolean) extends ProxyRapidsShuffleInternalManager(conf, isDriver) { -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/SparkShims.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/SparkShims.scala deleted file mode 100644 index 68382bfd084..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/SparkShims.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids._ - -object SparkShimImpl extends Spark30XdbShims { - - override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala deleted file mode 100644 index b001bc929c0..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.adaptive.{QueryStageExec, ShuffleQueryStageExec} - -/** Utility methods for manipulating Catalyst classes involved in Adaptive Query Execution */ -object AQEUtils { - /** Return a new QueryStageExec reuse instance with updated output attributes */ - def newReuseInstance(sqse: ShuffleQueryStageExec, newOutput: Seq[Attribute]): QueryStageExec = { - sqse.newReuseInstance(sqse.id, newOutput) - } -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala deleted file mode 100644 index 823c81fe3f2..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -object AggregationTagging { - // Whether aggregations must be replaced only when both halves are replaced. - val mustReplaceBoth: Boolean = false -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala deleted file mode 100644 index c2810f37d91..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.GpuHashPartitioningBase - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution, HashClusteredDistribution} - -case class GpuHashPartitioning(expressions: Seq[Expression], numPartitions: Int) - extends GpuHashPartitioningBase(expressions, numPartitions) { - - override def satisfies0(required: Distribution): Boolean = { - super.satisfies0(required) || { - required match { - case h: HashClusteredDistribution => - expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { - case (l, r) => l.semanticEquals(r) - } - case ClusteredDistribution(requiredClustering, _) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) - case _ => false - } - } - } - -} - -object GpuHashPartitioning { - def getDistribution(exprs: Seq[Expression]): Distribution = HashClusteredDistribution(exprs) -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala deleted file mode 100644 index 3bdb81676bf..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{GpuBuildLeft, GpuBuildRight, GpuBuildSide} - -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} - -object GpuJoinUtils { - def getGpuBuildSide(buildSide: BuildSide): GpuBuildSide = { - buildSide match { - case BuildRight => GpuBuildRight - case BuildLeft => GpuBuildLeft - case _ => throw new Exception(s"unknown build side type $buildSide") - } - } -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceExec.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceExec.scala deleted file mode 100644 index 9c30127dc27..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceExec.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{CudfRegexTranspiler, DataFromReplacementRule, GpuExpression, GpuOverrides, RapidsConf, RapidsMeta, RegexReplaceMode, RegexUnsupportedException, TernaryExprMeta} - -import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, RegExpReplace} -import org.apache.spark.sql.rapids.{GpuRegExpReplace, GpuRegExpUtils, GpuStringReplace} -import org.apache.spark.sql.types.DataTypes -import org.apache.spark.unsafe.types.UTF8String - -class GpuRegExpReplaceMeta( - expr: RegExpReplace, - conf: RapidsConf, - parent: Option[RapidsMeta[_, _, _]], - rule: DataFromReplacementRule) - extends TernaryExprMeta[RegExpReplace](expr, conf, parent, rule) { - - private var pattern: Option[String] = None - private var replacement: Option[String] = None - - override def tagExprForGpu(): Unit = { - GpuRegExpUtils.tagForRegExpEnabled(this) - expr.regexp match { - case Literal(s: UTF8String, DataTypes.StringType) if s != null => - if (GpuOverrides.isSupportedStringReplacePattern(expr.regexp)) { - // use GpuStringReplace - } else { - try { - pattern = Some(new CudfRegexTranspiler(RegexReplaceMode).transpile(s.toString)) - } catch { - case e: RegexUnsupportedException => - willNotWorkOnGpu(e.getMessage) - } - } - - case _ => - willNotWorkOnGpu(s"only non-null literal strings are supported on GPU") - } - - expr.rep match { - case Literal(s: UTF8String, DataTypes.StringType) if s != null => - if (GpuRegExpUtils.containsBackrefs(s.toString)) { - willNotWorkOnGpu("regexp_replace with back-references is not supported") - } - replacement = Some(GpuRegExpUtils.unescapeReplaceString(s.toString)) - case _ => - } - } - - override def convertToGpu( - lhs: Expression, - regexp: Expression, - rep: Expression): GpuExpression = { - if (GpuOverrides.isSupportedStringReplacePattern(expr.regexp)) { - GpuStringReplace(lhs, regexp, rep) - } else { - (pattern, replacement) match { - case (Some(cudfPattern), Some(cudfReplacement)) => - GpuRegExpReplace(lhs, regexp, rep, cudfPattern, cudfReplacement) - case _ => - throw new IllegalStateException("Expression has not been tagged correctly") - } - } - } -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRunningWindowExecMeta.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRunningWindowExecMeta.scala deleted file mode 100644 index f12c8208ec2..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuRunningWindowExecMeta.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims - -import com.databricks.sql.execution.window.RunningWindowFunctionExec -import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuBaseWindowExecMeta, RapidsConf, RapidsMeta} - -import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression, SortOrder} - -/** - * GPU-based window-exec implementation, analogous to RunningWindowFunctionExec. - */ -class GpuRunningWindowExecMeta(runningWindowFunctionExec: RunningWindowFunctionExec, - conf: RapidsConf, - parent: Option[RapidsMeta[_, _, _]], - rule: DataFromReplacementRule) - extends GpuBaseWindowExecMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf, - parent, rule) { - - override def getInputWindowExpressions: Seq[NamedExpression] = - runningWindowFunctionExec.windowExpressionList - override def getPartitionSpecs: Seq[Expression] = runningWindowFunctionExec.partitionSpec - override def getOrderSpecs: Seq[SortOrder] = runningWindowFunctionExec.orderSpec - override def getResultColumnsOnly: Boolean = true -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala deleted file mode 100644 index e3085905c9c..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{GpuBindReferences, GpuBoundReference, GpuProjectExec, GpuWindowExpression} - -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression, SortOrder} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.rapids.execution.python.GpuWindowInPandasExecBase -import org.apache.spark.sql.vectorized.ColumnarBatch - -/* - * This GpuWindowInPandasExec aims at accelerating the data transfer between - * JVM and Python, and scheduling GPU resources for Python processes - */ -case class GpuWindowInPandasExec( - projectList: Seq[Expression], - gpuPartitionSpec: Seq[Expression], - cpuOrderSpec: Seq[SortOrder], - child: SparkPlan)( - override val cpuPartitionSpec: Seq[Expression]) extends GpuWindowInPandasExecBase { - - override def otherCopyArgs: Seq[AnyRef] = cpuPartitionSpec :: Nil - - override final def pythonModuleKey: String = "databricks" - - // On Databricks, the projectList contains not only the window expression, but may also contains - // the input attributes. So we need to extract the window expressions from it. - override def windowExpression: Seq[Expression] = projectList.filter { expr => - expr.find(node => node.isInstanceOf[GpuWindowExpression]).isDefined - } - - // On Databricks, the projectList is expected to be the final output, and it is nondeterministic. - // It may contain the input attributes or not, or even part of the input attributes. 
So - // we need to project the joined batch per this projectList. - // But for the schema, just return it directly. - override def output: Seq[Attribute] = projectList - .map(_.asInstanceOf[NamedExpression].toAttribute) - - override def projectResult(joinedBatch: ColumnarBatch): ColumnarBatch = { - // Project the data - withResource(joinedBatch) { joinBatch => - GpuProjectExec.project(joinBatch, outReferences) - } - } - - // On Databricks, binding the references on driver side will get some invalid expressions - // (e.g. none#0L, none@1L) in the `projectList`, causing failures in `test_window` test. - // So need to do the binding for `projectList` lazily, and the binding will actually run - // on executors now. - private lazy val outReferences = { - val allExpressions = windowFramesWithExpressions.map(_._2).flatten - val references = allExpressions.zipWithIndex.map { case (e, i) => - // Results of window expressions will be on the right side of child's output - GpuBoundReference(child.output.size + i, e.dataType, - e.nullable)(NamedExpression.newExprId, s"gpu_win_$i") - } - val unboundToRefMap = allExpressions.zip(references).toMap - // Bound the project list for GPU - GpuBindReferences.bindGpuReferences( - projectList.map(_.transform(unboundToRefMap)), child.output) - } - -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala deleted file mode 100644 index 7a36a7596a6..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import scala.concurrent.Promise - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.execution.exchange.BroadcastExchangeLike - -/** - * This shim handles the completion future differences between - * Apache Spark and Databricks. - */ -trait ShimBroadcastExchangeLike extends BroadcastExchangeLike { - @transient - protected lazy val promise = Promise[Broadcast[Any]]() - - /** - * For registering callbacks on `relationFuture`. - * Note that calling this field will not start the execution of broadcast job. - */ - @transient - lazy val completionFuture: concurrent.Future[Broadcast[Any]] = promise.future -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShims.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShims.scala deleted file mode 100644 index ea9143802cd..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShims.scala +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
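// Minimal self-contained sketch, not part of this patch: the promise-backed completion
// future pattern used by the removed ShimBroadcastExchangeLike above. Registering a
// callback on the future does not start the underlying broadcast job.
import scala.concurrent.{Future, Promise}

class CompletionFutureSketch {
  private val promise = Promise[String]()

  // Exposed for callbacks only; completed when the job finishes.
  val completionFuture: Future[String] = promise.future

  // The job calls this once when it is done.
  def markDone(result: String): Unit = promise.trySuccess(result)
}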
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import java.net.URI -import java.nio.ByteBuffer - -import com.databricks.sql.execution.window.RunningWindowFunctionExec -import com.esotericsoftware.kryo.Kryo -import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} -import com.nvidia.spark.rapids._ -import org.apache.arrow.memory.ReferenceManager -import org.apache.arrow.vector.ValueVector -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.parquet.schema.MessageType - -import org.apache.spark.SparkEnv -import org.apache.spark.internal.Logging -import org.apache.spark.rapids.shims.GpuShuffleExchangeExec -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.Resolver -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.errors.attachTree -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.Average -import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning} -import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.connector.read.Scan -import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec} -import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand} -import org.apache.spark.sql.execution.datasources.{DataSourceUtils, FileIndex, FilePartition, HadoopFsRelation, InMemoryFileIndex, PartitionDirectory, PartitionedFile, PartitioningAwareFileIndex} -import org.apache.spark.sql.execution.datasources.json.JsonFileFormat -import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters -import org.apache.spark.sql.execution.datasources.rapids.GpuPartitioningUtils -import org.apache.spark.sql.execution.datasources.v2.ShowCurrentNamespaceExec -import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan -import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan -import org.apache.spark.sql.execution.exchange.{ReusedExchangeExec, ShuffleExchangeExec} -import org.apache.spark.sql.execution.python._ -import org.apache.spark.sql.execution.window.WindowExecBase -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.{GpuAbs, GpuAverage, GpuFileSourceScanExec, GpuTimeSub} -import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, SerializeBatchDeserializeHostBuffer, SerializeConcatHostBuffersDeserializeBatch, TrampolineUtil} -import org.apache.spark.sql.rapids.execution.python._ -import org.apache.spark.sql.rapids.execution.python.shims._ -import org.apache.spark.sql.rapids.shims.{GpuFileScanRDD, GpuSchemaUtils} -import org.apache.spark.sql.sources.BaseRelation -import org.apache.spark.sql.types._ -import org.apache.spark.storage.{BlockId, BlockManagerId} -import 
org.apache.spark.unsafe.types.CalendarInterval - -abstract class Spark30XdbShims extends Spark30XdbShimsBase with Logging { - override def getParquetFilters( - schema: MessageType, - pushDownDate: Boolean, - pushDownTimestamp: Boolean, - pushDownDecimal: Boolean, - pushDownStartWith: Boolean, - pushDownInFilterThreshold: Int, - caseSensitive: Boolean, - lookupFileMeta: String => String, - dateTimeRebaseModeFromConf: String): ParquetFilters = { - val datetimeRebaseMode = DataSourceUtils - .datetimeRebaseMode(lookupFileMeta, dateTimeRebaseModeFromConf) - new ParquetFilters(schema, pushDownDate, pushDownTimestamp, pushDownDecimal, pushDownStartWith, - pushDownInFilterThreshold, caseSensitive, datetimeRebaseMode) - } - - override def v1RepairTableCommand(tableName: TableIdentifier): RunnableCommand = - AlterTableRecoverPartitionsCommand(tableName) - - override def getScalaUDFAsExpression( - function: AnyRef, - dataType: DataType, - children: Seq[Expression], - inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Nil, - outputEncoder: Option[ExpressionEncoder[_]] = None, - udfName: Option[String] = None, - nullable: Boolean = true, - udfDeterministic: Boolean = true): Expression = { - // outputEncoder is only used in Spark 3.1+ - ScalaUDF(function, dataType, children, inputEncoders, udfName, nullable, udfDeterministic) - } - - override def getMapSizesByExecutorId( - shuffleId: Int, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { - SparkEnv.get.mapOutputTracker.getMapSizesByRange(shuffleId, - startMapIndex, endMapIndex, startPartition, endPartition) - } - - override def getGpuShuffleExchangeExec( - gpuOutputPartitioning: GpuPartitioning, - child: SparkPlan, - cpuOutputPartitioning: Partitioning, - cpuShuffle: Option[ShuffleExchangeExec]): GpuShuffleExchangeExecBase = { - val canChangeNumPartitions = cpuShuffle.forall(_.canChangeNumPartitions) - GpuShuffleExchangeExec(gpuOutputPartitioning, child, canChangeNumPartitions)( - cpuOutputPartitioning) - } - - override def getGpuShuffleExchangeExec( - queryStage: ShuffleQueryStageExec): GpuShuffleExchangeExecBase = { - queryStage.shuffle.asInstanceOf[GpuShuffleExchangeExecBase] - } - - override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = { - Seq( - GpuOverrides.exec[RunningWindowFunctionExec]( - "Databricks-specific window function exec, for \"running\" windows, " + - "i.e. (UNBOUNDED PRECEDING TO CURRENT ROW)", - ExecChecks( - (TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_128 + - TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested(), - TypeSig.all, - Map("partitionSpec" -> - InputCheck(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_128, - TypeSig.all))), - (runningWindowFunctionExec, conf, p, r) => - new GpuRunningWindowExecMeta(runningWindowFunctionExec, conf, p, r) - ), - GpuOverrides.exec[FileSourceScanExec]( - "Reading data from files, often from Hive tables", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.STRUCT + TypeSig.MAP + - TypeSig.ARRAY + TypeSig.DECIMAL_128).nested(), TypeSig.all), - (fsse, conf, p, r) => new SparkPlanMeta[FileSourceScanExec](fsse, conf, p, r) { - - // Replaces SubqueryBroadcastExec inside dynamic pruning filters with GPU counterpart - // if possible. Instead regarding filters as childExprs of current Meta, we create - // a new meta for SubqueryBroadcastExec. 
The reason is that the GPU replacement of - // FileSourceScan is independent from the replacement of the partitionFilters. It is - // possible that the FileSourceScan is on the CPU, while the dynamic partitionFilters - // are on the GPU. And vice versa. - private lazy val partitionFilters = { - val convertBroadcast = (bc: SubqueryBroadcastExec) => { - val meta = GpuOverrides.wrapAndTagPlan(bc, conf) - meta.tagForExplain() - meta.convertIfNeeded().asInstanceOf[BaseSubqueryExec] - } - wrapped.partitionFilters.map { filter => - filter.transformDown { - case dpe @ DynamicPruningExpression(inSub: InSubqueryExec) => - inSub.plan match { - case bc: SubqueryBroadcastExec => - dpe.copy(inSub.copy(plan = convertBroadcast(bc))) - case reuse @ ReusedSubqueryExec(bc: SubqueryBroadcastExec) => - dpe.copy(inSub.copy(plan = reuse.copy(convertBroadcast(bc)))) - case _ => - dpe - } - } - } - } - - // partition filters and data filters are not run on the GPU - override val childExprs: Seq[ExprMeta[_]] = Seq.empty - - override def tagPlanForGpu(): Unit = { - // this is very specific check to have any of the Delta log metadata queries - // fallback and run on the CPU since there is some incompatibilities in - // Databricks Spark and Apache Spark. - if (wrapped.relation.fileFormat.isInstanceOf[JsonFileFormat] && - wrapped.relation.location.getClass.getCanonicalName() == - "com.databricks.sql.transaction.tahoe.DeltaLogFileIndex") { - this.entirePlanWillNotWork("Plans that read Delta Index JSON files can not run " + - "any part of the plan on the GPU!") - } - GpuFileSourceScanExec.tagSupport(this) - } - - override def convertToCpu(): SparkPlan = { - wrapped.copy(partitionFilters = partitionFilters) - } - - override def convertToGpu(): GpuExec = { - val sparkSession = wrapped.relation.sparkSession - val options = wrapped.relation.options - - val location = replaceWithAlluxioPathIfNeeded( - conf, - wrapped.relation, - partitionFilters, - wrapped.dataFilters) - - val newRelation = HadoopFsRelation( - location, - wrapped.relation.partitionSchema, - wrapped.relation.dataSchema, - wrapped.relation.bucketSpec, - GpuFileSourceScanExec.convertFileFormat(wrapped.relation.fileFormat), - options)(sparkSession) - - GpuFileSourceScanExec( - newRelation, - wrapped.output, - wrapped.requiredSchema, - partitionFilters, - wrapped.optionalBucketSet, - // TODO: Does Databricks have coalesced bucketing implemented? - None, - wrapped.dataFilters, - wrapped.tableIdentifier)(conf) - } - }), - GpuOverrides.exec[WindowInPandasExec]( - "The backend for Window Aggregation Pandas UDF, Accelerates the data transfer between" + - " the Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled. 
For now it only supports row based window frame.", - ExecChecks( - (TypeSig.commonCudfTypes + TypeSig.ARRAY).nested(TypeSig.commonCudfTypes), - TypeSig.all), - (winPy, conf, p, r) => new GpuWindowInPandasExecMetaBase(winPy, conf, p, r) { - override val windowExpressions: Seq[BaseExprMeta[NamedExpression]] = - winPy.windowExpression.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - - override def convertToGpu(): GpuExec = { - GpuWindowInPandasExec( - windowExpressions.map(_.convertToGpu()), - partitionSpec.map(_.convertToGpu()), - // leave ordering expression on the CPU, it's not used for GPU computation - winPy.orderSpec, - childPlans.head.convertIfNeeded() - )(winPy.partitionSpec) - } - }).disabledByDefault("it only supports row based frame for now"), - GpuOverrides.exec[ArrowEvalPythonExec]( - "The backend of the Scalar Pandas UDFs. Accelerates the data transfer between the" + - " Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled", - ExecChecks( - (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(), - TypeSig.all), - (e, conf, p, r) => - new SparkPlanMeta[ArrowEvalPythonExec](e, conf, p, r) { - val udfs: Seq[BaseExprMeta[PythonUDF]] = - e.udfs.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - val resultAttrs: Seq[BaseExprMeta[Attribute]] = - e.resultAttrs.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - override val childExprs: Seq[BaseExprMeta[_]] = udfs ++ resultAttrs - - override def replaceMessage: String = "partially run on GPU" - override def noReplacementPossibleMessage(reasons: String): String = - s"cannot run even partially on the GPU because $reasons" - - override def convertToGpu(): GpuExec = - GpuArrowEvalPythonExec(udfs.map(_.convertToGpu()).asInstanceOf[Seq[GpuPythonUDF]], - resultAttrs.map(_.convertToGpu()).asInstanceOf[Seq[Attribute]], - childPlans.head.convertIfNeeded(), - e.evalType) - }), - GpuOverrides.exec[MapInPandasExec]( - "The backend for Map Pandas Iterator UDF. Accelerates the data transfer between the" + - " Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(), - TypeSig.all), - (mapPy, conf, p, r) => new GpuMapInPandasExecMeta(mapPy, conf, p, r)), - GpuOverrides.exec[FlatMapGroupsInPandasExec]( - "The backend for Flat Map Groups Pandas UDF, Accelerates the data transfer between the" + - " Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks(TypeSig.commonCudfTypes, TypeSig.all), - (flatPy, conf, p, r) => new GpuFlatMapGroupsInPandasExecMeta(flatPy, conf, p, r)), - GpuOverrides.exec[AggregateInPandasExec]( - "The backend for an Aggregation Pandas UDF, this accelerates the data transfer between" + - " the Java process and the Python process. 
It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks(TypeSig.commonCudfTypes, TypeSig.all), - (aggPy, conf, p, r) => new GpuAggregateInPandasExecMeta(aggPy, conf, p, r)) - ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap - } - - protected def getExprsSansTimeSub: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = { - Seq( - GpuOverrides.expr[Cast]( - "Convert a column of one type of data into another type", - new CastChecks(), - (cast, conf, p, r) => new CastExprMeta[Cast](cast, - SparkSession.active.sessionState.conf.ansiEnabled, conf, p, r, - doFloatToIntCheck = false, stringToAnsiDate = false)), - GpuOverrides.expr[AnsiCast]( - "Convert a column of one type of data into another type", - new CastChecks(), - (cast, conf, p, r) => new CastExprMeta[AnsiCast](cast, ansiEnabled = true, conf = conf, - parent = p, rule = r, doFloatToIntCheck = false, stringToAnsiDate = false)), - GpuOverrides.expr[Average]( - "Average aggregate operator", - ExprChecks.fullAgg( - TypeSig.DOUBLE + TypeSig.DECIMAL_128, - TypeSig.DOUBLE + TypeSig.DECIMAL_128, - Seq(ParamCheck("input", - TypeSig.integral + TypeSig.fp + TypeSig.DECIMAL_128, - TypeSig.cpuNumeric))), - (a, conf, p, r) => new AggExprMeta[Average](a, conf, p, r) { - override def tagAggForGpu(): Unit = { - // For Decimal Average the SUM adds a precision of 10 to avoid overflowing - // then it divides by the count with an output scale that is 4 more than the input - // scale. With how our divide works to match Spark, this means that we will need a - // precision of 5 more. So 38 - 10 - 5 = 23 - val dataType = a.child.dataType - dataType match { - case dt: DecimalType => - if (dt.precision > 23) { - if (conf.needDecimalGuarantees) { - willNotWorkOnGpu("GpuAverage cannot guarantee proper overflow checks for " + - s"a precision large than 23. 
The current precision is ${dt.precision}") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"Average(${a.child.dataType}) produces $dt with an " + - s"intermediate precision of ${dt.precision + 15}") - } - } - case _ => // NOOP - } - GpuOverrides.checkAndTagFloatAgg(dataType, conf, this) - } - - override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = - GpuAverage(childExprs.head) - - // Average is not supported in ANSI mode right now, no matter the type - override val ansiTypeToCheck: Option[DataType] = None - }), - GpuOverrides.expr[Abs]( - "Absolute value", - ExprChecks.unaryProjectAndAstInputMatchesOutput( - TypeSig.implicitCastsAstTypes, TypeSig.gpuNumeric, - TypeSig.cpuNumeric), - (a, conf, p, r) => new UnaryAstExprMeta[Abs](a, conf, p, r) { - // ANSI support for ABS was added in 3.2.0 SPARK-33275 - override def convertToGpu(child: Expression): GpuExpression = GpuAbs(child, false) - }), - GpuOverrides.expr[RegExpReplace]( - "String replace using a regular expression pattern", - ExprChecks.projectOnly(TypeSig.STRING, TypeSig.STRING, - Seq(ParamCheck("str", TypeSig.STRING, TypeSig.STRING), - ParamCheck("regex", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING), - ParamCheck("rep", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING))), - (a, conf, p, r) => new GpuRegExpReplaceMeta(a, conf, p, r)), - GpuScalaUDFMeta.exprMeta - ).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap - } - - override def getExprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = { - getExprsSansTimeSub + (classOf[TimeSub] -> GpuOverrides.expr[TimeSub]( - "Subtracts interval from timestamp", - ExprChecks.binaryProject(TypeSig.TIMESTAMP, TypeSig.TIMESTAMP, - ("start", TypeSig.TIMESTAMP, TypeSig.TIMESTAMP), - ("interval", TypeSig.lit(TypeEnum.CALENDAR) - .withPsNote(TypeEnum.CALENDAR, "months not supported"), TypeSig.CALENDAR)), - (timeSub, conf, p, r) => new BinaryExprMeta[TimeSub](timeSub, conf, p, r) { - override def tagExprForGpu(): Unit = { - timeSub.interval match { - case Literal(intvl: CalendarInterval, DataTypes.CalendarIntervalType) => - if (intvl.months != 0) { - willNotWorkOnGpu("interval months isn't supported") - } - case _ => - } - checkTimeZoneId(timeSub.timeZoneId) - } - - override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = - GpuTimeSub(lhs, rhs) - })) - } - - override def getScans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]] = Seq( - GpuOverrides.scan[ParquetScan]( - "Parquet parsing", - (a, conf, p, r) => new RapidsParquetScanMeta(a, conf, p, r)), - GpuOverrides.scan[OrcScan]( - "ORC parsing", - (a, conf, p, r) => new RapidsOrcScanMeta(a, conf, p, r)) - ).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap - - override def getPartitionFileNames( - partitions: Seq[PartitionDirectory]): Seq[String] = { - val files = partitions.flatMap(partition => partition.files) - files.map(_.getPath.getName) - } - - override def getPartitionFileStatusSize(partitions: Seq[PartitionDirectory]): Long = { - partitions.map(_.files.map(_.getLen).sum).sum - } - - override def getPartitionedFiles( - partitions: Array[PartitionDirectory]): Array[PartitionedFile] = { - partitions.flatMap { p => - p.files.map { f => - PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values) - } - } - } - - override def getPartitionSplitFiles( - partitions: Array[PartitionDirectory], - maxSplitBytes: Long, - relation: HadoopFsRelation): Array[PartitionedFile] = { - partitions.flatMap { partition => - partition.files.flatMap { file => - // 
getPath() is very expensive so we only want to call it once in this block: - val filePath = file.getPath - val isSplitable = relation.fileFormat.isSplitable( - relation.sparkSession, relation.options, filePath) - PartitionedFileUtil.splitFiles( - sparkSession = relation.sparkSession, - file = file, - filePath = filePath, - isSplitable = isSplitable, - maxSplitBytes = maxSplitBytes, - partitionValues = partition.values - ) - } - } - } - - override def isWindowFunctionExec(plan: SparkPlan): Boolean = - plan.isInstanceOf[WindowExecBase] || plan.isInstanceOf[RunningWindowFunctionExec] - - override def getFileScanRDD( - sparkSession: SparkSession, - readFunction: PartitionedFile => Iterator[InternalRow], - filePartitions: Seq[FilePartition], - readDataSchema: StructType, - metadataColumns: Seq[AttributeReference]): RDD[InternalRow] = { - new GpuFileScanRDD(sparkSession, readFunction, filePartitions) - } - - // Hardcoded for Spark-3.0.* - override def getFileSourceMaxMetadataValueLength(sqlConf: SQLConf): Int = 100 - - override def createFilePartition(index: Int, files: Array[PartitionedFile]): FilePartition = { - FilePartition(index, files) - } - - override def copyBatchScanExec( - batchScanExec: GpuBatchScanExec, - queryUsesInputFile: Boolean): GpuBatchScanExec = { - val scanCopy = batchScanExec.scan match { - case parquetScan: GpuParquetScan => - parquetScan.copy(queryUsesInputFile=queryUsesInputFile) - case orcScan: GpuOrcScan => - orcScan.copy(queryUsesInputFile=queryUsesInputFile) - case _ => throw new RuntimeException("Wrong format") // never reach here - } - batchScanExec.copy(scan=scanCopy) - } - - override def copyFileSourceScanExec( - scanExec: GpuFileSourceScanExec, - queryUsesInputFile: Boolean): GpuFileSourceScanExec = { - scanExec.copy(queryUsesInputFile=queryUsesInputFile)(scanExec.rapidsConf) - } - - override def getGpuColumnarToRowTransition(plan: SparkPlan, - exportColumnRdd: Boolean): GpuColumnarToRowExecParent = { - GpuColumnarToRowExec(plan, exportColumnRdd) - } - - override def checkColumnNameDuplication( - schema: StructType, - colType: String, - resolver: Resolver): Unit = { - GpuSchemaUtils.checkColumnNameDuplication(schema, colType, resolver) - } - - override def sortOrder( - child: Expression, - direction: SortDirection, - nullOrdering: NullOrdering): SortOrder = SortOrder(child, direction, nullOrdering, Set.empty) - - override def copySortOrderWithNewChild(s: SortOrder, child: Expression): SortOrder = { - s.copy(child = child) - } - - override def alias(child: Expression, name: String)( - exprId: ExprId, - qualifier: Seq[String], - explicitMetadata: Option[Metadata]): Alias = { - Alias(child, name)(exprId, qualifier, explicitMetadata) - } - - override def shouldIgnorePath(path: String): Boolean = { - InMemoryFileIndex.shouldFilterOut(path) - } - - override def getLegacyComplexTypeToString(): Boolean = true - - // Arrow version changed between Spark versions - override def getArrowDataBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getDataBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def getArrowValidityBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getValidityBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def getArrowOffsetsBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getOffsetBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def replaceWithAlluxioPathIfNeeded( - conf: 
RapidsConf, - relation: HadoopFsRelation, - partitionFilters: Seq[Expression], - dataFilters: Seq[Expression]): FileIndex = { - - val alluxioPathsReplace: Option[Seq[String]] = conf.getAlluxioPathsToReplace - - if (alluxioPathsReplace.isDefined) { - // alluxioPathsReplace: Seq("key->value", "key1->value1") - // turn the rules to the Map with eg - // { s3:/foo -> alluxio://0.1.2.3:19998/foo, - // gs:/bar -> alluxio://0.1.2.3:19998/bar, - // /baz -> alluxio://0.1.2.3:19998/baz } - val replaceMapOption = alluxioPathsReplace.map(rules => { - rules.map(rule => { - val split = rule.split("->") - if (split.size == 2) { - split(0).trim -> split(1).trim - } else { - throw new IllegalArgumentException(s"Invalid setting for " + - s"${RapidsConf.ALLUXIO_PATHS_REPLACE.key}") - } - }).toMap - }) - - replaceMapOption.map(replaceMap => { - - def isDynamicPruningFilter(e: Expression): Boolean = - e.find(_.isInstanceOf[PlanExpression[_]]).isDefined - - val partitionDirs = relation.location.listFiles( - partitionFilters.filterNot(isDynamicPruningFilter), dataFilters) - - // replacement func to check if the file path is prefixed with the string user configured - // if yes, replace it - val replaceFunc = (f: Path) => { - val pathStr = f.toString - val matchedSet = replaceMap.keySet.filter(reg => pathStr.startsWith(reg)) - if (matchedSet.size > 1) { - // never reach here since replaceMap is a Map - throw new IllegalArgumentException(s"Found ${matchedSet.size} same replacing rules " + - s"from ${RapidsConf.ALLUXIO_PATHS_REPLACE.key} which requires only 1 rule for each " + - s"file path") - } else if (matchedSet.size == 1) { - new Path(pathStr.replaceFirst(matchedSet.head, replaceMap(matchedSet.head))) - } else { - f - } - } - - // replace all of input files - val inputFiles: Seq[Path] = partitionDirs.flatMap(partitionDir => { - replacePartitionDirectoryFiles(partitionDir, replaceFunc) - }) - - // replace all of rootPaths which are already unique - val rootPaths = relation.location.rootPaths.map(replaceFunc) - - val parameters: Map[String, String] = relation.options - - // infer PartitionSpec - val partitionSpec = GpuPartitioningUtils.inferPartitioning( - relation.sparkSession, - rootPaths, - inputFiles, - parameters, - Option(relation.dataSchema), - replaceFunc) - - // generate a new InMemoryFileIndex holding paths with alluxio schema - new InMemoryFileIndex( - relation.sparkSession, - inputFiles, - parameters, - Option(relation.dataSchema), - userSpecifiedPartitionSpec = Some(partitionSpec)) - }).getOrElse(relation.location) - - } else { - relation.location - } - } - - override def replacePartitionDirectoryFiles(partitionDir: PartitionDirectory, - replaceFunc: Path => Path): Seq[Path] = { - partitionDir.files.map(f => replaceFunc(f.getPath)) - } - - override def shouldFailDivByZero(): Boolean = false - - override def reusedExchangeExecPfn: PartialFunction[SparkPlan, ReusedExchangeExec] = { - case ShuffleQueryStageExec(_, e: ReusedExchangeExec, _) => e - case BroadcastQueryStageExec(_, e: ReusedExchangeExec, _) => e - } - - /** dropped by SPARK-34234 */ - override def attachTreeIfSupported[TreeType <: TreeNode[_], A]( - tree: TreeType, - msg: String)( - f: => A - ): A = { - attachTree(tree, msg)(f) - } - - override def createTable(table: CatalogTable, - sessionCatalog: SessionCatalog, - tableLocation: Option[URI], - result: BaseRelation) = { - val newTable = table.copy( - storage = table.storage.copy(locationUri = tableLocation), - // We will use the schema of resolved.relation as the schema of the table 
(instead of - // the schema of df). It is important since the nullability may be changed by the relation - // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). - schema = result.schema) - // Table location is already validated. No need to check it again during table creation. - sessionCatalog.createTable(newTable, ignoreIfExists = false, validateLocation = false) - } - - override def hasAliasQuoteFix: Boolean = false - - override def hasCastFloatTimestampUpcast: Boolean = false - - override def filesFromFileIndex(fileCatalog: PartitioningAwareFileIndex): Seq[FileStatus] = { - fileCatalog.allFiles().map(_.toFileStatus) - } - - // this is to help with an optimization in Spark 3.1, so we disable it by default in Spark 3.0.x - override def isEmptyRelation(relation: Any): Boolean = false - override def tryTransformIfEmptyRelation(mode: BroadcastMode): Option[Any] = None - - override def broadcastModeTransform(mode: BroadcastMode, rows: Array[InternalRow]): Any = - mode.transform(rows, TrampolineUtil.getTaskMemoryManager()) - - override def registerKryoClasses(kryo: Kryo): Unit = { - kryo.register(classOf[SerializeConcatHostBuffersDeserializeBatch], - new KryoJavaSerializer()) - kryo.register(classOf[SerializeBatchDeserializeHostBuffer], - new KryoJavaSerializer()) - } - - override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = { - adaptivePlan.initialPlan - } - - override def getLegacyStatisticalAggregate(): Boolean = true - - override def supportsColumnarAdaptivePlans: Boolean = false - - override def columnarAdaptivePlan(a: AdaptiveSparkPlanExec, goal: CoalesceSizeGoal): SparkPlan = { - // When the input is an adaptive plan we do not get to see the GPU version until - // the plan is executed and sometimes the plan will have a GpuColumnarToRowExec as the - // final operator and we can bypass this to keep the data columnar by inserting - // the [[AvoidAdaptiveTransitionToRow]] operator here - AvoidAdaptiveTransitionToRow(GpuRowToColumnarExec(a, goal)) - } - - def neverReplaceShowCurrentNamespaceCommand: ExecRule[_ <: SparkPlan] = { - GpuOverrides.neverReplaceExec[ShowCurrentNamespaceExec]("Namespace metadata operation") - } -} - -// First, Last and Collect have mistakenly been marked as non-deterministic until Spark-3.3. -// They are actually deterministic iff their child expression is deterministic. -trait GpuDeterministicFirstLastCollectShim extends Expression { - override lazy val deterministic = false -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShimsBase.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShimsBase.scala deleted file mode 100644 index 7fd763b5268..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/Spark30XdbShimsBase.scala +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
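// Standalone sketch, not part of this patch: the "key->value" replacement convention
// documented in the removed replaceWithAlluxioPathIfNeeded above, e.g. a rule of
// "s3:/foo->alluxio://0.1.2.3:19998/foo" rewrites any path under the s3:/foo prefix.
object AlluxioRuleSketch {
  def buildReplaceMap(rules: Seq[String]): Map[String, String] =
    rules.map { rule =>
      rule.split("->") match {
        case Array(from, to) => from.trim -> to.trim
        case _ => throw new IllegalArgumentException(s"Invalid replacement rule: $rule")
      }
    }.toMap

  // Mirrors the original's prefix match followed by replaceFirst on the matched rule.
  def replacePath(path: String, replaceMap: Map[String, String]): String =
    replaceMap.collectFirst {
      case (prefix, target) if path.startsWith(prefix) => path.replaceFirst(prefix, target)
    }.getOrElse(path)
}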
- */ - -package com.nvidia.spark.rapids.shims - -import scala.collection.mutable.ListBuffer - -import com.nvidia.spark.rapids.{ExecChecks, ExecRule, SparkPlanMeta, SparkShims, TypeSig} -import com.nvidia.spark.rapids.GpuOverrides.exec - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, CustomShuffleReaderExec, QueryStageExec, ShuffleQueryStageExec} -import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec -import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec, ShuffledHashJoinExec} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.execution.GpuCustomShuffleReaderExec - -/** -* Shim base class that can be compiled with every supported 3.0.x -*/ -trait Spark30XdbShimsBase extends SparkShims { - override def parquetRebaseReadKey: String = - SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key - override def parquetRebaseWriteKey: String = - SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key - override def avroRebaseReadKey: String = - SQLConf.LEGACY_AVRO_REBASE_MODE_IN_READ.key - override def avroRebaseWriteKey: String = - SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key - override def parquetRebaseRead(conf: SQLConf): String = - conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ) - override def parquetRebaseWrite(conf: SQLConf): String = - conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE) - override def int96ParquetRebaseRead(conf: SQLConf): String = - parquetRebaseRead(conf) - override def int96ParquetRebaseWrite(conf: SQLConf): String = - parquetRebaseWrite(conf) - override def int96ParquetRebaseReadKey: String = - parquetRebaseReadKey - override def int96ParquetRebaseWriteKey: String = - parquetRebaseWriteKey - override def hasSeparateINT96RebaseConf: Boolean = false - - override def sessionFromPlan(plan: SparkPlan): SparkSession = { - plan.sqlContext.sparkSession - } - - override def newBroadcastQueryStageExec( - old: BroadcastQueryStageExec, - newPlan: SparkPlan): BroadcastQueryStageExec = - BroadcastQueryStageExec(old.id, newPlan, old._canonicalized) - - override def getDateFormatter(): DateFormatter = { - DateFormatter(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) - } - - override def isExchangeOp(plan: SparkPlanMeta[_]): Boolean = { - // if the child query stage already executed on GPU then we need to keep the - // next operator on GPU in these cases - SQLConf.get.adaptiveExecutionEnabled && (plan.wrapped match { - case _: CustomShuffleReaderExec - | _: ShuffledHashJoinExec - | _: BroadcastHashJoinExec - | _: BroadcastExchangeExec - | _: BroadcastNestedLoopJoinExec => true - case _ => false - }) - } - - override def isAqePlan(p: SparkPlan): Boolean = p match { - case _: AdaptiveSparkPlanExec | - _: QueryStageExec | - _: CustomShuffleReaderExec => true - case _ => false - } - - override def isCustomReaderExec(x: SparkPlan): Boolean = x match { - case _: GpuCustomShuffleReaderExec | _: CustomShuffleReaderExec => true - case _ => false - } - - override def aqeShuffleReaderExec: ExecRule[_ <: SparkPlan] = exec[CustomShuffleReaderExec]( - "A wrapper of shuffle query stage", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_128 + TypeSig.ARRAY + - TypeSig.STRUCT + TypeSig.MAP).nested(), TypeSig.all), - (exec, conf, p, r) => new GpuCustomShuffleReaderMeta(exec, 
conf, p, r)) - - override def findOperators(plan: SparkPlan, predicate: SparkPlan => Boolean): Seq[SparkPlan] = { - def recurse( - plan: SparkPlan, - predicate: SparkPlan => Boolean, - accum: ListBuffer[SparkPlan]): Seq[SparkPlan] = { - if (predicate(plan)) { - accum += plan - } - plan match { - case a: AdaptiveSparkPlanExec => recurse(a.executedPlan, predicate, accum) - case qs: BroadcastQueryStageExec => recurse(qs.broadcast, predicate, accum) - case qs: ShuffleQueryStageExec => recurse(qs.shuffle, predicate, accum) - case other => other.children.flatMap(p => recurse(p, predicate, accum)).headOption - } - accum - } - recurse(plan, predicate, new ListBuffer[SparkPlan]()) - } - - override def skipAssertIsOnTheGpu(plan: SparkPlan): Boolean = false - - override def shouldFailDivOverflow(): Boolean = false - - override def leafNodeDefaultParallelism(ss: SparkSession): Int = { - ss.sparkContext.defaultParallelism - } - - override def shouldFallbackOnAnsiTimestamp(): Boolean = false - - override def shouldFailOnElementNotExists(): Boolean = false - - override def isCastingStringToNegDecimalScaleSupported: Boolean = true -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/RapidsShuffleInternalManager.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/RapidsShuffleInternalManager.scala deleted file mode 100644 index 240a67df11f..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/RapidsShuffleInternalManager.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.shims.spark301db - -import org.apache.spark.{SparkConf, TaskContext} -import org.apache.spark.shuffle._ -import org.apache.spark.sql.rapids.{ProxyRapidsShuffleInternalManagerBase, RapidsShuffleInternalManagerBase} - -/** - * A shuffle manager optimized for the RAPIDS Plugin For Apache Spark. - * @note This is an internal class to obtain access to the private - * `ShuffleManager` and `SortShuffleManager` classes. 
- */ -class RapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends RapidsShuffleInternalManagerBase(conf, isDriver) { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, startMapIndex, endMapIndex, startPartition, endPartition, context, - metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, 0, Int.MaxValue, startPartition, endPartition, context, metrics) - } - -} - -class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends ProxyRapidsShuffleInternalManagerBase(conf, isDriver) with ShuffleManager { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, - context, metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - self.getReader(handle, startPartition, endPartition, context, metrics) - } - -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/SparkShimServiceProvider.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/SparkShimServiceProvider.scala deleted file mode 100644 index de07110a8b7..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/spark301db/SparkShimServiceProvider.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims.spark301db - -import com.nvidia.spark.rapids.{DatabricksShimVersion, ShimVersion} - -object SparkShimServiceProvider { - val VERSION = DatabricksShimVersion(3, 0, 1) - val VERSIONNAMES = Seq(s"$VERSION") -} - -class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider { - - override def getShimVersion: ShimVersion = SparkShimServiceProvider.VERSION - - def matchesVersion(version: String): Boolean = { - SparkShimServiceProvider.VERSIONNAMES.contains(version) - } -} diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/spark301db/RapidsShuffleManager.scala b/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/spark301db/RapidsShuffleManager.scala deleted file mode 100644 index 00e72792ae5..00000000000 --- a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/spark301db/RapidsShuffleManager.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.spark301db - -import org.apache.spark.SparkConf -import org.apache.spark.sql.rapids.shims.spark301db.ProxyRapidsShuffleInternalManager - -/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. */ -sealed class RapidsShuffleManager( - conf: SparkConf, - isDriver: Boolean) extends ProxyRapidsShuffleInternalManager(conf, isDriver) { -} diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala b/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala deleted file mode 100644 index 2f3fb8c0e4f..00000000000 --- a/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
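// For context, not part of this patch: the per-shim RapidsShuffleManager defined above
// is the class name users point Spark at to enable the RAPIDS shuffle, for example:
//   spark.shuffle.manager=com.nvidia.spark.rapids.spark301db.RapidsShuffleManager
// With the 3.0.x shims removed, only the 3.1.x-and-later equivalents of this class remain.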
- */ -package org.apache.spark.rapids.shims - -import com.nvidia.spark.rapids.GpuPartitioning - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.plans.logical.Statistics -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{ShufflePartitionSpec, SparkPlan} -import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike -import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBaseWithMetrics, ShuffledBatchRDD} - -case class GpuShuffleExchangeExec( - gpuOutputPartitioning: GpuPartitioning, - child: SparkPlan, - canChangeNumPartitions: Boolean)( - cpuOutputPartitioning: Partitioning) - extends GpuShuffleExchangeExecBaseWithMetrics(gpuOutputPartitioning, child) - with ShuffleExchangeLike { - - override def otherCopyArgs: Seq[AnyRef] = cpuOutputPartitioning :: Nil - - override val outputPartitioning: Partitioning = cpuOutputPartitioning - - override def numMappers: Int = shuffleDependencyColumnar.rdd.getNumPartitions - - override def numPartitions: Int = shuffleDependencyColumnar.partitioner.numPartitions - - override def getShuffleRDD( - partitionSpecs: Array[ShufflePartitionSpec], - partitionSizes: Option[Array[Long]]): RDD[_] = { - new ShuffledBatchRDD(shuffleDependencyColumnar, metrics ++ readMetrics, partitionSpecs) - } - - override def runtimeStatistics: Statistics = { - // note that Spark will only use the sizeInBytes statistic but making the rowCount - // available here means that we can more easily reference it in GpuOverrides when - // planning future query stages when AQE is on - Statistics( - sizeInBytes = metrics("dataSize").value, - rowCount = Some(metrics("numOutputRows").value) - ) - } -} diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala b/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala deleted file mode 100644 index 9e4863f6e9d..00000000000 --- a/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rapids.shims - -import com.nvidia.spark.rapids.shims.SparkShimImpl - -import org.apache.spark.{MapOutputTrackerMaster, Partition, ShuffleDependency, SparkEnv, TaskContext} -import org.apache.spark.shuffle.ShuffleReader -import org.apache.spark.sql.execution.{CoalescedPartitionSpec, PartialMapperPartitionSpec, PartialReducerPartitionSpec} -import org.apache.spark.sql.execution.metric.SQLShuffleReadMetricsReporter -import org.apache.spark.sql.rapids.execution.ShuffledBatchRDDPartition -import org.apache.spark.sql.vectorized.ColumnarBatch - -/** - * Some APIs for the ShuffledBatchRDD are only accessible from org.apache.spark... - * This code tries to match the Spark code as closely as possible. 
Fixing a compiler or IDE - * warning is not always the best thing here because if it changes how it matches up with the - * Spark code it may make it harder to maintain as thing change in Spark. - */ -object ShuffledBatchRDDUtil { - def preferredLocations( - partition: Partition, - dependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch]): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - partition.asInstanceOf[ShuffledBatchRDDPartition].spec match { - case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => - // TODO order by partition size. - startReducerIndex.until(endReducerIndex).flatMap { reducerIndex => - tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) - } - - case PartialReducerPartitionSpec(_, startMapIndex, endMapIndex, _) => - tracker.getMapLocation(dependency, startMapIndex, endMapIndex) - - case PartialMapperPartitionSpec(mapIndex, _, _) => - tracker.getMapLocation(dependency, mapIndex, mapIndex + 1) - } - } - - def getReaderAndPartSize( - split: Partition, - context: TaskContext, - dependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch], - sqlMetricsReporter: SQLShuffleReadMetricsReporter): - (ShuffleReader[Nothing, Nothing], Long) = { - val shim = SparkShimImpl - split.asInstanceOf[ShuffledBatchRDDPartition].spec match { - case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => - val reader = SparkEnv.get.shuffleManager.getReader( - dependency.shuffleHandle, - startReducerIndex, - endReducerIndex, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, startReducerIndex, endReducerIndex) - val partitionSize = blocksByAddress.flatMap(_._2).map(_._2).sum - (reader, partitionSize) - - case PartialReducerPartitionSpec(reducerIndex, startMapIndex, endMapIndex, _) => - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - startMapIndex, - endMapIndex, - reducerIndex, - reducerIndex + 1, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, reducerIndex, - reducerIndex + 1) - val partitionSize = blocksByAddress.flatMap(_._2) - .filter(tuple => tuple._3 >= startMapIndex && tuple._3 < endMapIndex) - .map(_._2).sum - (reader, partitionSize) - case PartialMapperPartitionSpec(mapIndex, startReducerIndex, endReducerIndex) => - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - mapIndex, - mapIndex + 1, - startReducerIndex, - endReducerIndex, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, startReducerIndex, endReducerIndex) - val partitionSize = blocksByAddress.flatMap(_._2) - .filter(_._3 == mapIndex) - .map(_._2).sum - (reader, partitionSize) - } - } -} diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala b/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala deleted file mode 100644 index 81342601d0b..00000000000 --- a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.execution.python.shims - -import com.nvidia.spark.rapids._ -import com.nvidia.spark.rapids.python.PythonWorkerSemaphore -import com.nvidia.spark.rapids.shims.{ShimUnaryExecNode, SparkShimImpl} - -import org.apache.spark.TaskContext -import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.python.FlatMapGroupsInPandasExec -import org.apache.spark.sql.rapids.execution.python.{GpuArrowPythonRunner, GpuPythonExecBase, GpuPythonHelper, GpuPythonUDF, GroupArgs} -import org.apache.spark.sql.rapids.execution.python.BatchGroupUtils._ -import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.util.ArrowUtils -import org.apache.spark.sql.vectorized.ColumnarBatch - -class GpuFlatMapGroupsInPandasExecMeta( - flatPandas: FlatMapGroupsInPandasExec, - conf: RapidsConf, - parent: Option[RapidsMeta[_, _, _]], - rule: DataFromReplacementRule) - extends SparkPlanMeta[FlatMapGroupsInPandasExec](flatPandas, conf, parent, rule) { - - override def replaceMessage: String = "partially run on GPU" - override def noReplacementPossibleMessage(reasons: String): String = - s"cannot run even partially on the GPU because $reasons" - - private val groupingAttrs: Seq[BaseExprMeta[Attribute]] = - flatPandas.groupingAttributes.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - - private val udf: BaseExprMeta[PythonUDF] = GpuOverrides.wrapExpr( - flatPandas.func.asInstanceOf[PythonUDF], conf, Some(this)) - - private val resultAttrs: Seq[BaseExprMeta[Attribute]] = - flatPandas.output.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - - override val childExprs: Seq[BaseExprMeta[_]] = groupingAttrs ++ resultAttrs :+ udf - - override def convertToGpu(): GpuExec = - GpuFlatMapGroupsInPandasExec( - groupingAttrs.map(_.convertToGpu()).asInstanceOf[Seq[Attribute]], - udf.convertToGpu(), - resultAttrs.map(_.convertToGpu()).asInstanceOf[Seq[Attribute]], - childPlans.head.convertIfNeeded() - ) -} - -/** - * GPU version of Spark's `FlatMapGroupsInPandasExec` - * - * Rows in each group are passed to the Python worker as an Arrow record batch. - * The Python worker turns the record batch to a `pandas.DataFrame`, invoke the - * user-defined function, and passes the resulting `pandas.DataFrame` - * as an Arrow record batch. Finally, each record batch is turned to - * a ColumnarBatch. - * - * This node aims at accelerating the data transfer between JVM and Python for GPU pipeline, and - * scheduling GPU resources for its Python processes. 
- */ -case class GpuFlatMapGroupsInPandasExec( - groupingAttributes: Seq[Attribute], - func: Expression, - output: Seq[Attribute], - child: SparkPlan) - extends SparkPlan with ShimUnaryExecNode with GpuPythonExecBase { - - override def producedAttributes: AttributeSet = AttributeSet(output) - - override def outputPartitioning: Partitioning = child.outputPartitioning - - override def requiredChildDistribution: Seq[Distribution] = { - if (groupingAttributes.isEmpty) { - AllTuples :: Nil - } else { - ClusteredDistribution(groupingAttributes) :: Nil - } - } - - override def requiredChildOrdering: Seq[Seq[SortOrder]] = - Seq(groupingAttributes.map(SparkShimImpl.sortOrder(_, Ascending))) - - private val pandasFunction = func.asInstanceOf[GpuPythonUDF].func - - // One batch as input to keep the integrity for each group - override def childrenCoalesceGoal: Seq[CoalesceGoal] = Seq(RequireSingleBatch) - - // The input batch will be split into multiple batches by grouping expression, and - // processed by Python executors group by group, so better to coalesce the output batches. - override def coalesceAfter: Boolean = true - - override def doExecuteColumnar(): RDD[ColumnarBatch] = { - val (mNumInputRows, mNumInputBatches, mNumOutputRows, mNumOutputBatches, - spillCallback) = commonGpuMetrics() - - lazy val isPythonOnGpuEnabled = GpuPythonHelper.isPythonOnGpuEnabled(conf) - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) - val sessionLocalTimeZone = conf.sessionLocalTimeZone - val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) - val localOutput = output - val localChildOutput = child.output - // Python wraps the resulting columns in a single struct column. - val pythonOutputSchema = StructType( - StructField("out_struct", StructType.fromAttributes(localOutput)) :: Nil) - - // Resolve the argument offsets and related attributes. - val GroupArgs(dedupAttrs, argOffsets, groupingOffsets) = - resolveArgOffsets(child, groupingAttributes) - - // Start processing. Map grouped batches to ArrowPythonRunner results. - child.executeColumnar().mapPartitionsInternal { inputIter => - if (isPythonOnGpuEnabled) { - GpuPythonHelper.injectGpuInfo(chainedFunc, isPythonOnGpuEnabled) - PythonWorkerSemaphore.acquireIfNecessary(TaskContext.get()) - } - - // Projects each input batch into the deduplicated schema, and splits - // into separate group batches to sends them to Python group by group later. - val pyInputIter = projectAndGroup(inputIter, localChildOutput, dedupAttrs, groupingOffsets, - mNumInputRows, mNumInputBatches, spillCallback) - - if (pyInputIter.hasNext) { - // Launch Python workers only when the data is not empty. - val pyRunner = new GpuArrowPythonRunner( - chainedFunc, - PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - Array(argOffsets), - StructType.fromAttributes(dedupAttrs), - sessionLocalTimeZone, - pythonRunnerConf, - // The whole group data should be written in a single call, so here is unlimited - Int.MaxValue, - spillCallback.semaphoreWaitTime, - onDataWriteFinished = null, - pythonOutputSchema, - // We can not assert the result batch from Python has the same row number with the - // input batch. Because Grouped Map UDF allows the output of arbitrary length. - // So try to read as many as possible by specifying `minReadTargetBatchSize` as - // `Int.MaxValue` here. 
- Int.MaxValue) - - executePython(pyInputIter, localOutput, pyRunner, mNumOutputRows, mNumOutputBatches) - } else { - // Empty partition, return it directly - inputIter - } - } // end of mapPartitionsInternal - } -} diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/shims/GpuFileScanRDD.scala b/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/shims/GpuFileScanRDD.scala deleted file mode 100644 index 2cbf39dc1e1..00000000000 --- a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/rapids/shims/GpuFileScanRDD.scala +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.rapids.shims - -import java.io.{FileNotFoundException, IOException} - -import org.apache.parquet.io.ParquetDecodingException - -import org.apache.spark.{Partition => RDDPartition, SparkUpgradeException, TaskContext} -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.rdd.{InputFileBlockHolder, RDD} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.QueryExecutionException -import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.util.NextIterator - -/** - * An RDD that scans a list of file partitions. - * Databricks has different versions of FileScanRDD so we copy the - * Apache Spark version. - */ -class GpuFileScanRDD( - @transient private val sparkSession: SparkSession, - readFunction: (PartitionedFile) => Iterator[InternalRow], - @transient val filePartitions: Seq[FilePartition]) - extends RDD[InternalRow](sparkSession.sparkContext, Nil) { - - private val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles - private val ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles - - override def compute(split: RDDPartition, context: TaskContext): Iterator[InternalRow] = { - val iterator = new Iterator[Object] with AutoCloseable { - private val inputMetrics = context.taskMetrics().inputMetrics - private val existingBytesRead = inputMetrics.bytesRead - - // Find a function that will return the FileSystem bytes read by this thread. Do this before - // apply readFunction, because it might read some bytes. - private val getBytesReadCallback = - SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() - - // We get our input bytes from thread-local Hadoop FileSystem statistics. - // If we do a coalesce, however, we are likely to compute multiple partitions in the same - // task and in the same thread, in which case we need to avoid override values written by - // previous partitions (SPARK-13071). 
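The ignoreCorruptFiles and ignoreMissingFiles vals captured at the top of the removed GpuFileScanRDD come straight from the SQL session configuration. A minimal standalone sketch (object name illustrative, local SparkSession assumed) of the two switches that drive that behavior:

import org.apache.spark.sql.SparkSession

object FileScanFlagsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("file-scan-flags")
      .getOrCreate()
    // Both default to false; when enabled, the scan skips corrupted or vanished files
    // instead of failing the task, which is the branch guarded by the two vals above.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
    spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")
    println(spark.conf.get("spark.sql.files.ignoreCorruptFiles")) // true
    spark.stop()
  }
}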
- private def incTaskInputMetricsBytesRead(): Unit = { - inputMetrics.setBytesRead(existingBytesRead + getBytesReadCallback()) - } - - private[this] val files = split.asInstanceOf[FilePartition].files.toIterator - private[this] var currentFile: PartitionedFile = null - private[this] var currentIterator: Iterator[Object] = null - - def hasNext: Boolean = { - // Kill the task in case it has been marked as killed. This logic is from - // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order - // to avoid performance overhead. - context.killTaskIfInterrupted() - (currentIterator != null && currentIterator.hasNext) || nextIterator() - } - def next(): Object = { - val nextElement = currentIterator.next() - // TODO: we should have a better separation of row based and batch based scan, so that we - // don't need to run this `if` for every record. - if (nextElement.isInstanceOf[ColumnarBatch]) { - incTaskInputMetricsBytesRead() - inputMetrics.incRecordsRead(nextElement.asInstanceOf[ColumnarBatch].numRows()) - } else { - // too costly to update every record - if (inputMetrics.recordsRead % - SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) { - incTaskInputMetricsBytesRead() - } - inputMetrics.incRecordsRead(1) - } - nextElement - } - - private def readCurrentFile(): Iterator[InternalRow] = { - try { - readFunction(currentFile) - } catch { - case e: FileNotFoundException => - throw new FileNotFoundException( - e.getMessage + "\n" + - "It is possible the underlying files have been updated. " + - "You can explicitly invalidate the cache in Spark by " + - "running 'REFRESH TABLE tableName' command in SQL or " + - "by recreating the Dataset/DataFrame involved.") - } - } - - /** Advances to the next file. Returns true if a new non-empty iterator is available. */ - private def nextIterator(): Boolean = { - if (files.hasNext) { - currentFile = files.next() - logInfo(s"Reading File $currentFile") - // Sets InputFileBlockHolder for the file block's information - InputFileBlockHolder.set(currentFile.filePath, currentFile.start, currentFile.length) - - if (ignoreMissingFiles || ignoreCorruptFiles) { - currentIterator = new NextIterator[Object] { - // The readFunction may read some bytes before consuming the iterator, e.g., - // vectorized Parquet reader. Here we use lazy val to delay the creation of - // iterator so that we will throw exception in `getNext`. - private lazy val internalIter = readCurrentFile() - - override def getNext(): AnyRef = { - try { - if (internalIter.hasNext) { - internalIter.next() - } else { - finished = true - null - } - } catch { - case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: $currentFile", e) - finished = true - null - // Throw FileNotFoundException even if `ignoreCorruptFiles` is true - case e: FileNotFoundException if !ignoreMissingFiles => throw e - case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => - logWarning( - s"Skipped the rest of the content in the corrupted file: $currentFile", e) - finished = true - null - } - } - - override def close(): Unit = {} - } - } else { - currentIterator = readCurrentFile() - } - - try { - hasNext - } catch { - case e: SchemaColumnConvertNotSupportedException => - val message = "Parquet column cannot be converted in " + - s"file ${currentFile.filePath}. 
Column: ${e.getColumn}, " + - s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}" - throw new QueryExecutionException(message, e) - case e: ParquetDecodingException => - if (e.getCause.isInstanceOf[SparkUpgradeException]) { - throw e.getCause - } else if (e.getMessage.contains("Can not read value at")) { - val message = "Encounter error while reading parquet files. " + - "One possible cause: Parquet column cannot be converted in the " + - "corresponding files. Details: " - throw new QueryExecutionException(message, e) - } - throw e - } - } else { - currentFile = null - InputFileBlockHolder.unset() - false - } - } - - override def close(): Unit = { - incTaskInputMetricsBytesRead() - InputFileBlockHolder.unset() - } - } - - // Register an on-task-completion callback to close the input stream. - context.addTaskCompletionListener[Unit](_ => iterator.close()) - - iterator.asInstanceOf[Iterator[InternalRow]] // This is an erasure hack. - } - - override protected def getPartitions: Array[RDDPartition] = filePartitions.toArray - - override protected def getPreferredLocations(split: RDDPartition): Seq[String] = { - split.asInstanceOf[FilePartition].preferredLocations() - } -} - diff --git a/sql-plugin/src/main/301until304/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/301until304/scala/com/nvidia/spark/rapids/shims/SparkShims.scala deleted file mode 100644 index 5613f378702..00000000000 --- a/sql-plugin/src/main/301until304/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{ShimLoader, ShimVersion} -import org.apache.parquet.schema.MessageType - -import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters - -object SparkShimImpl extends Spark30XShims with Spark30Xuntil33XShims { - - override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion - - override def getParquetFilters( - schema: MessageType, - pushDownDate: Boolean, - pushDownTimestamp: Boolean, - pushDownDecimal: Boolean, - pushDownStartWith: Boolean, - pushDownInFilterThreshold: Int, - caseSensitive: Boolean, - lookupFileMeta: String => String, - dateTimeRebaseModeFromConf: String): ParquetFilters = { - new ParquetFilters(schema, pushDownDate, pushDownTimestamp, pushDownDecimal, pushDownStartWith, - pushDownInFilterThreshold, caseSensitive) - } -} diff --git a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuOrcScan.scala b/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuOrcScan.scala deleted file mode 100644 index 25917a3e317..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuOrcScan.scala +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{GpuOrcScanBase, RapidsConf} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.connector.read.PartitionReaderFactory -import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.execution.datasources.v2.FileScan -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -case class GpuOrcScan( - sparkSession: SparkSession, - hadoopConf: Configuration, - fileIndex: PartitioningAwareFileIndex, - dataSchema: StructType, - readDataSchema: StructType, - readPartitionSchema: StructType, - options: CaseInsensitiveStringMap, - pushedFilters: Array[Filter], - partitionFilters: Seq[Expression], - dataFilters: Seq[Expression], - rapidsConf: RapidsConf, - queryUsesInputFile: Boolean = false) - extends GpuOrcScanBase(sparkSession, hadoopConf, dataSchema, readDataSchema, - readPartitionSchema, pushedFilters, rapidsConf, queryUsesInputFile) with FileScan { - - override def isSplitable(path: Path): Boolean = super.isSplitableBase(path) - - override def createReaderFactory(): PartitionReaderFactory = super.createReaderFactoryBase() - - override def equals(obj: Any): Boolean = obj match { - case o: GpuOrcScan => - super.equals(o) && dataSchema == o.dataSchema && options == o.options && - equivalentFilters(pushedFilters, o.pushedFilters) && rapidsConf == o.rapidsConf && - queryUsesInputFile == o.queryUsesInputFile - case _ => false - } - - override def hashCode(): Int = getClass.hashCode() - - override def description(): String = { - super.description() + ", PushedFilters: " + seqToString(pushedFilters) - } - - override def withFilters( - partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = - this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) -} diff --git a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuParquetScan.scala b/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuParquetScan.scala deleted file mode 100644 index 22ff476a7e3..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuParquetScan.scala +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{GpuParquetScanBase, RapidsConf} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.connector.read.PartitionReaderFactory -import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.execution.datasources.v2.FileScan -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -case class GpuParquetScan( - sparkSession: SparkSession, - hadoopConf: Configuration, - fileIndex: PartitioningAwareFileIndex, - dataSchema: StructType, - readDataSchema: StructType, - readPartitionSchema: StructType, - pushedFilters: Array[Filter], - options: CaseInsensitiveStringMap, - partitionFilters: Seq[Expression], - dataFilters: Seq[Expression], - rapidsConf: RapidsConf, - queryUsesInputFile: Boolean = false) - extends GpuParquetScanBase(sparkSession, hadoopConf, dataSchema, - readDataSchema, readPartitionSchema, pushedFilters, rapidsConf, - queryUsesInputFile) with FileScan { - - override def isSplitable(path: Path): Boolean = super.isSplitableBase(path) - - override def createReaderFactory(): PartitionReaderFactory = super.createReaderFactoryBase() - - override def equals(obj: Any): Boolean = obj match { - case p: GpuParquetScan => - super.equals(p) && dataSchema == p.dataSchema && options == p.options && - equivalentFilters(pushedFilters, p.pushedFilters) && rapidsConf == p.rapidsConf && - queryUsesInputFile == p.queryUsesInputFile - - case _ => false - } - - override def hashCode(): Int = getClass.hashCode() - - override def description(): String = { - super.description() + ", PushedFilters: " + seqToString(pushedFilters) - } - - override def withFilters( - partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = - this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) -} diff --git a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuRowBasedScalaUDF.scala b/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuRowBasedScalaUDF.scala deleted file mode 100644 index 2578019916f..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/GpuRowBasedScalaUDF.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{ExprChecks, ExprRule, GpuOverrides, GpuUserDefinedFunction, RepeatingParamCheck, TypeSig} - -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} -import org.apache.spark.sql.rapids.{GpuRowBasedScalaUDFBase, ScalaUDFMetaBase} -import org.apache.spark.sql.types.DataType - -/** Run a row-based UDF in a GPU operation */ -case class GpuRowBasedScalaUDF( - sparkFunc: AnyRef, - dataType: DataType, - children: Seq[Expression], - inputEncoders: Seq[Option[ExpressionEncoder[_]]], - udfName: Option[String], - nullable: Boolean, - udfDeterministic: Boolean) - extends GpuRowBasedScalaUDFBase(sparkFunc, dataType, children, inputEncoders, None, udfName) { - - override def createInputConverter(i: Int, dataType: DataType): Any => Any = { - if (inputEncoders.isEmpty) { - // for untyped Scala UDF - CatalystTypeConverters.createToScalaConverter(dataType) - } else { - val encoder = inputEncoders(i) - if (encoder.isDefined && encoder.get.isSerializedAsStructForTopLevel) { - val fromRow = encoder.get.resolveAndBind().createDeserializer() - row: Any => fromRow(row.asInstanceOf[InternalRow]) - } else { - CatalystTypeConverters.createToScalaConverter(dataType) - } - } - } - - override val checkNull: Boolean = false -} - -object GpuScalaUDFMeta { - def exprMeta: ExprRule[ScalaUDF] = GpuOverrides.expr[ScalaUDF]( - "User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface " + - "to get better performance.", - ExprChecks.projectOnly( - GpuUserDefinedFunction.udfTypeSig, - TypeSig.all, - repeatingParamCheck = - Some(RepeatingParamCheck("param", GpuUserDefinedFunction.udfTypeSig, TypeSig.all))), - (expr, conf, p, r) => new ScalaUDFMetaBase(expr, conf, p, r) { - override protected def rowBasedScalaUDF: GpuRowBasedScalaUDFBase = - GpuRowBasedScalaUDF( - expr.function, - expr.dataType, - childExprs.map(_.convertToGpu()), - expr.inputEncoders, - expr.udfName, - expr.nullable, - expr.udfDeterministic) - }) -} diff --git a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/OffsetWindowFunctionMeta.scala b/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/OffsetWindowFunctionMeta.scala deleted file mode 100644 index db75dabe32e..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/com/nvidia/spark/rapids/shims/OffsetWindowFunctionMeta.scala +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
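For context on the createInputConverter logic in the removed GpuRowBasedScalaUDF above: in the untyped and non-struct cases it falls back to CatalystTypeConverters.createToScalaConverter, which maps Catalyst internal values to plain Scala/Java ones. A tiny standalone sketch of that fallback (object name illustrative; StringType chosen only because the result is easy to print):

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

object ToScalaConverterSketch extends App {
  // Internally Spark stores strings as UTF8String; the converter hands back a java.lang.String.
  val toScala: Any => Any = CatalystTypeConverters.createToScalaConverter(StringType)
  println(toScala(UTF8String.fromString("hello"))) // hello
}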
- */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, ExprMeta, GpuOverrides, RapidsConf, RapidsMeta} - -import org.apache.spark.sql.catalyst.expressions.{Lag, Lead, Literal, OffsetWindowFunction} -import org.apache.spark.sql.types.IntegerType - -abstract class OffsetWindowFunctionMeta[INPUT <: OffsetWindowFunction] ( - expr: INPUT, - conf: RapidsConf, - parent: Option[RapidsMeta[_, _, _]], - rule: DataFromReplacementRule) - extends ExprMeta[INPUT](expr, conf, parent, rule) { - lazy val input: BaseExprMeta[_] = GpuOverrides.wrapExpr(expr.input, conf, Some(this)) - lazy val offset: BaseExprMeta[_] = { - expr match { - case _: Lead => // Supported. - case _: Lag => // Supported. - case other => - throw new IllegalStateException( - s"Only LEAD/LAG offset window functions are supported. Found: $other") - } - - val literalOffset = GpuOverrides.extractLit(expr.offset) match { - case Some(Literal(offset: Int, IntegerType)) => - Literal(offset, IntegerType) - case _ => - throw new IllegalStateException( - s"Only integer literal offsets are supported for LEAD/LAG. Found: ${expr.offset}") - } - - GpuOverrides.wrapExpr(literalOffset, conf, Some(this)) - } - lazy val default: BaseExprMeta[_] = GpuOverrides.wrapExpr(expr.default, conf, Some(this)) - - override val childExprs: Seq[BaseExprMeta[_]] = Seq(input, offset, default) - - override def tagExprForGpu(): Unit = { - expr match { - case _: Lead => // Supported. - case _: Lag => // Supported. - case other => - willNotWorkOnGpu( s"Only LEAD/LAG offset window functions are supported. Found: $other") - } - - if (GpuOverrides.extractLit(expr.offset).isEmpty) { // Not a literal offset. - willNotWorkOnGpu( - s"Only integer literal offsets are supported for LEAD/LAG. Found: ${expr.offset}") - } - } -} diff --git a/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/aggregate/GpuSum.scala b/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/aggregate/GpuSum.scala deleted file mode 100644 index 09bbf280896..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/aggregate/GpuSum.scala +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.aggregate - -object GpuSumDefaults { - val hasIsEmptyField: Boolean = false -} \ No newline at end of file diff --git a/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/shims/GpuSchemaUtils.scala b/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/shims/GpuSchemaUtils.scala deleted file mode 100644 index 48ba95be0c3..00000000000 --- a/sql-plugin/src/main/301until310-all/scala/org/apache/spark/sql/rapids/shims/GpuSchemaUtils.scala +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
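The OffsetWindowFunctionMeta removed above only accepts LEAD/LAG whose offset is an integer literal; anything else is tagged back to the CPU. A standalone sketch of that literal check using plain Catalyst classes (object and method names are illustrative; the real meta additionally unwraps literals via GpuOverrides.extractLit):

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
import org.apache.spark.sql.types.IntegerType

object OffsetLiteralCheck extends App {
  // Returns the offset only when it is a plain integer literal, mirroring the meta's rule.
  def literalOffset(offset: Expression): Option[Int] = offset match {
    case Literal(value: Int, IntegerType) => Some(value)
    case _                                => None
  }

  println(literalOffset(Literal(2)))   // Some(2) -> supported shape for LEAD/LAG on the GPU
  println(literalOffset(Literal("2"))) // None    -> not an integer literal, stays on the CPU
}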
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.shims - -import org.apache.spark.sql.catalyst.analysis.Resolver -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.SchemaUtils - -object GpuSchemaUtils { - - def checkColumnNameDuplication( - schema: StructType, - colType: String, - resolver: Resolver): Unit = { - SchemaUtils.checkColumnNameDuplication(schema.map(_.name), colType, resolver) - } -} diff --git a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala b/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala deleted file mode 100644 index 228b7de7872..00000000000 --- a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuJoinUtils.scala +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids.{GpuBuildLeft, GpuBuildRight, GpuBuildSide} - -import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} - -object GpuJoinUtils { - def getGpuBuildSide(buildSide: BuildSide): GpuBuildSide = { - buildSide match { - case BuildRight => GpuBuildRight - case BuildLeft => GpuBuildLeft - case _ => throw new Exception(s"unknown build side type $buildSide") - } - } -} diff --git a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceMeta.scala b/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceMeta.scala deleted file mode 100644 index 630216597d8..00000000000 --- a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/GpuRegExpReplaceMeta.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids._ - -import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, RegExpReplace} -import org.apache.spark.sql.rapids.{GpuRegExpReplace, GpuRegExpUtils, GpuStringReplace} -import org.apache.spark.sql.types.DataTypes -import org.apache.spark.unsafe.types.UTF8String - -class GpuRegExpReplaceMeta( - expr: RegExpReplace, - conf: RapidsConf, - parent: Option[RapidsMeta[_, _, _]], - rule: DataFromReplacementRule) - extends TernaryExprMeta[RegExpReplace](expr, conf, parent, rule) { - - private var pattern: Option[String] = None - private var replacement: Option[String] = None - - override def tagExprForGpu(): Unit = { - GpuRegExpUtils.tagForRegExpEnabled(this) - expr.regexp match { - case Literal(s: UTF8String, DataTypes.StringType) if s != null => - if (GpuOverrides.isSupportedStringReplacePattern(expr.regexp)) { - // use GpuStringReplace - } else { - try { - pattern = Some(new CudfRegexTranspiler(RegexReplaceMode).transpile(s.toString)) - } catch { - case e: RegexUnsupportedException => - willNotWorkOnGpu(e.getMessage) - } - } - - case _ => - willNotWorkOnGpu(s"only non-null literal strings are supported on GPU") - } - - expr.rep match { - case Literal(s: UTF8String, DataTypes.StringType) if s != null => - if (GpuRegExpUtils.containsBackrefs(s.toString)) { - willNotWorkOnGpu("regexp_replace with back-references is not supported") - } - replacement = Some(GpuRegExpUtils.unescapeReplaceString(s.toString)) - case _ => - } - } - - override def convertToGpu( - lhs: Expression, - regexp: Expression, - rep: Expression): GpuExpression = { - if (GpuOverrides.isSupportedStringReplacePattern(expr.regexp)) { - GpuStringReplace(lhs, regexp, rep) - } else { - (pattern, replacement) match { - case (Some(cudfPattern), Some(cudfReplacement)) => - GpuRegExpReplace(lhs, regexp, rep, cudfPattern, cudfReplacement) - case _ => - throw new IllegalStateException("Expression has not been tagged correctly") - } - } - } -} diff --git a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/Spark30XShims.scala b/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/Spark30XShims.scala deleted file mode 100644 index f9da8dc0887..00000000000 --- a/sql-plugin/src/main/301until310-nondb/scala/com/nvidia/spark/rapids/shims/Spark30XShims.scala +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
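To make the branches of the removed GpuRegExpReplaceMeta concrete: a literal pattern with a back-reference-free replacement is the shape the plugin can transpile for the GPU, while a replacement containing back-references is explicitly tagged off the GPU. A small runnable sketch using only public Spark APIs (object name illustrative, local SparkSession assumed):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, regexp_replace}

object RegexpReplaceShapes {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("regexp-shapes").getOrCreate()
    import spark.implicits._

    val df = Seq("ab-12", "cd-34").toDF("s")
    // Literal pattern and replacement, no back-references: the meta above would try to
    // transpile this pattern and keep the expression on the GPU.
    df.select(regexp_replace(col("s"), "[0-9]+", "#").as("masked")).show()
    // Back-reference in the replacement: the removed meta calls willNotWorkOnGpu for this.
    df.select(regexp_replace(col("s"), "([a-z]+)-([0-9]+)", "$2-$1").as("swapped")).show()

    spark.stop()
  }
}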
- */ - -package com.nvidia.spark.rapids.shims - -import java.nio.ByteBuffer - -import com.nvidia.spark.rapids._ -import org.apache.arrow.memory.ReferenceManager -import org.apache.arrow.vector.ValueVector - -import org.apache.spark.SparkEnv -import org.apache.spark.internal.Logging -import org.apache.spark.rapids.shims.GpuShuffleExchangeExec -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.errors.attachTree -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.Average -import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning} -import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, ShuffleQueryStageExec} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InMemoryFileIndex} -import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec -import org.apache.spark.sql.execution.python.{AggregateInPandasExec, ArrowEvalPythonExec, FlatMapGroupsInPandasExec, MapInPandasExec, WindowInPandasExec} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.{GpuAbs, GpuAverage, GpuFileSourceScanExec, GpuTimeSub} -import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase -import org.apache.spark.sql.rapids.execution.python._ -import org.apache.spark.sql.rapids.execution.python.shims._ -import org.apache.spark.sql.types._ -import org.apache.spark.storage.{BlockId, BlockManagerId} -import org.apache.spark.unsafe.types.CalendarInterval - -abstract class Spark30XShims extends Spark301until320Shims with Logging { - override def int96ParquetRebaseRead(conf: SQLConf): String = - parquetRebaseRead(conf) - override def int96ParquetRebaseWrite(conf: SQLConf): String = - parquetRebaseWrite(conf) - override def int96ParquetRebaseReadKey: String = - parquetRebaseReadKey - override def int96ParquetRebaseWriteKey: String = - parquetRebaseWriteKey - override def hasSeparateINT96RebaseConf: Boolean = false - - override def getScalaUDFAsExpression( - function: AnyRef, - dataType: DataType, - children: Seq[Expression], - inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Nil, - outputEncoder: Option[ExpressionEncoder[_]] = None, - udfName: Option[String] = None, - nullable: Boolean = true, - udfDeterministic: Boolean = true): Expression = { - // outputEncoder is only used in Spark 3.1+ - ScalaUDF(function, dataType, children, inputEncoders, udfName, nullable, udfDeterministic) - } - - override def getMapSizesByExecutorId( - shuffleId: Int, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { - SparkEnv.get.mapOutputTracker.getMapSizesByRange(shuffleId, - startMapIndex, endMapIndex, startPartition, endPartition) - } - - override def getGpuShuffleExchangeExec( - gpuOutputPartitioning: GpuPartitioning, - child: SparkPlan, - cpuOutputPartitioning: Partitioning, - cpuShuffle: Option[ShuffleExchangeExec]): GpuShuffleExchangeExecBase = { - val canChangeNumPartitions = cpuShuffle.forall(_.canChangeNumPartitions) - GpuShuffleExchangeExec(gpuOutputPartitioning, child, canChangeNumPartitions)( - cpuOutputPartitioning) - } - - override def getGpuShuffleExchangeExec( - queryStage: ShuffleQueryStageExec): GpuShuffleExchangeExecBase = { - queryStage.shuffle.asInstanceOf[GpuShuffleExchangeExecBase] - } - - 
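One subtle default in the exchange wiring just above: getGpuShuffleExchangeExec computes canChangeNumPartitions with cpuShuffle.forall(_.canChangeNumPartitions), so when no CPU ShuffleExchangeExec is supplied the flag defaults to true. A tiny illustration of that Option#forall behavior (names are illustrative only):

object ForallDefaultSketch extends App {
  val noCpuExchange: Option[Boolean] = None          // no CPU exchange to consult
  val fromCpuExchange: Option[Boolean] = Some(false) // CPU plan says the count is fixed
  println(noCpuExchange.forall(identity))   // true  -> partition count may be changed
  println(fromCpuExchange.forall(identity)) // false -> the CPU plan's restriction is kept
}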
override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = { - Seq( - GpuOverrides.exec[WindowInPandasExec]( - "The backend for Window Aggregation Pandas UDF, Accelerates the data transfer between" + - " the Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled. For now it only supports row based window frame.", - ExecChecks( - (TypeSig.commonCudfTypes + TypeSig.ARRAY).nested(TypeSig.commonCudfTypes), - TypeSig.all), - (winPy, conf, p, r) => new GpuWindowInPandasExecMetaBase(winPy, conf, p, r) { - override val windowExpressions: Seq[BaseExprMeta[NamedExpression]] = - winPy.windowExpression.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - - override def convertToGpu(): GpuExec = { - GpuWindowInPandasExec( - windowExpressions.map(_.convertToGpu()), - partitionSpec.map(_.convertToGpu()), - // leave ordering expression on the CPU, it's not used for GPU computation - winPy.orderSpec, - childPlans.head.convertIfNeeded() - )(winPy.partitionSpec) - } - }).disabledByDefault("it only supports row based frame for now"), - GpuOverrides.exec[FileSourceScanExec]( - "Reading data from files, often from Hive tables", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.STRUCT + TypeSig.MAP + - TypeSig.ARRAY + TypeSig.DECIMAL_128).nested(), TypeSig.all), - (fsse, conf, p, r) => new SparkPlanMeta[FileSourceScanExec](fsse, conf, p, r) { - - // Replaces SubqueryBroadcastExec inside dynamic pruning filters with GPU counterpart - // if possible. Instead regarding filters as childExprs of current Meta, we create - // a new meta for SubqueryBroadcastExec. The reason is that the GPU replacement of - // FileSourceScan is independent from the replacement of the partitionFilters. It is - // possible that the FileSourceScan is on the CPU, while the dynamic partitionFilters - // are on the GPU. And vice versa. 
- private lazy val partitionFilters = { - val convertBroadcast = (bc: SubqueryBroadcastExec) => { - val meta = GpuOverrides.wrapAndTagPlan(bc, conf) - meta.tagForExplain() - meta.convertIfNeeded().asInstanceOf[BaseSubqueryExec] - } - wrapped.partitionFilters.map { filter => - filter.transformDown { - case dpe @ DynamicPruningExpression(inSub: InSubqueryExec) => - inSub.plan match { - case bc: SubqueryBroadcastExec => - dpe.copy(inSub.copy(plan = convertBroadcast(bc))) - case reuse @ ReusedSubqueryExec(bc: SubqueryBroadcastExec) => - dpe.copy(inSub.copy(plan = reuse.copy(convertBroadcast(bc)))) - case _ => - dpe - } - } - } - } - - // partition filters and data filters are not run on the GPU - override val childExprs: Seq[ExprMeta[_]] = Seq.empty - - override def tagPlanForGpu(): Unit = GpuFileSourceScanExec.tagSupport(this) - - override def convertToCpu(): SparkPlan = { - wrapped.copy(partitionFilters = partitionFilters) - } - - override def convertToGpu(): GpuExec = { - val sparkSession = wrapped.relation.sparkSession - val options = wrapped.relation.options - - val location = replaceWithAlluxioPathIfNeeded( - conf, - wrapped.relation, - partitionFilters, - wrapped.dataFilters) - - val newRelation = HadoopFsRelation( - location, - wrapped.relation.partitionSchema, - wrapped.relation.dataSchema, - wrapped.relation.bucketSpec, - GpuFileSourceScanExec.convertFileFormat(wrapped.relation.fileFormat), - options)(sparkSession) - - GpuFileSourceScanExec( - newRelation, - wrapped.output, - wrapped.requiredSchema, - partitionFilters, - wrapped.optionalBucketSet, - None, - wrapped.dataFilters, - wrapped.tableIdentifier)(conf) - } - }), - GpuOverrides.exec[ArrowEvalPythonExec]( - "The backend of the Scalar Pandas UDFs. Accelerates the data transfer between the" + - " Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled", - ExecChecks( - (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(), - TypeSig.all), - (e, conf, p, r) => - new SparkPlanMeta[ArrowEvalPythonExec](e, conf, p, r) { - val udfs: Seq[BaseExprMeta[PythonUDF]] = - e.udfs.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - val resultAttrs: Seq[BaseExprMeta[Attribute]] = - e.resultAttrs.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - override val childExprs: Seq[BaseExprMeta[_]] = udfs ++ resultAttrs - - override def replaceMessage: String = "partially run on GPU" - override def noReplacementPossibleMessage(reasons: String): String = - s"cannot run even partially on the GPU because $reasons" - - override def convertToGpu(): GpuExec = - GpuArrowEvalPythonExec(udfs.map(_.convertToGpu()).asInstanceOf[Seq[GpuPythonUDF]], - resultAttrs.map(_.convertToGpu()).asInstanceOf[Seq[Attribute]], - childPlans.head.convertIfNeeded(), - e.evalType) - }), - GpuOverrides.exec[MapInPandasExec]( - "The backend for Map Pandas Iterator UDF. Accelerates the data transfer between the" + - " Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(), - TypeSig.all), - (mapPy, conf, p, r) => new GpuMapInPandasExecMeta(mapPy, conf, p, r)), - GpuOverrides.exec[FlatMapGroupsInPandasExec]( - "The backend for Flat Map Groups Pandas UDF, Accelerates the data transfer between the" + - " Java process and the Python process. 
It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks(TypeSig.commonCudfTypes, TypeSig.all), - (flatPy, conf, p, r) => new GpuFlatMapGroupsInPandasExecMeta(flatPy, conf, p, r)), - GpuOverrides.exec[AggregateInPandasExec]( - "The backend for an Aggregation Pandas UDF, this accelerates the data transfer between" + - " the Java process and the Python process. It also supports scheduling GPU resources" + - " for the Python process when enabled.", - ExecChecks(TypeSig.commonCudfTypes, TypeSig.all), - (aggPy, conf, p, r) => new GpuAggregateInPandasExecMeta(aggPy, conf, p, r)) - ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap - } - - protected def getExprsSansTimeSub: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = { - Seq( - GpuOverrides.expr[Cast]( - "Convert a column of one type of data into another type", - new CastChecks(), - (cast, conf, p, r) => new CastExprMeta[Cast](cast, - SparkSession.active.sessionState.conf.ansiEnabled, conf, p, r, - doFloatToIntCheck = false, stringToAnsiDate = false)), - GpuOverrides.expr[AnsiCast]( - "Convert a column of one type of data into another type", - new CastChecks(), - (cast, conf, p, r) => new CastExprMeta[AnsiCast](cast, ansiEnabled = true, conf = conf, - parent = p, rule = r, doFloatToIntCheck = false, stringToAnsiDate = false)), - GpuOverrides.expr[Average]( - "Average aggregate operator", - ExprChecks.fullAgg( - TypeSig.DOUBLE + TypeSig.DECIMAL_128, - TypeSig.DOUBLE + TypeSig.DECIMAL_128, - Seq(ParamCheck("input", - TypeSig.integral + TypeSig.fp + TypeSig.DECIMAL_128, - TypeSig.cpuNumeric))), - (a, conf, p, r) => new AggExprMeta[Average](a, conf, p, r) { - override def tagAggForGpu(): Unit = { - // For Decimal Average the SUM adds a precision of 10 to avoid overflowing - // then it divides by the count with an output scale that is 4 more than the input - // scale. With how our divide works to match Spark, this means that we will need a - // precision of 5 more. So 38 - 10 - 5 = 23 - val dataType = a.child.dataType - dataType match { - case dt: DecimalType => - if (dt.precision > 23) { - if (conf.needDecimalGuarantees) { - willNotWorkOnGpu("GpuAverage cannot guarantee proper overflow checks for " + - s"a precision large than 23. 
The current precision is ${dt.precision}") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"Average(${a.child.dataType}) produces $dt with an " + - s"intermediate precision of ${dt.precision + 15}") - } - } - case _ => // NOOP - } - GpuOverrides.checkAndTagFloatAgg(dataType, conf, this) - } - - override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = - GpuAverage(childExprs.head) - - // Average is not supported in ANSI mode right now, no matter the type - override val ansiTypeToCheck: Option[DataType] = None - }), - GpuOverrides.expr[Abs]( - "Absolute value", - ExprChecks.unaryProjectAndAstInputMatchesOutput( - TypeSig.implicitCastsAstTypes, TypeSig.gpuNumeric, - TypeSig.cpuNumeric), - (a, conf, p, r) => new UnaryAstExprMeta[Abs](a, conf, p, r) { - // ANSI support for ABS was added in 3.2.0 SPARK-33275 - override def convertToGpu(child: Expression): GpuExpression = GpuAbs(child, false) - }), - GpuOverrides.expr[RegExpReplace]( - "String replace using a regular expression pattern", - ExprChecks.projectOnly(TypeSig.STRING, TypeSig.STRING, - Seq(ParamCheck("str", TypeSig.STRING, TypeSig.STRING), - ParamCheck("regex", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING), - ParamCheck("rep", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING))), - (a, conf, p, r) => new GpuRegExpReplaceMeta(a, conf, p, r)), - GpuScalaUDFMeta.exprMeta - ).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap - } - - override def getExprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = { - getExprsSansTimeSub + (classOf[TimeSub] -> GpuOverrides.expr[TimeSub]( - "Subtracts interval from timestamp", - ExprChecks.binaryProject(TypeSig.TIMESTAMP, TypeSig.TIMESTAMP, - ("start", TypeSig.TIMESTAMP, TypeSig.TIMESTAMP), - ("interval", TypeSig.lit(TypeEnum.CALENDAR) - .withPsNote(TypeEnum.CALENDAR, "months not supported"), TypeSig.CALENDAR)), - (timeSub, conf, p, r) => new BinaryExprMeta[TimeSub](timeSub, conf, p, r) { - override def tagExprForGpu(): Unit = { - timeSub.interval match { - case Literal(intvl: CalendarInterval, DataTypes.CalendarIntervalType) => - if (intvl.months != 0) { - willNotWorkOnGpu("interval months isn't supported") - } - case _ => - } - checkTimeZoneId(timeSub.timeZoneId) - } - - override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = - GpuTimeSub(lhs, rhs) - })) - } - - // Hardcoded for Spark-3.0.* - override def getFileSourceMaxMetadataValueLength(sqlConf: SQLConf): Int = 100 - - override def getGpuColumnarToRowTransition(plan: SparkPlan, - exportColumnRdd: Boolean): GpuColumnarToRowExecParent = { - GpuColumnarToRowExec(plan, exportColumnRdd) - } - - override def sortOrder( - child: Expression, - direction: SortDirection, - nullOrdering: NullOrdering): SortOrder = SortOrder(child, direction, nullOrdering, Set.empty) - - override def copySortOrderWithNewChild(s: SortOrder, child: Expression): SortOrder = { - s.copy(child = child) - } - - override def shouldIgnorePath(path: String): Boolean = { - InMemoryFileIndex.shouldFilterOut(path) - } - - override def getLegacyComplexTypeToString(): Boolean = true - - // Arrow version changed between Spark versions - override def getArrowDataBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getDataBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def shouldFailDivByZero(): Boolean = false - - /** dropped by SPARK-34234 */ - override def attachTreeIfSupported[TreeType <: TreeNode[_], A]( - tree: TreeType, - msg: String)( - f: => A - ): 
A = { - attachTree(tree, msg)(f) - } - - override def hasCastFloatTimestampUpcast: Boolean = false - - override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = { - adaptivePlan.initialPlan - } - - // this is to help with an optimization in Spark 3.1, so we disable it by default in Spark 3.0.x - override def isEmptyRelation(relation: Any): Boolean = false - override def tryTransformIfEmptyRelation(mode: BroadcastMode): Option[Any] = None - - override def supportsColumnarAdaptivePlans: Boolean = false - - override def columnarAdaptivePlan(a: AdaptiveSparkPlanExec, goal: CoalesceSizeGoal): SparkPlan = { - // When the input is an adaptive plan we do not get to see the GPU version until - // the plan is executed and sometimes the plan will have a GpuColumnarToRowExec as the - // final operator and we can bypass this to keep the data columnar by inserting - // the [[AvoidAdaptiveTransitionToRow]] operator here - AvoidAdaptiveTransitionToRow(GpuRowToColumnarExec(a, goal)) - } -} diff --git a/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala b/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala deleted file mode 100644 index 03203b3ecff..00000000000 --- a/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
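Circling back to the decimal Average rule removed a little above: its comment budgets 10 extra digits for the intermediate SUM and roughly 5 for the Spark-compatible divide, which is why inputs wider than 38 - 10 - 5 = 23 digits lose the overflow guarantee (and why the warning mentions an intermediate precision of precision + 15). A standalone sketch of that arithmetic (object and value names are illustrative only):

object DecimalAveragePrecisionSketch extends App {
  val MaxPrecision   = 38 // widest decimal handled here
  val SumHeadroom    = 10 // extra digits the intermediate SUM needs
  val DivideHeadroom = 5  // extra digits for the divide that matches Spark's scale + 4 output
  val maxGuaranteed  = MaxPrecision - SumHeadroom - DivideHeadroom // 23

  // Mirrors the check in the removed rule: wider inputs cannot get guaranteed overflow checks.
  def overflowChecksGuaranteed(inputPrecision: Int): Boolean = inputPrecision <= maxGuaranteed

  println(overflowChecksGuaranteed(23)) // true
  println(overflowChecksGuaranteed(24)) // false -> behavior then depends on needDecimalGuarantees
}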
- */ -package org.apache.spark.rapids.shims - -import com.nvidia.spark.rapids.GpuPartitioning - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.plans.logical.Statistics -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{ShufflePartitionSpec, SparkPlan} -import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike -import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBaseWithMetrics, ShuffledBatchRDD} - -case class GpuShuffleExchangeExec( - gpuOutputPartitioning: GpuPartitioning, - child: SparkPlan, - canChangeNumPartitions: Boolean)( - cpuOutputPartitioning: Partitioning) - extends GpuShuffleExchangeExecBaseWithMetrics(gpuOutputPartitioning, child) - with ShuffleExchangeLike { - - override def otherCopyArgs: Seq[AnyRef] = cpuOutputPartitioning :: Nil - - override val outputPartitioning: Partitioning = cpuOutputPartitioning - - override def numMappers: Int = shuffleDependencyColumnar.rdd.getNumPartitions - - override def numPartitions: Int = shuffleDependencyColumnar.partitioner.numPartitions - - override def getShuffleRDD(partitionSpecs: Array[ShufflePartitionSpec]): RDD[_] = { - new ShuffledBatchRDD(shuffleDependencyColumnar, metrics ++ readMetrics, partitionSpecs) - } - - override def runtimeStatistics: Statistics = { - // note that Spark will only use the sizeInBytes statistic but making the rowCount - // available here means that we can more easily reference it in GpuOverrides when - // planning future query stages when AQE is on - Statistics( - sizeInBytes = metrics("dataSize").value, - rowCount = Some(metrics("numOutputRows").value) - ) - } -} diff --git a/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala b/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala deleted file mode 100644 index 3ae5a44b6dc..00000000000 --- a/sql-plugin/src/main/301until310-nondb/scala/org/apache/spark/rapids/shims/ShuffledBatchRDDUtil.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rapids.shims - -import com.nvidia.spark.rapids.shims.SparkShimImpl - -import org.apache.spark.{MapOutputTrackerMaster, Partition, ShuffleDependency, SparkEnv, TaskContext} -import org.apache.spark.shuffle.ShuffleReader -import org.apache.spark.sql.execution.{CoalescedPartitionSpec, PartialMapperPartitionSpec, PartialReducerPartitionSpec} -import org.apache.spark.sql.execution.metric.SQLShuffleReadMetricsReporter -import org.apache.spark.sql.rapids.execution.ShuffledBatchRDDPartition -import org.apache.spark.sql.vectorized.ColumnarBatch - -/** - * Some APIs for the ShuffledBatchRDD are only accessible from org.apache.spark... - * This code tries to match the Spark code as closely as possible. 
Fixing a compiler or IDE - * warning is not always the best thing here because if it changes how it matches up with the - * Spark code it may make it harder to maintain as thing change in Spark. - */ -object ShuffledBatchRDDUtil { - def preferredLocations( - partition: Partition, - dependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch]): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - partition.asInstanceOf[ShuffledBatchRDDPartition].spec match { - case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => - // TODO order by partition size. - startReducerIndex.until(endReducerIndex).flatMap { reducerIndex => - tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) - } - - case PartialReducerPartitionSpec(_, startMapIndex, endMapIndex) => - tracker.getMapLocation(dependency, startMapIndex, endMapIndex) - - case PartialMapperPartitionSpec(mapIndex, _, _) => - tracker.getMapLocation(dependency, mapIndex, mapIndex + 1) - } - } - - def getReaderAndPartSize( - split: Partition, - context: TaskContext, - dependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch], - sqlMetricsReporter: SQLShuffleReadMetricsReporter): - (ShuffleReader[Nothing, Nothing], Long) = { - val shim = SparkShimImpl - split.asInstanceOf[ShuffledBatchRDDPartition].spec match { - case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => - val reader = SparkEnv.get.shuffleManager.getReader( - dependency.shuffleHandle, - startReducerIndex, - endReducerIndex, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, startReducerIndex, endReducerIndex) - val partitionSize = blocksByAddress.flatMap(_._2).map(_._2).sum - (reader, partitionSize) - - case PartialReducerPartitionSpec(reducerIndex, startMapIndex, endMapIndex) => - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - startMapIndex, - endMapIndex, - reducerIndex, - reducerIndex + 1, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, reducerIndex, - reducerIndex + 1) - val partitionSize = blocksByAddress.flatMap(_._2) - .filter(tuple => tuple._3 >= startMapIndex && tuple._3 < endMapIndex) - .map(_._2).sum - (reader, partitionSize) - case PartialMapperPartitionSpec(mapIndex, startReducerIndex, endReducerIndex) => - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - mapIndex, - mapIndex + 1, - startReducerIndex, - endReducerIndex, - context, - sqlMetricsReporter) - val blocksByAddress = shim.getMapSizesByExecutorId( - dependency.shuffleHandle.shuffleId, 0, Int.MaxValue, startReducerIndex, endReducerIndex) - val partitionSize = blocksByAddress.flatMap(_._2) - .filter(_._3 == mapIndex) - .map(_._2).sum - (reader, partitionSize) - } - } -} diff --git a/sql-plugin/src/main/301until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark301until320Shims.scala b/sql-plugin/src/main/301until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark301until320Shims.scala deleted file mode 100644 index 3e77552e5f5..00000000000 --- a/sql-plugin/src/main/301until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark301until320Shims.scala +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.shims - -import java.net.URI -import java.nio.ByteBuffer - -import scala.collection.mutable.ListBuffer - -import com.esotericsoftware.kryo.Kryo -import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} -import com.nvidia.spark.rapids._ -import com.nvidia.spark.rapids.GpuOverrides.exec -import org.apache.arrow.memory.ReferenceManager -import org.apache.arrow.vector.ValueVector -import org.apache.hadoop.fs.{FileStatus, Path} - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.Resolver -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical.BroadcastMode -import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils} -import org.apache.spark.sql.connector.read.Scan -import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, CustomShuffleReaderExec, QueryStageExec, ShuffleQueryStageExec} -import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand} -import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.datasources.rapids.GpuPartitioningUtils -import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan -import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} -import org.apache.spark.sql.execution.joins._ -import org.apache.spark.sql.execution.window.WindowExecBase -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids._ -import org.apache.spark.sql.rapids.execution.{GpuCustomShuffleReaderExec, SerializeBatchDeserializeHostBuffer, SerializeConcatHostBuffersDeserializeBatch} -import org.apache.spark.sql.rapids.shims.GpuSchemaUtils -import org.apache.spark.sql.sources.BaseRelation -import org.apache.spark.sql.types._ - -/** -* Shim base class that can be compiled with from 301 until 320 -*/ -trait Spark301until320Shims extends SparkShims { - override def parquetRebaseReadKey: String = - SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key - override def parquetRebaseWriteKey: String = - SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key - override def avroRebaseReadKey: String = - SQLConf.LEGACY_AVRO_REBASE_MODE_IN_READ.key - override def avroRebaseWriteKey: String = - SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key - override def parquetRebaseRead(conf: SQLConf): String = - conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ) - override def parquetRebaseWrite(conf: SQLConf): String = - conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE) - - override def sessionFromPlan(plan: SparkPlan): SparkSession = { - plan.sqlContext.sparkSession - } - - override def filesFromFileIndex(fileIndex: PartitioningAwareFileIndex): Seq[FileStatus] = { - fileIndex.allFiles() 
- } - - def broadcastModeTransform(mode: BroadcastMode, rows: Array[InternalRow]): Any = - mode.transform(rows) - - override def newBroadcastQueryStageExec( - old: BroadcastQueryStageExec, - newPlan: SparkPlan): BroadcastQueryStageExec = BroadcastQueryStageExec(old.id, newPlan) - - override def getDateFormatter(): DateFormatter = { - DateFormatter(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) - } - - override def isExchangeOp(plan: SparkPlanMeta[_]): Boolean = { - // if the child query stage already executed on GPU then we need to keep the - // next operator on GPU in these cases - SQLConf.get.adaptiveExecutionEnabled && (plan.wrapped match { - case _: CustomShuffleReaderExec - | _: ShuffledHashJoinExec - | _: BroadcastHashJoinExec - | _: BroadcastExchangeExec - | _: BroadcastNestedLoopJoinExec => true - case _ => false - }) - } - - override def isAqePlan(p: SparkPlan): Boolean = p match { - case _: AdaptiveSparkPlanExec | - _: QueryStageExec | - _: CustomShuffleReaderExec => true - case _ => false - } - - override def isCustomReaderExec(x: SparkPlan): Boolean = x match { - case _: GpuCustomShuffleReaderExec | _: CustomShuffleReaderExec => true - case _ => false - } - - override def aqeShuffleReaderExec: ExecRule[_ <: SparkPlan] = exec[CustomShuffleReaderExec]( - "A wrapper of shuffle query stage", - ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_128 + TypeSig.ARRAY + - TypeSig.STRUCT + TypeSig.MAP).nested(), TypeSig.all), - (exec, conf, p, r) => new GpuCustomShuffleReaderMeta(exec, conf, p, r)) - - override def findOperators(plan: SparkPlan, predicate: SparkPlan => Boolean): Seq[SparkPlan] = { - def recurse( - plan: SparkPlan, - predicate: SparkPlan => Boolean, - accum: ListBuffer[SparkPlan]): Seq[SparkPlan] = { - if (predicate(plan)) { - accum += plan - } - plan match { - case a: AdaptiveSparkPlanExec => recurse(a.executedPlan, predicate, accum) - case qs: BroadcastQueryStageExec => recurse(qs.broadcast, predicate, accum) - case qs: ShuffleQueryStageExec => recurse(qs.shuffle, predicate, accum) - case other => other.children.flatMap(p => recurse(p, predicate, accum)).headOption - } - accum - } - recurse(plan, predicate, new ListBuffer[SparkPlan]()) - } - - override def skipAssertIsOnTheGpu(plan: SparkPlan): Boolean = false - - override def shouldFailDivOverflow(): Boolean = false - - override def leafNodeDefaultParallelism(ss: SparkSession): Int = { - ss.sparkContext.defaultParallelism - } - - override def shouldFallbackOnAnsiTimestamp(): Boolean = false - - override def getLegacyStatisticalAggregate(): Boolean = true - - override def v1RepairTableCommand(tableName: TableIdentifier): RunnableCommand = - AlterTableRecoverPartitionsCommand(tableName) - - override def isWindowFunctionExec(plan: SparkPlan): Boolean = plan.isInstanceOf[WindowExecBase] - - override def getScans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]] = Seq( - GpuOverrides.scan[ParquetScan]( - "Parquet parsing", - (a, conf, p, r) => new RapidsParquetScanMeta(a, conf, p, r)), - GpuOverrides.scan[OrcScan]( - "ORC parsing", - (a, conf, p, r) => new RapidsOrcScanMeta(a, conf, p, r)) - ).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap - - override def getPartitionFileNames( - partitions: Seq[PartitionDirectory]): Seq[String] = { - val files = partitions.flatMap(partition => partition.files) - files.map(_.getPath.getName) - } - - override def getPartitionFileStatusSize(partitions: Seq[PartitionDirectory]): Long = { - partitions.map(_.files.map(_.getLen).sum).sum - } - - override def 
getPartitionedFiles( - partitions: Array[PartitionDirectory]): Array[PartitionedFile] = { - partitions.flatMap { p => - p.files.map { f => - PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values) - } - } - } - - override def getPartitionSplitFiles( - partitions: Array[PartitionDirectory], - maxSplitBytes: Long, - relation: HadoopFsRelation): Array[PartitionedFile] = { - partitions.flatMap { partition => - partition.files.flatMap { file => - // getPath() is very expensive so we only want to call it once in this block: - val filePath = file.getPath - val isSplitable = relation.fileFormat.isSplitable( - relation.sparkSession, relation.options, filePath) - PartitionedFileUtil.splitFiles( - sparkSession = relation.sparkSession, - file = file, - filePath = filePath, - isSplitable = isSplitable, - maxSplitBytes = maxSplitBytes, - partitionValues = partition.values - ) - } - } - } - - override def getFileScanRDD( - sparkSession: SparkSession, - readFunction: PartitionedFile => Iterator[InternalRow], - filePartitions: Seq[FilePartition], - readDataSchema: StructType, - metadataColumns: Seq[AttributeReference]): RDD[InternalRow] = { - new FileScanRDD(sparkSession, readFunction, filePartitions) - } - - override def createFilePartition(index: Int, files: Array[PartitionedFile]): FilePartition = { - FilePartition(index, files) - } - - override def copyBatchScanExec( - batchScanExec: GpuBatchScanExec, - queryUsesInputFile: Boolean): GpuBatchScanExec = { - val scanCopy = batchScanExec.scan match { - case parquetScan: GpuParquetScan => - parquetScan.copy(queryUsesInputFile = queryUsesInputFile) - case orcScan: GpuOrcScan => - orcScan.copy(queryUsesInputFile = queryUsesInputFile) - case _ => throw new RuntimeException("Wrong format") // never reach here - } - batchScanExec.copy(scan = scanCopy) - } - - override def copyFileSourceScanExec( - scanExec: GpuFileSourceScanExec, - queryUsesInputFile: Boolean): GpuFileSourceScanExec = { - scanExec.copy(queryUsesInputFile = queryUsesInputFile)(scanExec.rapidsConf) - } - - override def checkColumnNameDuplication( - schema: StructType, - colType: String, - resolver: Resolver): Unit = { - GpuSchemaUtils.checkColumnNameDuplication(schema, colType, resolver) - } - - override def alias(child: Expression, name: String)( - exprId: ExprId, - qualifier: Seq[String], - explicitMetadata: Option[Metadata]): Alias = { - Alias(child, name)(exprId, qualifier, explicitMetadata) - } - - override def getArrowValidityBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getValidityBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def getArrowOffsetsBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { - val arrowBuf = vec.getOffsetBuffer - (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) - } - - override def replaceWithAlluxioPathIfNeeded( - conf: RapidsConf, - relation: HadoopFsRelation, - partitionFilters: Seq[Expression], - dataFilters: Seq[Expression]): FileIndex = { - - val alluxioPathsReplace: Option[Seq[String]] = conf.getAlluxioPathsToReplace - - if (alluxioPathsReplace.isDefined) { - // alluxioPathsReplace: Seq("key->value", "key1->value1") - // turn the rules to the Map with eg - // { s3:/foo -> alluxio://0.1.2.3:19998/foo, - // gs:/bar -> alluxio://0.1.2.3:19998/bar, - // /baz -> alluxio://0.1.2.3:19998/baz } - val replaceMapOption = alluxioPathsReplace.map(rules => { - rules.map(rule => { - val split = rule.split("->") - if (split.size == 2) { - split(0).trim -> split(1).trim - } else { - throw new 
IllegalArgumentException(s"Invalid setting for " + - s"${RapidsConf.ALLUXIO_PATHS_REPLACE.key}") - } - }).toMap - }) - - replaceMapOption.map(replaceMap => { - - def isDynamicPruningFilter(e: Expression): Boolean = - e.find(_.isInstanceOf[PlanExpression[_]]).isDefined - - val partitionDirs = relation.location.listFiles( - partitionFilters.filterNot(isDynamicPruningFilter), dataFilters) - - // replacement func to check if the file path is prefixed with the string user configured - // if yes, replace it - val replaceFunc = (f: Path) => { - val pathStr = f.toString - val matchedSet = replaceMap.keySet.filter(reg => pathStr.startsWith(reg)) - if (matchedSet.size > 1) { - // never reach here since replaceMap is a Map - throw new IllegalArgumentException(s"Found ${matchedSet.size} same replacing rules " + - s"from ${RapidsConf.ALLUXIO_PATHS_REPLACE.key} which requires only 1 rule for each " + - s"file path") - } else if (matchedSet.size == 1) { - new Path(pathStr.replaceFirst(matchedSet.head, replaceMap(matchedSet.head))) - } else { - f - } - } - - // replace all of input files - val inputFiles: Seq[Path] = partitionDirs.flatMap(partitionDir => { - replacePartitionDirectoryFiles(partitionDir, replaceFunc) - }) - - // replace all of rootPaths which are already unique - val rootPaths = relation.location.rootPaths.map(replaceFunc) - - val parameters: Map[String, String] = relation.options - - // infer PartitionSpec - val partitionSpec = GpuPartitioningUtils.inferPartitioning( - relation.sparkSession, - rootPaths, - inputFiles, - parameters, - Option(relation.dataSchema), - replaceFunc) - - // generate a new InMemoryFileIndex holding paths with alluxio schema - new InMemoryFileIndex( - relation.sparkSession, - inputFiles, - parameters, - Option(relation.dataSchema), - userSpecifiedPartitionSpec = Some(partitionSpec)) - }).getOrElse(relation.location) - - } else { - relation.location - } - } - - override def replacePartitionDirectoryFiles(partitionDir: PartitionDirectory, - replaceFunc: Path => Path): Seq[Path] = { - partitionDir.files.map(f => replaceFunc(f.getPath)) - } - - override def hasAliasQuoteFix: Boolean = false - - override def registerKryoClasses(kryo: Kryo): Unit = { - kryo.register(classOf[SerializeConcatHostBuffersDeserializeBatch], - new KryoJavaSerializer()) - kryo.register(classOf[SerializeBatchDeserializeHostBuffer], - new KryoJavaSerializer()) - } - - override def reusedExchangeExecPfn: PartialFunction[SparkPlan, ReusedExchangeExec] = { - case ShuffleQueryStageExec(_, e: ReusedExchangeExec) => e - case BroadcastQueryStageExec(_, e: ReusedExchangeExec) => e - } - - override def createTable(table: CatalogTable, - sessionCatalog: SessionCatalog, - tableLocation: Option[URI], - result: BaseRelation) = { - val newTable = table.copy( - storage = table.storage.copy(locationUri = tableLocation), - // We will use the schema of resolved.relation as the schema of the table (instead of - // the schema of df). It is important since the nullability may be changed by the relation - // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). - schema = result.schema) - // Table location is already validated. No need to check it again during table creation. 
- sessionCatalog.createTable(newTable, ignoreIfExists = false, validateLocation = false) - } -} diff --git a/sql-plugin/src/main/301until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala b/sql-plugin/src/main/301until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala deleted file mode 100644 index 9a613fbb676..00000000000 --- a/sql-plugin/src/main/301until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.rapids.shims.api.python - -import java.io.DataInputStream -import java.net.Socket -import java.util.concurrent.atomic.AtomicBoolean - -import com.nvidia.spark.rapids.Arm - -import org.apache.spark.{SparkEnv, TaskContext} -import org.apache.spark.api.python.BasePythonRunner -import org.apache.spark.sql.vectorized.ColumnarBatch - -// pid is not a constructor argument in 30x and 31x -abstract class ShimBasePythonRunner[IN, OUT]( - funcs : scala.Seq[org.apache.spark.api.python.ChainedPythonFunctions], - evalType : scala.Int, argOffsets : scala.Array[scala.Array[scala.Int]] -) extends BasePythonRunner[ColumnarBatch, ColumnarBatch](funcs, evalType, argOffsets) - with Arm { - protected abstract class ShimReaderIterator( - stream: DataInputStream, - writerThread: WriterThread, - startTime: Long, - env: SparkEnv, - worker: Socket, - pid: Option[Int], - releasedOrClosed: AtomicBoolean, - context: TaskContext - ) extends ReaderIterator(stream, writerThread, startTime, env, worker, releasedOrClosed, context) -} diff --git a/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala deleted file mode 100644 index e3ee4ccd6be..00000000000 --- a/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims - -object RapidsErrorUtils { - def invalidArrayIndexError(index: Int, numElements: Int, - isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { - // Follow the Spark string format before 3.3.0 - new ArrayIndexOutOfBoundsException(s"Invalid index: $index, numElements: $numElements") - } - - def mapKeyNotExistError(key: String, isElementAtF: Boolean = false): NoSuchElementException = { - // Follow the Spark string format before 3.3.0 - new NoSuchElementException(s"Key $key does not exist.") - } - - def sqlArrayIndexNotStartAtOneError(): ArrayIndexOutOfBoundsException = { - new ArrayIndexOutOfBoundsException("SQL array indices start at 1") - } -} diff --git a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala b/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala deleted file mode 100644 index b8736640a9f..00000000000 --- a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.csv - -object GpuCsvUtils { - def dateFormatInRead(options: CSVOptions): String = options.dateFormat -} diff --git a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala b/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala deleted file mode 100644 index b22da8a4f71..00000000000 --- a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.json - -object GpuJsonUtils { - def dateFormatInRead(options: JSONOptions): String = options.dateFormat -} diff --git a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala b/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala deleted file mode 100644 index 40fa54478af..00000000000 --- a/sql-plugin/src/main/301until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.json.rapids.shims - -import org.apache.spark.sql.catalyst.csv.CSVOptions -import org.apache.spark.sql.catalyst.json.JSONOptions - -object FileOptionsShims { - - def timestampFormatInRead(fileOptions: Serializable): Option[String] = { - fileOptions match { - case csvOpts: CSVOptions => Option(csvOpts.timestampFormat) - case jsonOpts: JSONOptions => Option(jsonOpts.timestampFormat) - case _ => throw new RuntimeException("Wrong file options.") - } - } - -} diff --git a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/RapidsShuffleInternalManager.scala b/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/RapidsShuffleInternalManager.scala deleted file mode 100644 index 0ff70ba825b..00000000000 --- a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/RapidsShuffleInternalManager.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.shims.spark302 - -import org.apache.spark.{SparkConf, TaskContext} -import org.apache.spark.shuffle._ -import org.apache.spark.sql.rapids.{ProxyRapidsShuffleInternalManagerBase, RapidsShuffleInternalManagerBase} - -/** - * A shuffle manager optimized for the RAPIDS Plugin For Apache Spark. - * @note This is an internal class to obtain access to the private - * `ShuffleManager` and `SortShuffleManager` classes. 
- */ -class RapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends RapidsShuffleInternalManagerBase(conf, isDriver) { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, startMapIndex, endMapIndex, startPartition, endPartition, context, - metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, 0, Int.MaxValue, startPartition, endPartition, context, metrics) - } - -} - -class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends ProxyRapidsShuffleInternalManagerBase(conf, isDriver) with ShuffleManager { - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): org.apache.spark.shuffle.ShuffleReader[K,C] = { - self.getReader(handle, startPartition, endPartition, context, metrics) - } - - def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): ShuffleReader[K,C] = { - self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, - context, metrics) - } -} diff --git a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/SparkShimServiceProvider.scala b/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/SparkShimServiceProvider.scala deleted file mode 100644 index 34c490395f8..00000000000 --- a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/shims/spark302/SparkShimServiceProvider.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims.spark302 - -import com.nvidia.spark.rapids.SparkShimVersion - -object SparkShimServiceProvider { - val VERSION = SparkShimVersion(3, 0, 2) - val VERSIONNAMES = Seq(s"$VERSION") -} -class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider { - - override def getShimVersion: SparkShimVersion = SparkShimServiceProvider.VERSION - - def matchesVersion(version: String): Boolean = { - SparkShimServiceProvider.VERSIONNAMES.contains(version) - } - -} diff --git a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/spark302/RapidsShuffleManager.scala b/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/spark302/RapidsShuffleManager.scala deleted file mode 100644 index 0153eac46ee..00000000000 --- a/sql-plugin/src/main/302/scala/com/nvidia/spark/rapids/spark302/RapidsShuffleManager.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.spark302 - -import org.apache.spark.SparkConf -import org.apache.spark.sql.rapids.shims.spark302.ProxyRapidsShuffleInternalManager - -/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. */ -sealed class RapidsShuffleManager( - conf: SparkConf, - isDriver: Boolean) extends ProxyRapidsShuffleInternalManager(conf, isDriver) { -} diff --git a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/RapidsShuffleInternalManager.scala b/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/RapidsShuffleInternalManager.scala deleted file mode 100644 index be3144b70f1..00000000000 --- a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/RapidsShuffleInternalManager.scala +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.shims.spark303 - -import org.apache.spark.{SparkConf, TaskContext} -import org.apache.spark.shuffle._ -import org.apache.spark.sql.rapids.{ProxyRapidsShuffleInternalManagerBase, RapidsShuffleInternalManagerBase} - -/** - * A shuffle manager optimized for the RAPIDS Plugin For Apache Spark. - * @note This is an internal class to obtain access to the private - * `ShuffleManager` and `SortShuffleManager` classes. 
- */ -class RapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends RapidsShuffleInternalManagerBase(conf, isDriver) { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, startMapIndex, endMapIndex, startPartition, endPartition, context, - metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, 0, Int.MaxValue, startPartition, endPartition, context, metrics) - } - -} - - -class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends ProxyRapidsShuffleInternalManagerBase(conf, isDriver) with ShuffleManager { - - override def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): org.apache.spark.shuffle.ShuffleReader[K,C] = { - self.getReader(handle, startPartition, endPartition, context, metrics) - } - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): ShuffleReader[K,C] = { - self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, - context, metrics) - } -} \ No newline at end of file diff --git a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/SparkShimServiceProvider.scala b/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/SparkShimServiceProvider.scala deleted file mode 100644 index 777cf7539d2..00000000000 --- a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/shims/spark303/SparkShimServiceProvider.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims.spark303 - -import com.nvidia.spark.rapids.SparkShimVersion - -object SparkShimServiceProvider { - val VERSION = SparkShimVersion(3, 0, 3) - val VERSIONNAMES = Seq(s"$VERSION") -} -class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider { - - override def getShimVersion: SparkShimVersion = SparkShimServiceProvider.VERSION - - def matchesVersion(version: String): Boolean = { - SparkShimServiceProvider.VERSIONNAMES.contains(version) - } -} diff --git a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/spark303/RapidsShuffleManager.scala b/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/spark303/RapidsShuffleManager.scala deleted file mode 100644 index f3c277cb61a..00000000000 --- a/sql-plugin/src/main/303/scala/com/nvidia/spark/rapids/spark303/RapidsShuffleManager.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.spark303 - -import org.apache.spark.SparkConf -import org.apache.spark.sql.rapids.shims.spark303.ProxyRapidsShuffleInternalManager - -/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. */ -sealed class RapidsShuffleManager( - conf: SparkConf, - isDriver: Boolean) extends ProxyRapidsShuffleInternalManager(conf, isDriver) { -} diff --git a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/SparkShims.scala b/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/SparkShims.scala deleted file mode 100644 index ca7002a6a0e..00000000000 --- a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/SparkShims.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims - -import com.nvidia.spark.rapids._ -import org.apache.parquet.schema.MessageType - -import org.apache.spark.sql.execution.datasources.DataSourceUtils -import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters - -object SparkShimImpl extends Spark30XShims with Spark30Xuntil33XShims { - - override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion - - override def getParquetFilters( - schema: MessageType, - pushDownDate: Boolean, - pushDownTimestamp: Boolean, - pushDownDecimal: Boolean, - pushDownStartWith: Boolean, - pushDownInFilterThreshold: Int, - caseSensitive: Boolean, - lookupFileMeta: String => String, - dateTimeRebaseModeFromConf: String): ParquetFilters = { - val datetimeRebaseMode = DataSourceUtils - .datetimeRebaseMode(lookupFileMeta, dateTimeRebaseModeFromConf) - new ParquetFilters(schema, pushDownDate, pushDownTimestamp, pushDownDecimal, pushDownStartWith, - pushDownInFilterThreshold, caseSensitive, datetimeRebaseMode) - } -} diff --git a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/RapidsShuffleInternalManager.scala b/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/RapidsShuffleInternalManager.scala deleted file mode 100644 index 2fcbc75ab3a..00000000000 --- a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/RapidsShuffleInternalManager.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.rapids.shims.spark304 - -import org.apache.spark.{SparkConf, TaskContext} -import org.apache.spark.shuffle._ -import org.apache.spark.sql.rapids.{ProxyRapidsShuffleInternalManagerBase, RapidsShuffleInternalManagerBase} - -/** - * A shuffle manager optimized for the RAPIDS Plugin For Apache Spark. - * @note This is an internal class to obtain access to the private - * `ShuffleManager` and `SortShuffleManager` classes. 
- */ -class RapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends RapidsShuffleInternalManagerBase(conf, isDriver) { - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, startMapIndex, endMapIndex, startPartition, endPartition, context, - metrics) - } - - def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { - getReaderInternal(handle, 0, Int.MaxValue, startPartition, endPartition, context, metrics) - } - -} - -class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean) - extends ProxyRapidsShuffleInternalManagerBase(conf, isDriver) with ShuffleManager { - - override def getReader[K, C]( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): org.apache.spark.shuffle.ShuffleReader[K,C] = { - self.getReader(handle, startPartition, endPartition, context, metrics) - } - - override def getReaderForRange[K, C]( - handle: ShuffleHandle, - startMapIndex: Int, - endMapIndex: Int, - startPartition: Int, - endPartition: Int, - context: TaskContext, - metrics: ShuffleReadMetricsReporter - ): ShuffleReader[K,C] = { - self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, - context, metrics) - } -} \ No newline at end of file diff --git a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/SparkShimServiceProvider.scala b/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/SparkShimServiceProvider.scala deleted file mode 100644 index 6a9ff15da3e..00000000000 --- a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/shims/spark304/SparkShimServiceProvider.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.nvidia.spark.rapids.shims.spark304 - -import com.nvidia.spark.rapids.SparkShimVersion - -object SparkShimServiceProvider { - val VERSION = SparkShimVersion(3, 0, 4) - val VERSIONNAMES = Seq(s"$VERSION", s"$VERSION-SNAPSHOT") -} -class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider { - - override def getShimVersion: SparkShimVersion = SparkShimServiceProvider.VERSION - - def matchesVersion(version: String): Boolean = { - SparkShimServiceProvider.VERSIONNAMES.contains(version) - } -} diff --git a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/spark304/RapidsShuffleManager.scala b/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/spark304/RapidsShuffleManager.scala deleted file mode 100644 index 4fbb4684bf5..00000000000 --- a/sql-plugin/src/main/304/scala/com/nvidia/spark/rapids/spark304/RapidsShuffleManager.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.spark304 - -import org.apache.spark.SparkConf -import org.apache.spark.sql.rapids.shims.spark304.ProxyRapidsShuffleInternalManager - -/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. 
*/ -sealed class RapidsShuffleManager( - conf: SparkConf, - isDriver: Boolean) extends ProxyRapidsShuffleInternalManager(conf, isDriver) { -} diff --git a/sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala b/sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala similarity index 100% rename from sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala rename to sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/AQEUtils.scala diff --git a/sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala b/sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala similarity index 100% rename from sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala rename to sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/AggregationTagging.scala diff --git a/sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala b/sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala similarity index 100% rename from sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala rename to sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala diff --git a/sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala b/sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala similarity index 100% rename from sql-plugin/src/main/301+-nondb/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala rename to sql-plugin/src/main/311+-nondb/scala/com/nvidia/spark/rapids/shims/ShimBroadcastExchangeLike.scala diff --git a/sql-plugin/src/main/301+-nondb/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala b/sql-plugin/src/main/311+-nondb/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala similarity index 100% rename from sql-plugin/src/main/301+-nondb/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala rename to sql-plugin/src/main/311+-nondb/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuFlatMapGroupsInPandasExec.scala diff --git a/sql-plugin/src/main/311-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/311-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index df43190dc7f..4ee918e5a63 100644 --- a/sql-plugin/src/main/311-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/311-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -21,7 +21,7 @@ import org.apache.parquet.schema.MessageType import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters -object SparkShimImpl extends Spark31XShims with Spark30Xuntil33XShims { +object SparkShimImpl extends Spark31XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion diff --git a/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala index d8e273b45bb..36bc43795d8 100644 --- a/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala +++ b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala @@ -17,7 +17,7 @@ package 
com.nvidia.spark.rapids.shims import org.apache.orc.Reader -object OrcShims extends OrcShims301until320Base { +object OrcShims extends OrcShims311until320Base { // ORC Reader of the 311cdh Spark has no close method. // The resource is closed internally. diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/AvoidAdaptiveTransitionToRow.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/AvoidAdaptiveTransitionToRow.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/AvoidAdaptiveTransitionToRow.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/AvoidAdaptiveTransitionToRow.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/HashUtils.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/HashUtils.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/HashUtils.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/HashUtils.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims301until320Base.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims311until320Base.scala similarity index 99% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims301until320Base.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims311until320Base.scala index 1c6decd35d5..ac1ef0aba2a 100644 --- a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims301until320Base.scala +++ b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcShims311until320Base.scala @@ -24,7 +24,7 @@ import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcFile, O import org.apache.orc.impl.{DataReaderProperties, OutStream, SchemaEvolution} import org.apache.orc.impl.RecordReaderImpl.SargApplier -trait OrcShims301until320Base { +trait OrcShims311until320Base { // read data to buffer def readFileData(dataReader: DataReader, inputDataRanges: DiskRangeList): DiskRangeList = { diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/RapidsOrcScanMeta.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/RapidsOrcScanMeta.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/RapidsOrcScanMeta.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/RapidsOrcScanMeta.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/RapidsParquetScanMeta.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/RapidsParquetScanMeta.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/RapidsParquetScanMeta.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/RapidsParquetScanMeta.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/ShimAQEShuffleReadExec.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/ShimAQEShuffleReadExec.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/ShimAQEShuffleReadExec.scala rename to 
sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/ShimAQEShuffleReadExec.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/ShimDataSourceRDD.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/ShimDataSourceRDD.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/ShimDataSourceRDD.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/ShimDataSourceRDD.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala similarity index 98% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala index 1b4c6d20166..f5b610cbee1 100644 --- a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala +++ b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/TypeSigUtil.scala @@ -20,7 +20,7 @@ import com.nvidia.spark.rapids.{TypeEnum, TypeSig, TypeSigUtilBase} import org.apache.spark.sql.types.DataType -/** TypeSig Support for [3.0.1, 3.2.0) */ +/** TypeSig Support for [3.1.1, 3.2.0) */ object TypeSigUtil extends TypeSigUtilBase { /** diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/YearParseUtil.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/YearParseUtil.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/YearParseUtil.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/YearParseUtil.scala diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/gpuWindows.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/gpuWindows.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/gpuWindows.scala rename to sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/gpuWindows.scala diff --git a/sql-plugin/src/main/301until320-all/scala/org/apache/spark/rapids/shims/GpuShuffleBlockResolver.scala b/sql-plugin/src/main/311until320-all/scala/org/apache/spark/rapids/shims/GpuShuffleBlockResolver.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/org/apache/spark/rapids/shims/GpuShuffleBlockResolver.scala rename to sql-plugin/src/main/311until320-all/scala/org/apache/spark/rapids/shims/GpuShuffleBlockResolver.scala diff --git a/sql-plugin/src/main/301until320-all/scala/org/apache/spark/rapids/shims/storage/ShimDiskBlockManager.scala b/sql-plugin/src/main/311until320-all/scala/org/apache/spark/rapids/shims/storage/ShimDiskBlockManager.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/org/apache/spark/rapids/shims/storage/ShimDiskBlockManager.scala rename to sql-plugin/src/main/311until320-all/scala/org/apache/spark/rapids/shims/storage/ShimDiskBlockManager.scala diff --git a/sql-plugin/src/main/301until320-all/scala/org/apache/spark/sql/execution/ShimTrampolineUtil.scala b/sql-plugin/src/main/311until320-all/scala/org/apache/spark/sql/rapids/execution/ShimTrampolineUtil.scala similarity index 100% rename from 
sql-plugin/src/main/301until320-all/scala/org/apache/spark/sql/execution/ShimTrampolineUtil.scala rename to sql-plugin/src/main/311until320-all/scala/org/apache/spark/sql/rapids/execution/ShimTrampolineUtil.scala diff --git a/sql-plugin/src/main/301until320-all/scala/org/apache/spark/sql/rapids/shims/datetimeExpressions.scala b/sql-plugin/src/main/311until320-all/scala/org/apache/spark/sql/rapids/shims/datetimeExpressions.scala similarity index 100% rename from sql-plugin/src/main/301until320-all/scala/org/apache/spark/sql/rapids/shims/datetimeExpressions.scala rename to sql-plugin/src/main/311until320-all/scala/org/apache/spark/sql/rapids/shims/datetimeExpressions.scala diff --git a/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala b/sql-plugin/src/main/311until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala similarity index 95% rename from sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala rename to sql-plugin/src/main/311until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala index 15d99c0464e..a6a2bec9ace 100644 --- a/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala +++ b/sql-plugin/src/main/311until320-noncdh/scala/com/nvidia/spark/rapids/shims/OrcShims.scala @@ -18,7 +18,7 @@ package com.nvidia.spark.rapids.shims import com.nvidia.spark.rapids.RapidsPluginImplicits._ import org.apache.orc.Reader -object OrcShims extends OrcShims301until320Base { +object OrcShims extends OrcShims311until320Base { // the ORC Reader in non CDH Spark is closeable def withReader[T <: AutoCloseable, V](r: T)(block: T => V): V = { diff --git a/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala b/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala index 9d12bb19750..9997741890b 100644 --- a/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala +++ b/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala @@ -16,42 +16,377 @@ package com.nvidia.spark.rapids.shims +import java.net.URI import java.nio.ByteBuffer +import scala.collection.mutable.ListBuffer + +import com.esotericsoftware.kryo.Kryo +import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} import com.nvidia.spark.InMemoryTableScanMeta import com.nvidia.spark.rapids._ import org.apache.arrow.memory.ReferenceManager import org.apache.arrow.vector.ValueVector +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rapids.shims.GpuShuffleExchangeExec +import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Average import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning} import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils} +import 
org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, ShuffleQueryStageExec} +import org.apache.spark.sql.execution.adaptive._ import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand} import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec} -import org.apache.spark.sql.execution.joins.EmptyHashedRelation +import org.apache.spark.sql.execution.datasources.rapids.GpuPartitioningUtils +import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ENSURE_REQUIREMENTS, ReusedExchangeExec, ShuffleExchangeExec} +import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.python._ +import org.apache.spark.sql.execution.window.WindowExecBase import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.rapids._ -import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase +import org.apache.spark.sql.rapids.execution.{GpuCustomShuffleReaderExec, GpuShuffleExchangeExecBase, SerializeBatchDeserializeHostBuffer, SerializeConcatHostBuffersDeserializeBatch} import org.apache.spark.sql.rapids.execution.python._ import org.apache.spark.sql.rapids.execution.python.shims._ -import org.apache.spark.sql.rapids.shims.{GpuColumnarToRowTransitionExec, HadoopFSUtilsShim} +import org.apache.spark.sql.rapids.shims.{GpuColumnarToRowTransitionExec, GpuSchemaUtils, HadoopFSUtilsShim} +import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.apache.spark.storage.{BlockId, BlockManagerId} // 31x nondb shims, used by 311cdh and 31x -abstract class Spark31XShims extends Spark301until320Shims with Logging { +abstract class Spark31XShims extends SparkShims with Spark31Xuntil33XShims with Logging { + override def parquetRebaseReadKey: String = + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key + override def parquetRebaseWriteKey: String = + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key + override def avroRebaseReadKey: String = + SQLConf.LEGACY_AVRO_REBASE_MODE_IN_READ.key + override def avroRebaseWriteKey: String = + SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key + override def parquetRebaseRead(conf: SQLConf): String = + conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ) + override def parquetRebaseWrite(conf: SQLConf): String = + conf.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE) + + override def sessionFromPlan(plan: SparkPlan): SparkSession = { + plan.sqlContext.sparkSession + } + + override def filesFromFileIndex(fileIndex: PartitioningAwareFileIndex): Seq[FileStatus] = { + fileIndex.allFiles() + } + + def broadcastModeTransform(mode: BroadcastMode, rows: Array[InternalRow]): Any = + mode.transform(rows) + + override def newBroadcastQueryStageExec( + old: BroadcastQueryStageExec, + newPlan: SparkPlan): BroadcastQueryStageExec = BroadcastQueryStageExec(old.id, newPlan) + + override def getDateFormatter(): DateFormatter = { + DateFormatter(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) + } + + override def isExchangeOp(plan: SparkPlanMeta[_]): Boolean = { + // if the child query stage already executed on GPU then we need to keep the + // next operator on GPU 
in these cases + SQLConf.get.adaptiveExecutionEnabled && (plan.wrapped match { + case _: CustomShuffleReaderExec + | _: ShuffledHashJoinExec + | _: BroadcastHashJoinExec + | _: BroadcastExchangeExec + | _: BroadcastNestedLoopJoinExec => true + case _ => false + }) + } + + override def isAqePlan(p: SparkPlan): Boolean = p match { + case _: AdaptiveSparkPlanExec | + _: QueryStageExec | + _: CustomShuffleReaderExec => true + case _ => false + } + + override def isCustomReaderExec(x: SparkPlan): Boolean = x match { + case _: GpuCustomShuffleReaderExec | _: CustomShuffleReaderExec => true + case _ => false + } + + override def aqeShuffleReaderExec: ExecRule[_ <: SparkPlan] = + GpuOverrides.exec[CustomShuffleReaderExec]( + "A wrapper of shuffle query stage", + ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_128 + TypeSig.ARRAY + + TypeSig.STRUCT + TypeSig.MAP).nested(), TypeSig.all), + (exec, conf, p, r) => new GpuCustomShuffleReaderMeta(exec, conf, p, r)) + + override def findOperators(plan: SparkPlan, predicate: SparkPlan => Boolean): Seq[SparkPlan] = { + def recurse( + plan: SparkPlan, + predicate: SparkPlan => Boolean, + accum: ListBuffer[SparkPlan]): Seq[SparkPlan] = { + if (predicate(plan)) { + accum += plan + } + plan match { + case a: AdaptiveSparkPlanExec => recurse(a.executedPlan, predicate, accum) + case qs: BroadcastQueryStageExec => recurse(qs.broadcast, predicate, accum) + case qs: ShuffleQueryStageExec => recurse(qs.shuffle, predicate, accum) + case other => other.children.flatMap(p => recurse(p, predicate, accum)).headOption + } + accum + } + recurse(plan, predicate, new ListBuffer[SparkPlan]()) + } + + override def skipAssertIsOnTheGpu(plan: SparkPlan): Boolean = false + + override def shouldFailDivOverflow: Boolean = false + + override def leafNodeDefaultParallelism(ss: SparkSession): Int = { + ss.sparkContext.defaultParallelism + } + + override def v1RepairTableCommand(tableName: TableIdentifier): RunnableCommand = + AlterTableRecoverPartitionsCommand(tableName) + + override def isWindowFunctionExec(plan: SparkPlan): Boolean = plan.isInstanceOf[WindowExecBase] + + override def getScans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]] = Seq( + GpuOverrides.scan[ParquetScan]( + "Parquet parsing", + (a, conf, p, r) => new RapidsParquetScanMeta(a, conf, p, r)), + GpuOverrides.scan[OrcScan]( + "ORC parsing", + (a, conf, p, r) => new RapidsOrcScanMeta(a, conf, p, r)) + ).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap + + override def getPartitionFileNames( + partitions: Seq[PartitionDirectory]): Seq[String] = { + val files = partitions.flatMap(partition => partition.files) + files.map(_.getPath.getName) + } + + override def getPartitionFileStatusSize(partitions: Seq[PartitionDirectory]): Long = { + partitions.map(_.files.map(_.getLen).sum).sum + } + + override def getPartitionedFiles( + partitions: Array[PartitionDirectory]): Array[PartitionedFile] = { + partitions.flatMap { p => + p.files.map { f => + PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values) + } + } + } + + override def getPartitionSplitFiles( + partitions: Array[PartitionDirectory], + maxSplitBytes: Long, + relation: HadoopFsRelation): Array[PartitionedFile] = { + partitions.flatMap { partition => + partition.files.flatMap { file => + // getPath() is very expensive so we only want to call it once in this block: + val filePath = file.getPath + val isSplitable = relation.fileFormat.isSplitable( + relation.sparkSession, relation.options, filePath) + PartitionedFileUtil.splitFiles( + 
sparkSession = relation.sparkSession, + file = file, + filePath = filePath, + isSplitable = isSplitable, + maxSplitBytes = maxSplitBytes, + partitionValues = partition.values + ) + } + } + } + + override def getFileScanRDD( + sparkSession: SparkSession, + readFunction: PartitionedFile => Iterator[InternalRow], + filePartitions: Seq[FilePartition], + readDataSchema: StructType, + metadataColumns: Seq[AttributeReference]): RDD[InternalRow] = { + new FileScanRDD(sparkSession, readFunction, filePartitions) + } + + override def createFilePartition(index: Int, files: Array[PartitionedFile]): FilePartition = { + FilePartition(index, files) + } + + override def copyBatchScanExec( + batchScanExec: GpuBatchScanExec, + queryUsesInputFile: Boolean): GpuBatchScanExec = { + val scanCopy = batchScanExec.scan match { + case parquetScan: GpuParquetScan => + parquetScan.copy(queryUsesInputFile = queryUsesInputFile) + case orcScan: GpuOrcScan => + orcScan.copy(queryUsesInputFile = queryUsesInputFile) + case _ => throw new RuntimeException("Wrong format") // never reach here + } + batchScanExec.copy(scan = scanCopy) + } + + override def copyFileSourceScanExec( + scanExec: GpuFileSourceScanExec, + queryUsesInputFile: Boolean): GpuFileSourceScanExec = { + scanExec.copy(queryUsesInputFile = queryUsesInputFile)(scanExec.rapidsConf) + } + + override def checkColumnNameDuplication( + schema: StructType, + colType: String, + resolver: Resolver): Unit = { + GpuSchemaUtils.checkColumnNameDuplication(schema, colType, resolver) + } + + override def alias(child: Expression, name: String)( + exprId: ExprId, + qualifier: Seq[String], + explicitMetadata: Option[Metadata]): Alias = { + Alias(child, name)(exprId, qualifier, explicitMetadata) + } + + override def getArrowValidityBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { + val arrowBuf = vec.getValidityBuffer + (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) + } + + override def getArrowOffsetsBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = { + val arrowBuf = vec.getOffsetBuffer + (arrowBuf.nioBuffer(), arrowBuf.getReferenceManager) + } + + override def replaceWithAlluxioPathIfNeeded( + conf: RapidsConf, + relation: HadoopFsRelation, + partitionFilters: Seq[Expression], + dataFilters: Seq[Expression]): FileIndex = { + + val alluxioPathsReplace: Option[Seq[String]] = conf.getAlluxioPathsToReplace + + if (alluxioPathsReplace.isDefined) { + // alluxioPathsReplace: Seq("key->value", "key1->value1") + // turn the rules to the Map with eg + // { s3:/foo -> alluxio://0.1.2.3:19998/foo, + // gs:/bar -> alluxio://0.1.2.3:19998/bar, + // /baz -> alluxio://0.1.2.3:19998/baz } + val replaceMapOption = alluxioPathsReplace.map(rules => { + rules.map(rule => { + val split = rule.split("->") + if (split.size == 2) { + split(0).trim -> split(1).trim + } else { + throw new IllegalArgumentException(s"Invalid setting for " + + s"${RapidsConf.ALLUXIO_PATHS_REPLACE.key}") + } + }).toMap + }) + + replaceMapOption.map(replaceMap => { + + def isDynamicPruningFilter(e: Expression): Boolean = + e.find(_.isInstanceOf[PlanExpression[_]]).isDefined + + val partitionDirs = relation.location.listFiles( + partitionFilters.filterNot(isDynamicPruningFilter), dataFilters) + + // replacement func to check if the file path is prefixed with the string user configured + // if yes, replace it + val replaceFunc = (f: Path) => { + val pathStr = f.toString + val matchedSet = replaceMap.keySet.filter(reg => pathStr.startsWith(reg)) + if (matchedSet.size > 1) { + // never reach here since 
replaceMap is a Map + throw new IllegalArgumentException(s"Found ${matchedSet.size} same replacing rules " + + s"from ${RapidsConf.ALLUXIO_PATHS_REPLACE.key} which requires only 1 rule for each " + + s"file path") + } else if (matchedSet.size == 1) { + new Path(pathStr.replaceFirst(matchedSet.head, replaceMap(matchedSet.head))) + } else { + f + } + } + + // replace all of input files + val inputFiles: Seq[Path] = partitionDirs.flatMap(partitionDir => { + replacePartitionDirectoryFiles(partitionDir, replaceFunc) + }) + + // replace all of rootPaths which are already unique + val rootPaths = relation.location.rootPaths.map(replaceFunc) + + val parameters: Map[String, String] = relation.options + + // infer PartitionSpec + val partitionSpec = GpuPartitioningUtils.inferPartitioning( + relation.sparkSession, + rootPaths, + inputFiles, + parameters, + Option(relation.dataSchema), + replaceFunc) + + // generate a new InMemoryFileIndex holding paths with alluxio schema + new InMemoryFileIndex( + relation.sparkSession, + inputFiles, + parameters, + Option(relation.dataSchema), + userSpecifiedPartitionSpec = Some(partitionSpec)) + }).getOrElse(relation.location) + + } else { + relation.location + } + } + + override def replacePartitionDirectoryFiles(partitionDir: PartitionDirectory, + replaceFunc: Path => Path): Seq[Path] = { + partitionDir.files.map(f => replaceFunc(f.getPath)) + } + + override def hasAliasQuoteFix: Boolean = false + + override def registerKryoClasses(kryo: Kryo): Unit = { + kryo.register(classOf[SerializeConcatHostBuffersDeserializeBatch], + new KryoJavaSerializer()) + kryo.register(classOf[SerializeBatchDeserializeHostBuffer], + new KryoJavaSerializer()) + } + + override def reusedExchangeExecPfn: PartialFunction[SparkPlan, ReusedExchangeExec] = { + case ShuffleQueryStageExec(_, e: ReusedExchangeExec) => e + case BroadcastQueryStageExec(_, e: ReusedExchangeExec) => e + } + + override def createTable(table: CatalogTable, + sessionCatalog: SessionCatalog, + tableLocation: Option[URI], + result: BaseRelation) = { + val newTable = table.copy( + storage = table.storage.copy(locationUri = tableLocation), + // We will use the schema of resolved.relation as the schema of the table (instead of + // the schema of df). It is important since the nullability may be changed by the relation + // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). + schema = result.schema) + // Table location is already validated. No need to check it again during table creation. 
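As an aside for reviewers, the rule handling inside replaceWithAlluxioPathIfNeeded above can be read in isolation as: split each `src->dst` entry from RapidsConf.ALLUXIO_PATHS_REPLACE into a map, then rewrite any path whose string form starts with a rule's source prefix. A minimal standalone sketch of that logic, with hypothetical object and method names and assuming only `org.apache.hadoop.fs.Path`:

```scala
import org.apache.hadoop.fs.Path

// Hypothetical, condensed form of the rule handling in
// replaceWithAlluxioPathIfNeeded: parse "src->dst" rules into a map, then
// rewrite a path whose string form starts with a rule's source prefix.
object AlluxioRuleSketch {
  def parseRules(rules: Seq[String]): Map[String, String] =
    rules.map { rule =>
      rule.split("->") match {
        case Array(src, dst) => src.trim -> dst.trim
        case _ =>
          throw new IllegalArgumentException(s"Invalid replacement rule: $rule")
      }
    }.toMap

  def replaceIfMatched(path: Path, replaceMap: Map[String, String]): Path = {
    val pathStr = path.toString
    val matched = replaceMap.keySet.filter(pathStr.startsWith)
    if (matched.size > 1) {
      // mirrors the size guard in the code above
      throw new IllegalArgumentException(
        s"Found ${matched.size} rules matching $pathStr, expected at most one")
    } else if (matched.size == 1) {
      new Path(pathStr.replaceFirst(matched.head, replaceMap(matched.head)))
    } else {
      path
    }
  }
}
```

For example, `parseRules(Seq("s3:/foo->alluxio://0.1.2.3:19998/foo"))` yields the `s3:/foo -> alluxio://0.1.2.3:19998/foo` mapping described in the comment block above; note that, like the original, `replaceFirst` treats the source prefix as a regular expression.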
+ sessionCatalog.createTable(newTable, ignoreIfExists = false, validateLocation = false) + } override def int96ParquetRebaseRead(conf: SQLConf): String = conf.getConf(SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_READ) diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala b/sql-plugin/src/main/311until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala similarity index 100% rename from sql-plugin/src/main/301db/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala rename to sql-plugin/src/main/311until320-nondb/scala/org/apache/spark/rapids/shims/api/python/ShimBasePythonRunner.scala diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/AnsiCheckUtil.scala b/sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/AnsiCheckUtil.scala similarity index 100% rename from sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/AnsiCheckUtil.scala rename to sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/AnsiCheckUtil.scala diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/GpuRangePartitioning.scala b/sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/GpuRangePartitioning.scala similarity index 100% rename from sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/GpuRangePartitioning.scala rename to sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/GpuRangePartitioning.scala diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/GpuTypeShims.scala b/sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/GpuTypeShims.scala similarity index 100% rename from sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/GpuTypeShims.scala rename to sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/GpuTypeShims.scala diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/ParquetFieldIdShims.scala b/sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/ParquetFieldIdShims.scala similarity index 100% rename from sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/ParquetFieldIdShims.scala rename to sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/ParquetFieldIdShims.scala diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/RapidsFileSourceMetaUtils.scala b/sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/RapidsFileSourceMetaUtils.scala similarity index 100% rename from sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/RapidsFileSourceMetaUtils.scala rename to sql-plugin/src/main/311until330-all/scala/com/nvidia/spark/rapids/shims/RapidsFileSourceMetaUtils.scala diff --git a/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala b/sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala similarity index 100% rename from sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala rename to sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/GpuHashPartitioning.scala diff --git a/sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala 
b/sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala similarity index 100% rename from sql-plugin/src/main/301db/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala rename to sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/RapidsErrorUtils.scala diff --git a/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark30Xuntil33XShims.scala b/sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark31Xuntil33XShims.scala similarity index 96% rename from sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark30Xuntil33XShims.scala rename to sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark31Xuntil33XShims.scala index 8ab1f4b6719..8133b3cf523 100644 --- a/sql-plugin/src/main/301until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark30Xuntil33XShims.scala +++ b/sql-plugin/src/main/311until330-nondb/scala/com/nvidia/spark/rapids/shims/Spark31Xuntil33XShims.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources.v2._ -trait Spark30Xuntil33XShims extends SparkShims { +trait Spark31Xuntil33XShims extends SparkShims { def neverReplaceShowCurrentNamespaceCommand: ExecRule[_ <: SparkPlan] = { GpuOverrides.neverReplaceExec[ShowCurrentNamespaceExec]("Namespace metadata operation") diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala b/sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala similarity index 100% rename from sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala rename to sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/csv/GpuCsvUtils.scala diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala b/sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala similarity index 100% rename from sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala rename to sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/json/GpuJsonUtils.scala diff --git a/sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala b/sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala similarity index 100% rename from sql-plugin/src/main/301db/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala rename to sql-plugin/src/main/311until330-nondb/scala/org/apache/spark/sql/catalyst/json/rapids/shims/FileOptionsShims.scala diff --git a/sql-plugin/src/main/312-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/312-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index 8ac77702b23..67bdb1c2623 100644 --- a/sql-plugin/src/main/312-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/312-nondb/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -21,7 +21,7 @@ import org.apache.parquet.schema.MessageType import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters -object SparkShimImpl extends Spark31XShims with Spark30Xuntil33XShims { +object SparkShimImpl extends Spark31XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion diff 
--git a/sql-plugin/src/main/313/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/313/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index ce884783e7b..439b2fbbe7c 100644 --- a/sql-plugin/src/main/313/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/313/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -22,7 +22,7 @@ import org.apache.parquet.schema.MessageType import org.apache.spark.sql.execution.datasources.DataSourceUtils import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters -object SparkShimImpl extends Spark31XShims with Spark30Xuntil33XShims { +object SparkShimImpl extends Spark31XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion diff --git a/sql-plugin/src/main/314/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/314/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index a2318f33bfe..f01f7ac5a63 100644 --- a/sql-plugin/src/main/314/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/314/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -22,7 +22,7 @@ import org.apache.parquet.schema.MessageType import org.apache.spark.sql.execution.datasources.DataSourceUtils import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters -object SparkShimImpl extends Spark31XShims with Spark30Xuntil33XShims { +object SparkShimImpl extends Spark31XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion diff --git a/sql-plugin/src/main/320/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/320/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index ebe3e8e91d4..62a3e68d25a 100644 --- a/sql-plugin/src/main/320/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/320/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.types.StructType object SparkShimImpl extends Spark320PlusShims with Spark320PlusNonDBShims - with Spark30Xuntil33XShims { + with Spark31Xuntil33XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion override def getFileScanRDD( diff --git a/sql-plugin/src/main/321/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/321/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index f51e36c78ec..695028b5a80 100644 --- a/sql-plugin/src/main/321/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/321/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType object SparkShimImpl extends Spark321PlusShims with Spark320PlusNonDBShims - with Spark30Xuntil33XShims { + with Spark31Xuntil33XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion override def getFileScanRDD( diff --git a/sql-plugin/src/main/322/scala/com/nvidia/spark/rapids/shims/SparkShims.scala b/sql-plugin/src/main/322/scala/com/nvidia/spark/rapids/shims/SparkShims.scala index f51e36c78ec..695028b5a80 100644 --- a/sql-plugin/src/main/322/scala/com/nvidia/spark/rapids/shims/SparkShims.scala +++ b/sql-plugin/src/main/322/scala/com/nvidia/spark/rapids/shims/SparkShims.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType object SparkShimImpl extends Spark321PlusShims with Spark320PlusNonDBShims - with Spark30Xuntil33XShims { + with Spark31Xuntil33XShims { override def getSparkShimVersion: ShimVersion = ShimLoader.getShimVersion override def 
getFileScanRDD( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index b2d62743418..59f83e001a6 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1182,9 +1182,9 @@ object RapidsConf { .internal() .doc("Overrides the automatic Spark shim detection logic and forces a specific shims " + "provider class to be used. Set to the fully qualified shims provider class to use. " + - "If you are using a custom Spark version such as Spark 3.0.1.0 then this can be used to " + - "specify the shims provider that matches the base Spark version of Spark 3.0.1, i.e.: " + - "com.nvidia.spark.rapids.shims.spark301.SparkShimServiceProvider. If you modified Spark " + + "If you are using a custom Spark version such as Spark 3.1.1.0 then this can be used to " + + "specify the shims provider that matches the base Spark version of Spark 3.1.1, i.e.: " + + "com.nvidia.spark.rapids.shims.spark311.SparkShimServiceProvider. If you modified Spark " + "then there is no guarantee the RAPIDS Accelerator will function properly." + "When tested in a combined jar with other Shims, it's expected that the provided " + "implementation follows the same convention as existing Spark shims. If its class" + diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index 710384ea4c6..66b1662c99c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -44,7 +44,6 @@ import org.apache.spark.util.MutableURLClassLoader spark3xx-common/com/nvidia/spark/rapids/CastExprMeta.class - spark301/org/apache/spark/sql/rapids/GpuUnaryMinus.class spark311/org/apache/spark/sql/rapids/GpuUnaryMinus.class spark320/org/apache/spark/sql/rapids/GpuUnaryMinus.class diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/VersionUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/VersionUtils.scala index 729cdb9b01f..d6b06fe0f2b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/VersionUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/VersionUtils.scala @@ -20,10 +20,6 @@ import com.nvidia.spark.rapids.shims.SparkShimImpl object VersionUtils { - lazy val isSpark301OrLater: Boolean = cmpSparkVersion(3, 0, 1) >= 0 - - lazy val isSpark311OrLater: Boolean = cmpSparkVersion(3, 1, 1) >= 0 - lazy val isSpark320OrLater: Boolean = cmpSparkVersion(3, 2, 0) >= 0 lazy val isSpark: Boolean = { diff --git a/tests/README.md b/tests/README.md index 168ed8a9db1..1e1ff7fea39 100644 --- a/tests/README.md +++ b/tests/README.md @@ -26,15 +26,11 @@ For more information about using scalatest with Maven please refer to the #### Running Unit Tests Against Specific Apache Spark Versions You can run the unit tests against different versions of Spark using the different profiles. 
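Whichever profile the tests are built against, suites can no longer gate on `assumeSpark301orLater` or `assumeSpark311orLater` (the corresponding `VersionUtils` flags are removed above, and the helpers themselves go away in `SparkQueryCompareTestSuite` further down). Any remaining fine-grained gate can be expressed with the `cmpSparkVersion` comparator that this patch keeps; a minimal sketch, where the trait and method names are hypothetical:

```scala
import org.scalatest.{Assertion, Assertions}

// Hypothetical mix-in showing how a suite could still gate on a minimum
// Spark version; cmpSparkVersion is expected to be provided by the suite,
// e.g. from SparkShimImpl.getSparkShimVersion as SparkQueryCompareTestSuite
// already does.
trait SparkVersionGate extends Assertions {
  def cmpSparkVersion(major: Int, minor: Int, bugfix: Int): Int

  def assumeSpark312orLater: Assertion =
    assume(cmpSparkVersion(3, 1, 2) >= 0, "Spark version not 3.1.2+")
}
```

This mirrors the `assumeSpark320orLater` pattern that survives the change, so no new version bookkeeping is introduced.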
The -default version runs against Spark 3.0.1, to run against a specific version use one of the following +default version runs against Spark 3.1.1, to run against a specific version use one of the following profiles: - - `-Pspark301tests` (spark 3.0.1) - - `-Pspark302tests` (spark 3.0.2) - - `-Pspark303tests` (spark 3.0.3) - - `-Pspark304tests` (spark 3.0.4) - - `-Pspark311tests` (spark 3.1.1) - - `-Pspark312tests` (spark 3.1.2) - - `-Pspark313tests` (spark 3.1.3) + - `-Pspark311tests` (Spark 3.1.1) + - `-Pspark312tests` (Spark 3.1.2) + - `-Pspark313tests` (Spark 3.1.3) Please refer to the [tests project POM](pom.xml) to see the list of test profiles supported. Apache Spark specific configurations can be passed in by setting the `SPARK_CONF` environment diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala index b34f6f491d9..377d9b1be77 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala @@ -94,8 +94,6 @@ class AdaptiveQueryExecSuite } test("get row counts from executed shuffle query stages") { - assumeSpark301orLater - skewJoinTest { spark => val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( spark, @@ -152,8 +150,6 @@ class AdaptiveQueryExecSuite } test("Join partitioned tables DPP fallback") { - assumeSpark301orLater - val conf = new SparkConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "-1") // force shuffle exchange @@ -435,7 +431,6 @@ class AdaptiveQueryExecSuite test("Exchange reuse") { logError("Exchange reuse") - assumeSpark301orLater val conf = new SparkConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -468,7 +463,6 @@ class AdaptiveQueryExecSuite test("Change merge join to broadcast join without local shuffle reader") { logError("Change merge join to broadcast join without local shuffle reader") - assumeSpark301orLater val conf = new SparkConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -499,7 +493,6 @@ class AdaptiveQueryExecSuite test("Verify the reader is LocalShuffleReaderExec") { logError("Verify the reader is LocalShuffleReaderExec") - assumeSpark301orLater val conf = new SparkConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -619,8 +612,6 @@ class AdaptiveQueryExecSuite } def skewJoinTest(fun: SparkSession => Unit) { - assumeSpark301orLater - val conf = new SparkConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "-1") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/BroadcastNestedLoopJoinSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/BroadcastNestedLoopJoinSuite.scala index 500031d97d6..12f5b1f33ed 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/BroadcastNestedLoopJoinSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/BroadcastNestedLoopJoinSuite.scala @@ -16,8 +16,6 @@ package com.nvidia.spark.rapids -import com.nvidia.spark.rapids.shims.SparkShimImpl - import org.apache.spark.SparkConf import org.apache.spark.sql.functions.broadcast import org.apache.spark.sql.internal.SQLConf @@ -60,14 +58,7 @@ class BroadcastNestedLoopJoinSuite extends SparkQueryCompareTestSuite { val nljCount = PlanUtils.findOperators(plan, _.isInstanceOf[GpuBroadcastNestedLoopJoinExec]) - SparkShimImpl.getSparkShimVersion match { - case SparkShimVersion(3, 0, 0) => - // we 
didn't start supporting GPU exchanges with AQE until 3.0.1 - assert(nljCount.size === 0) - case _ => - assert(nljCount.size === 1) - } - + assert(nljCount.size === 1) }, conf) } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CostBasedOptimizerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CostBasedOptimizerSuite.scala index 347d41697db..56d1108e617 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CostBasedOptimizerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CostBasedOptimizerSuite.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -127,7 +127,6 @@ class CostBasedOptimizerSuite extends SparkQueryCompareTestSuite // see https://github.com/NVIDIA/spark-rapids/issues/3526 ignore("Force section of plan back onto CPU, AQE on") { logError("Force section of plan back onto CPU, AQE on") - assumeSpark311orLater val conf = createDefaultConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -243,7 +242,6 @@ class CostBasedOptimizerSuite extends SparkQueryCompareTestSuite // see https://github.com/NVIDIA/spark-rapids/issues/3526 ignore("Force last section of plan back onto CPU, AQE on") { logError("Force last section of plan back onto CPU, AQE on") - assumeSpark311orLater val conf = createDefaultConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -474,7 +472,6 @@ class CostBasedOptimizerSuite extends SparkQueryCompareTestSuite } test("Compute estimated row count nested joins no broadcast") { - assumeSpark301orLater logError("Compute estimated row count nested joins no broadcast") val conf = createDefaultConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") @@ -523,7 +520,6 @@ class CostBasedOptimizerSuite extends SparkQueryCompareTestSuite } test("Compute estimated row count nested joins with broadcast") { - assumeSpark301orLater logError("Compute estimated row count nested joins with broadcast") val conf = createDefaultConf() .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala index a94affbf08d..6e1c72bf4f6 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala @@ -105,10 +105,9 @@ class OrcScanSuite extends SparkQueryCompareTestSuite { * then no result returned. Because of 1582-10-03 in hybrid calender * is actually 1582-09-23 in proleptic Gregorian calendar. 
*/ - test("test hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") { + ignore("test hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") { // After Spark 3.1.1, Orc failed to prune when converting Hybrid calendar to Proleptic calendar // Orc bug: https://issues.apache.org/jira/browse/ORC-1083 - assumePriorToSpark311 withCpuSparkSession(spark => { val df = frameFromOrcWithSchema("hybrid-Julian-calendar.orc", diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala index 3226e4955e8..a066a2587e1 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala @@ -1822,22 +1822,12 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm { } } - /** most of the AQE tests requires Spark 3.0.1 or later */ - def assumeSpark301orLater: Assertion = - assume(VersionUtils.isSpark301OrLater, "Spark version not 3.0.1+") - - def assumeSpark311orLater: Assertion = - assume(VersionUtils.isSpark311OrLater, "Spark version not 3.1.1+") - def assumePriorToSpark320: Assertion = assume(!VersionUtils.isSpark320OrLater, "Spark version not before 3.2.0") def assumeSpark320orLater: Assertion = assume(VersionUtils.isSpark320OrLater, "Spark version not 3.2.0+") - def assumePriorToSpark311: Assertion = - assume(!VersionUtils.isSpark311OrLater, "Spark version not before 3.1.1") - def cmpSparkVersion(major: Int, minor: Int, bugfix: Int): Int = { val sparkShimVersion = SparkShimImpl.getSparkShimVersion val (sparkMajor, sparkMinor, sparkBugfix) = sparkShimVersion match {