From e944ea1e8d709971b8110d7efb77434fbafadc1a Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Thu, 20 Oct 2022 10:32:57 +0200 Subject: [PATCH] Remove Spark2 from Java testing projects (addresses #23728) --- sdks/java/testing/load-tests/build.gradle | 15 +------- sdks/java/testing/nexmark/build.gradle | 47 +++++++++-------------- sdks/java/testing/tpcds/README.md | 4 +- sdks/java/testing/tpcds/build.gradle | 2 +- 4 files changed, 23 insertions(+), 45 deletions(-) diff --git a/sdks/java/testing/load-tests/build.gradle b/sdks/java/testing/load-tests/build.gradle index 2d93993a56574..e157f2fabf32b 100644 --- a/sdks/java/testing/load-tests/build.gradle +++ b/sdks/java/testing/load-tests/build.gradle @@ -39,7 +39,7 @@ def runnerDependency = (project.hasProperty(runnerProperty) : ":runners:direct-java") def loadTestRunnerVersionProperty = "runner.version" def loadTestRunnerVersion = project.findProperty(loadTestRunnerVersionProperty) -def shouldProvideSpark = ":runners:spark:2".equals(runnerDependency) +def isSparkRunner = runnerDependency.startsWith(":runners:spark:") def isDataflowRunner = ":runners:google-cloud-dataflow-java".equals(runnerDependency) def isDataflowRunnerV2 = isDataflowRunner && "V2".equals(loadTestRunnerVersion) def runnerConfiguration = ":runners:direct-java".equals(runnerDependency) ? "shadow" : null @@ -82,20 +82,9 @@ dependencies { gradleRun project(project.path) gradleRun project(path: runnerDependency, configuration: runnerConfiguration) - - // The Spark runner requires the user to provide a Spark dependency. For self-contained - // runs with the Spark runner, we can provide such a dependency. This is deliberately phrased - // to not hardcode any runner other than :runners:direct-java - if (shouldProvideSpark) { - gradleRun library.java.spark_streaming - gradleRun library.java.spark_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - gradleRun library.java.spark_sql - } } -if (shouldProvideSpark) { +if (isSparkRunner) { configurations.gradleRun { // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath exclude group: "org.slf4j", module: "slf4j-jdk14" diff --git a/sdks/java/testing/nexmark/build.gradle b/sdks/java/testing/nexmark/build.gradle index 3a8d3440c80bb..a7fbf2e08ad43 100644 --- a/sdks/java/testing/nexmark/build.gradle +++ b/sdks/java/testing/nexmark/build.gradle @@ -38,8 +38,7 @@ def nexmarkRunnerDependency = project.findProperty(nexmarkRunnerProperty) ?: ":runners:direct-java" def nexmarkRunnerVersionProperty = "nexmark.runner.version" def nexmarkRunnerVersion = project.findProperty(nexmarkRunnerVersionProperty) -def shouldProvideSpark2 = ":runners:spark:2".equals(nexmarkRunnerDependency) -def shouldProvideSpark3 = ":runners:spark:3".equals(nexmarkRunnerDependency) +def isSparkRunner = nexmarkRunnerDependency.startsWith(":runners:spark:") def isDataflowRunner = ":runners:google-cloud-dataflow-java".equals(nexmarkRunnerDependency) def isDataflowRunnerV2 = isDataflowRunner && "V2".equals(nexmarkRunnerVersion) def runnerConfiguration = ":runners:direct-java".equals(nexmarkRunnerDependency) ? "shadow" : null @@ -91,39 +90,15 @@ dependencies { testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") gradleRun project(project.path) gradleRun project(path: nexmarkRunnerDependency, configuration: runnerConfiguration) - - // The Spark runner requires the user to provide a Spark dependency. For self-contained - // runs with the Spark runner, we can provide such a dependency. This is deliberately phrased - // to not hardcode any runner other than :runners:direct-java - if (shouldProvideSpark2) { - gradleRun library.java.spark_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - gradleRun library.java.spark_sql - gradleRun library.java.spark_streaming - } - if (shouldProvideSpark3) { - gradleRun library.java.spark3_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - - gradleRun library.java.spark3_sql - gradleRun library.java.spark3_streaming - } } -if (shouldProvideSpark2) { - configurations.gradleRun { - // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath - exclude group: "org.slf4j", module: "slf4j-jdk14" - } -} -if (shouldProvideSpark3) { +if (isSparkRunner) { configurations.gradleRun { // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath exclude group: "org.slf4j", module: "slf4j-jdk14" } } + def getNexmarkArgs = { def nexmarkArgsStr = project.findProperty(nexmarkArgsProperty) ?: "" def nexmarkArgsList = new ArrayList() @@ -155,6 +130,12 @@ def getNexmarkArgs = { } } } + + if(isSparkRunner) { + // For transparency, be explicit about configuration of local Spark + nexmarkArgsList.add("--sparkMaster=local[4]") + } + return nexmarkArgsList } @@ -162,7 +143,7 @@ def getNexmarkArgs = { // // Parameters: // -Pnexmark.runner -// Specify a runner subproject, such as ":runners:spark:2" or ":runners:flink:1.13" +// Specify a runner subproject, such as ":runners:spark:3" or ":runners:flink:1.13" // Defaults to ":runners:direct-java" // // -Pnexmark.args @@ -177,6 +158,14 @@ task run(type: JavaExec) { dependsOn ":runners:google-cloud-dataflow-java:worker:legacy-worker:shadowJar" } } + if(isSparkRunner) { + // Disable UI + systemProperty "spark.ui.enabled", "false" + systemProperty "spark.ui.showConsoleProgress", "false" + // Dataset runner only + systemProperty "spark.sql.shuffle.partitions", "4" + } + mainClass = "org.apache.beam.sdk.nexmark.Main" classpath = configurations.gradleRun args nexmarkArgsList.toArray() diff --git a/sdks/java/testing/tpcds/README.md b/sdks/java/testing/tpcds/README.md index 247b5cbe93005..85826e341ffb0 100644 --- a/sdks/java/testing/tpcds/README.md +++ b/sdks/java/testing/tpcds/README.md @@ -55,10 +55,10 @@ To run a query using ZetaSQL planner (currently Query96 can be run using ZetaSQL ## Spark Runner -To execute TPC-DS benchmark with Query3 for 1Gb dataset on Apache Spark 2.x, run the following example command from the command line: +To execute TPC-DS benchmark with Query3 for 1Gb dataset on Apache Spark 3.x, run the following example command from the command line: ```bash -./gradlew :sdks:java:testing:tpcds:run -Ptpcds.runner=":runners:spark:2" -Ptpcds.args=" \ +./gradlew :sdks:java:testing:tpcds:run -Ptpcds.runner=":runners:spark:3" -Ptpcds.args=" \ --runner=SparkRunner \ --queries=3 \ --tpcParallel=1 \ diff --git a/sdks/java/testing/tpcds/build.gradle b/sdks/java/testing/tpcds/build.gradle index e9537cfe50ca8..325222e8e8f14 100644 --- a/sdks/java/testing/tpcds/build.gradle +++ b/sdks/java/testing/tpcds/build.gradle @@ -94,7 +94,7 @@ if (isSpark) { // // Parameters: // -Ptpcds.runner -// Specify a runner subproject, such as ":runners:spark:2" or ":runners:flink:1.13" +// Specify a runner subproject, such as ":runners:spark:3" or ":runners:flink:1.13" // Defaults to ":runners:direct-java" // // -Ptpcds.args