Batch scalastyle checks across all modules upfront (#3433)
- replaces scalastyle-maven-plugin with an Ant task that uses compact wildcard paths, so we don't need to hardcode the evolving source code layout (a rough CLI equivalent is sketched after the change summary below)
- moves build info generation into pluginManagement so submodules inherit it, while limiting the scalastyle task to the parent pom only
- checks all modules in one pass upfront, so developers no longer wait for style failures that used to surface only after lengthy Scala builds
- fixes code style violations accumulated while the check was incomplete
    
Fixes #3388
    
Signed-off-by: Gera Shegalov <gera@apache.org>
gerashegalov authored Sep 11, 2021
1 parent 1e62e02 commit 906af85
Showing 32 changed files with 86 additions and 139 deletions.
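
For orientation, the new antrun execution in the parent pom boils down to a single scalastyle CLI pass over every module's Scala source roots. Below is a rough shell equivalent of the <java> task added in pom.xml (illustrative only: the jar name and Scala binary version are assumptions; the real build resolves scalastyle_${scala.binary.version} as a plugin dependency, and Ant's "**" wildcards don't map one-to-one onto find patterns):

    # Approximate shell equivalent of the scalastyle-all-modules Ant task.
    # Assumption: a scalastyle_2.12-1.0.0 jar and its dependencies are available
    # locally; the Maven build fetches them as plugin dependencies instead.
    SCALA_DIRS=$(find . -type d \
        \( -path '*/src/main/scala' -o -path '*/src/main/*/scala' \
           -o -path '*/src/test/scala' -o -path '*/src/test/*/scala' \))
    java -cp scalastyle_2.12-1.0.0.jar org.scalastyle.Main \
        --verbose false --warnings false \
        --config scalastyle-config.xml \
        --xmlOutput target/scalastyle-output.xml \
        $SCALA_DIRS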
1 change: 1 addition & 0 deletions dist/pom.xml
@@ -1737,6 +1737,7 @@
<configuration>
<excludes>
<exclude>com.nvidia.spark.rapids.SparkShimServiceProvider.*</exclude>
+<exclude>dependency-reduced-pom.xml</exclude>
<exclude>*.txt</exclude>
<exclude>*.md</exclude>
</excludes>
7 changes: 0 additions & 7 deletions integration_tests/pom.xml
@@ -238,13 +238,6 @@
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-<configuration>
-<includeTestSourceDirectory>true</includeTestSourceDirectory>
-</configuration>
-</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
96 changes: 60 additions & 36 deletions pom.xml
@@ -801,6 +801,27 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.8</version>
+<executions>
+<execution>
+<id>generate-build-info</id>
+<phase>generate-resources</phase>
+<configuration>
+<!-- Execute the shell script to generate the plugin build information. -->
+<target>
+<mkdir dir="${project.build.directory}/extra-resources"/>
+<mkdir dir="${project.build.directory}/tmp"/>
+<exec executable="bash" failonerror="true" output="${project.build.directory}/extra-resources/rapids4spark-version-info.properties">
+<arg value="${user.dir}/build/build-info"/>
+<arg value="${project.version}"/>
+<arg value="${cudf.version}"/>
+</exec>
+</target>
+</configuration>
+<goals>
+<goal>run</goal>
+</goals>
+</execution>
+</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -901,30 +922,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-<version>1.0.0</version>
-<configuration>
-<verbose>false</verbose>
-<failOnViolation>true</failOnViolation>
-<includeTestSourceDirectory>false</includeTestSourceDirectory>
-<failOnWarning>false</failOnWarning>
-<sourceDirectory>${basedir}/src/main/scala</sourceDirectory>
-<testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory>
-<configLocation>scalastyle-config.xml</configLocation>
-<outputFile>${basedir}/target/scalastyle-output.xml</outputFile>
-<inputEncoding>${project.build.sourceEncoding}</inputEncoding>
-<outputEncoding>${project.reporting.outputEncoding}</outputEncoding>
-</configuration>
-<executions>
-<execution>
-<goals>
-<goal>check</goal>
-</goals>
-</execution>
-</executions>
-</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
@@ -1001,28 +998,55 @@
</configuration>
</plugin>
<plugin>
+<!--
+This is an alternative implementation of the scalastyle check invocation,
+a replacement for scalastyle-maven-plugin. It's motivated to address the following:
+- All scala files are checked at once regardless of the module, so the developer
+  can focus on addressing violations without being distracted by the build issues
+  in-between
+- We don't have to hardcode the source code roots added dynamically by other maven
+  plugins to the project
+- The scalastyle launch cost is amortized across all modules
+-->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
+<inherited>false</inherited>
<executions>
<execution>
-<phase>generate-resources</phase>
+<id>scalastyle-all-modules</id>
+<phase>verify</phase>
+<goals><goal>run</goal></goals>
<configuration>
-<!-- Execute the shell script to generate the plugin build information. -->
<target>
-<mkdir dir="${project.build.directory}/extra-resources"/>
-<mkdir dir="${project.build.directory}/tmp"/>
-<exec executable="bash" failonerror="true" output="${project.build.directory}/extra-resources/rapids4spark-version-info.properties">
-<arg value="${user.dir}/build/build-info"/>
-<arg value="${project.version}"/>
-<arg value="${cudf.version}"/>
-</exec>
+<pathconvert property="scalastyle.dirs" pathsep=" ">
+<dirset dir="${project.basedir}" includes="**/src/main/scala"/>
+<dirset dir="${project.basedir}" includes="**/src/main/*/scala"/>
+<dirset dir="${project.basedir}" includes="**/src/test/scala"/>
+<dirset dir="${project.basedir}" includes="**/src/test/*/scala"/>
+</pathconvert>
+<echo>Checking scalastyle for all modules using following paths:
+${scalastyle.dirs}
+</echo>
+<java classname="org.scalastyle.Main" failonerror="true">
+<arg line="--verbose false"/>
+<arg line="--warnings false"/>
+<arg line="--config scalastyle-config.xml"/>
+<arg line="--xmlOutput ${project.basedir}/target/scalastyle-output.xml"/>
+<arg line="--inputEncoding ${project.build.sourceEncoding}"/>
+<arg line="--xmlEncoding ${project.reporting.outputEncoding}"/>
+<arg line="${scalastyle.dirs}"/>
+</java>
</target>
</configuration>
-<goals>
-<goal>run</goal>
-</goals>
</execution>
</executions>
+<dependencies>
+<dependency>
+<groupId>org.scalastyle</groupId>
+<artifactId>scalastyle_${scala.binary.version}</artifactId>
+<version>1.0.0</version>
+</dependency>
+</dependencies>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
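
Because the new execution is marked <inherited>false</inherited> and bound to the verify phase, the batched check fires exactly once per reactor build, from the parent pom. To run just the style check without sitting through compilation, an invocation along these lines should work (hypothetical, not documented by this commit; goal@execution-id addressing requires Maven 3.3.1+):

    # Hypothetical: trigger only the batched scalastyle execution from the repo root.
    # -N (--non-recursive) stays in the parent pom, where the check lives.
    mvn -N antrun:run@scalastyle-all-modules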
4 changes: 0 additions & 4 deletions shims/spark301/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
@@ -25,7 +25,9 @@ import org.apache.spark.sql.rapids.execution.GpuBroadcastExchangeExecBaseWithFut

case class GpuBroadcastExchangeExec(
override val mode: BroadcastMode,
-child: SparkPlan) extends GpuBroadcastExchangeExecBaseWithFuture(mode, child) with BroadcastExchangeLike {
+child: SparkPlan)
+extends GpuBroadcastExchangeExecBaseWithFuture(mode, child)
+with BroadcastExchangeLike {

override def runId: UUID = _runId

@@ -28,9 +28,11 @@ class GpuRunningWindowExecMeta(runningWindowFunctionExec: RunningWindowFunctionE
conf: RapidsConf,
parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule)
-extends GpuBaseWindowExecMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf, parent, rule) {
+extends GpuBaseWindowExecMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf,
+parent, rule) {

-override def getInputWindowExpressions: Seq[NamedExpression] = runningWindowFunctionExec.windowExpressionList
+override def getInputWindowExpressions: Seq[NamedExpression] =
+runningWindowFunctionExec.windowExpressionList
override def getPartitionSpecs: Seq[Expression] = runningWindowFunctionExec.partitionSpec
override def getOrderSpecs: Seq[SortOrder] = runningWindowFunctionExec.orderSpec
override def getResultColumnsOnly: Boolean = true
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020, NVIDIA CORPORATION.
+* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,7 +26,8 @@ case class GpuShuffleExchangeExec(
override val outputPartitioning: Partitioning,
child: SparkPlan,
canChangeNumPartitions: Boolean)
-extends GpuShuffleExchangeExecBaseWithMetrics(outputPartitioning, child) with ShuffleExchangeLike {
+extends GpuShuffleExchangeExecBaseWithMetrics(outputPartitioning, child)
+with ShuffleExchangeLike {

override def numMappers: Int = shuffleDependencyColumnar.rdd.getNumPartitions

@@ -62,8 +62,8 @@ class ProxyRapidsShuffleInternalManager(conf: SparkConf, isDriver: Boolean)
endPartition: Int,
context: TaskContext,
metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = {
-self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition, context,
-metrics)
+self.getReaderForRange(handle, startMapIndex, endMapIndex, startPartition, endPartition,
+context, metrics)
}

def getReader[K, C](
@@ -52,8 +52,6 @@ import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
import org.apache.spark.sql.execution.exchange.{ReusedExchangeExec, ShuffleExchangeExec}
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.execution.python._
-import org.apache.spark.sql.rapids.execution.TrampolineUtil
-import org.apache.spark.sql.rapids.execution.python.shims.spark301db._
import org.apache.spark.sql.execution.window.WindowExecBase
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.{GpuFileSourceScanExec, GpuStringReplace, GpuTimeSub, ShuffleManagerShimBase}
@@ -39,8 +39,8 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.python.PythonUDFRunner
-import org.apache.spark.sql.rapids.execution.python._
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.rapids.execution.python._
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -34,8 +34,8 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
import org.apache.spark.sql.execution.UnaryExecNode
import org.apache.spark.sql.execution.python._
-import org.apache.spark.sql.rapids.execution.python._
import org.apache.spark.sql.rapids.GpuAggregateExpression
+import org.apache.spark.sql.rapids.execution.python._
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.ColumnarBatch
4 changes: 0 additions & 4 deletions shims/spark302/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark303/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark304/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark311/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark311cdh/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
@@ -28,9 +28,11 @@ class GpuRunningWindowExecMeta(runningWindowFunctionExec: RunningWindowFunctionE
conf: RapidsConf,
parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule)
-extends GpuBaseWindowExecMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf, parent, rule) {
+extends GpuBaseWindowExecMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf,
+parent, rule) {

-override def getInputWindowExpressions: Seq[NamedExpression] = runningWindowFunctionExec.windowExpressionList
+override def getInputWindowExpressions: Seq[NamedExpression] =
+runningWindowFunctionExec.windowExpressionList
override def getPartitionSpecs: Seq[Expression] = runningWindowFunctionExec.partitionSpec
override def getOrderSpecs: Seq[SortOrder] = runningWindowFunctionExec.orderSpec
override def getResultColumnsOnly: Boolean = true
@@ -21,6 +21,7 @@ import java.nio.ByteBuffer

import com.databricks.sql.execution.window.RunningWindowFunctionExec
import com.nvidia.spark.rapids._
+import com.nvidia.spark.rapids.shims.spark311db._
import com.nvidia.spark.rapids.shims.v2.Spark30XShims
import org.apache.arrow.memory.ReferenceManager
import org.apache.arrow.vector.ValueVector
@@ -51,12 +52,12 @@ import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ReusedExchangeExec, ShuffleExchangeExec}
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.execution.python._
-import org.apache.spark.sql.rapids.execution.python.shims.spark311db._
import org.apache.spark.sql.execution.window.WindowExecBase
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.rapids._
import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExecBase, GpuBroadcastNestedLoopJoinExecBase, GpuShuffleExchangeExecBase, JoinTypeChecks}
import org.apache.spark.sql.rapids.execution.python.{GpuAggregateInPandasExecMeta, GpuArrowEvalPythonExec, GpuMapInPandasExecMeta, GpuPythonUDF}
+import org.apache.spark.sql.rapids.execution.python.shims.spark311db._
import org.apache.spark.sql.rapids.shims.spark311db._
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types._
@@ -46,7 +46,6 @@ import org.apache.spark.sql.types._
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.Utils
-import org.apache.spark.sql.rapids.execution.python.GpuPythonUDF

/**
* Group Map UDF specific serializer for Databricks because they have a special GroupUDFSerializer.
@@ -72,7 +71,8 @@ class GpuGroupUDFArrowPythonRunner(
onDataWriteFinished: () => Unit,
override val pythonOutSchema: StructType,
minReadTargetBatchSize: Int)
-extends GpuArrowPythonRunner(funcs, evalType, argOffsets, pythonInSchema, timeZoneId, conf, batchSize, onDataWriteFinished, pythonOutSchema, minReadTargetBatchSize) {
+extends GpuArrowPythonRunner(funcs, evalType, argOffsets, pythonInSchema, timeZoneId, conf,
+batchSize, onDataWriteFinished, pythonOutSchema, minReadTargetBatchSize) {

protected override def newWriterThread(
env: SparkEnv,
@@ -18,7 +18,6 @@ package org.apache.spark.sql.rapids.shims.spark311db
import scala.concurrent.Future

import org.apache.spark.MapOutputStatistics
-
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.plans.logical.Statistics
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
4 changes: 0 additions & 4 deletions shims/spark312/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark313/pom.xml
@@ -57,10 +57,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
4 changes: 0 additions & 4 deletions shims/spark320/pom.xml
@@ -58,10 +58,6 @@
</execution>
</executions>
</plugin>
-<plugin>
-<groupId>org.scalastyle</groupId>
-<artifactId>scalastyle-maven-plugin</artifactId>
-</plugin>
</plugins>

<resources>
…