Skip to content

Commit

Permalink
[SPARK-48592][INFRA] Add structured logging style script and GitHub workflow
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?
This PR checks for Scala logging messages using logInfo, logWarning, logError and containing variables without MDC wrapper

Example error output:
```
[error] spark/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala:225:4
[error] Logging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)}..."
                Refer to the guidelines in the file `internal/Logging.scala`.
```

### Why are the changes needed?
This makes development and PR review of the structured logging migration easier.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Manual test, verified it will throw errors on invalid logging messages.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #47239 from asl3/structuredlogstylescript.

Authored-by: Amanda Liu <amanda.liu@databricks.com>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
  • Loading branch information
asl3 authored and gengliangwang committed Jul 20, 2024
1 parent 5785098 commit 41f67dd
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,11 @@ jobs:
run: ./dev/mima
- name: Scala linter
run: ./dev/lint-scala
# Run the Structured Logging style checker, but only when the script exists,
# so branches that predate dev/structured_logging_style.py still pass CI.
- name: Scala structured logging check
run: |
if [ -f ./dev/structured_logging_style.py ]; then
python3.9 ./dev/structured_logging_style.py
fi
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
Expand Down
94 changes: 94 additions & 0 deletions dev/structured_logging_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import re
import glob


def _nonmigrated_positions(content, log_pattern, compiled_inner_log_pattern):
    """Return (line_number, column) pairs for log calls in *content* whose
    message interpolates or concatenates variables without an MDC wrapper.

    Lines are 1-based; columns are 0-based, matching the original output.
    """
    positions = []
    for log_statement in re.finditer(log_pattern, content, re.DOTALL):
        log_statement_str = log_statement.group(0).strip()
        # Keep only the argument list: everything after the first "(" up to
        # the trailing ")", with all whitespace removed so the inner pattern
        # can match arguments that span multiple source lines.
        first_paren_index = log_statement_str.find("(")
        inner_log_statement = re.sub(
            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
        )
        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
            start_pos = log_statement.start()
            preceding_content = content[:start_pos]
            line_number = preceding_content.count("\n") + 1
            start_char = start_pos - preceding_content.rfind("\n") - 1
            positions.append((line_number, start_char))
    return positions


def main():
    """Scan Scala sources under the parent directory for logInfo/logWarning/
    logError calls that embed variables without the MDC wrapper, printing
    sbt-style ``[error]`` diagnostics to stderr.

    Exits 0 when every scanned file passes, 1 otherwise.
    """
    # A log call: method name through its closing ")" followed by a newline.
    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
    # Non-MDC message shapes: "...".format(...), s"...$var...", and plain
    # "+" string concatenation.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
    compiled_inner_log_pattern = re.compile(inner_log_pattern)

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        "[Tt]est",
        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
        "/CodeGenerator.scala",
        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
        "/SparkSQLCLIService.scala",
        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
    ]

    nonmigrated_files = {}

    for file in glob.glob(os.path.join("../", "**", "*.scala"), recursive=True):
        if os.path.isdir(file):
            continue
        if any(re.search(pattern, file) for pattern in excluded_file_patterns):
            continue
        with open(file, "r") as f:
            content = f.read()
        issues = _nonmigrated_positions(content, log_pattern, compiled_inner_log_pattern)
        # BUG FIX: the original tested `if log_statements:` on a finditer
        # iterator (always truthy) and recorded an empty entry for every
        # scanned file; only record files that actually have issues.
        if issues:
            nonmigrated_files[file] = issues

    if not nonmigrated_files:
        print("Structured logging style check passed.", file=sys.stderr)
        sys.exit(0)

    for file_path, issues in nonmigrated_files.items():
        for line_number, start_char in issues:
            print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
            # BUG FIX: this is a plain string, not an f-string, so the
            # original's doubled braces printed literally as ${{MDC(...)}}.
            print(
                "[error]\tPlease use the Structured Logging Framework for logging messages "
                'with variables. For example: log"...${MDC(TASK_ID, taskId)}..."'
                "\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
                file=sys.stderr,
            )

    # BUG FIX: exit with the conventional failure status 1; -1 is reported
    # by the shell as 255.
    sys.exit(1)


if __name__ == "__main__":
main()

0 comments on commit 41f67dd

Please sign in to comment.