Skip to content

Commit

Permalink
[SPARK-48592][INFRA] Add structured logging style script and GitHub workflow
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?
This PR checks for Scala logging messages using logInfo, logWarning, logError and containing variables without MDC wrapper

Example error output:
```
[error] spark/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala:225:4
[error] Logging message should use Structured Logging Framework style, such as log"...${MDC(TASK_ID, taskId)}..."
                Refer to the guidelines in the file `internal/Logging.scala`.
```

### Why are the changes needed?
This makes development and PR review of the structured logging migration easier.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Manual test, verified it will throw errors on invalid logging messages.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #47239 from asl3/structuredlogstylescript.

Authored-by: Amanda Liu <amanda.liu@databricks.com>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
  • Loading branch information
asl3 authored and gengliangwang committed Jul 20, 2024
1 parent 5785098 commit 41f67dd
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,11 @@ jobs:
run: ./dev/mima
- name: Scala linter
run: ./dev/lint-scala
# Run the Structured Logging style checker, but only when the script exists,
# so branches that predate dev/structured_logging_style.py still pass CI.
- name: Scala structured logging check
run: |
if [ -f ./dev/structured_logging_style.py ]; then
python3.9 ./dev/structured_logging_style.py
fi
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
Expand Down
94 changes: 94 additions & 0 deletions dev/structured_logging_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import re
import glob


def _nonmigrated_positions(content, log_pattern, compiled_inner_log_pattern):
    """Return (line_number, column) pairs for log calls in *content* whose
    message interpolates or concatenates variables without an MDC wrapper.

    Lines are 1-based; columns are 0-based, matching the original output.
    """
    positions = []
    for log_statement in re.finditer(log_pattern, content, re.DOTALL):
        log_statement_str = log_statement.group(0).strip()
        # Keep only the argument list: everything after the first "(" up to
        # the trailing ")", with all whitespace removed so the inner pattern
        # can match arguments that span multiple source lines.
        first_paren_index = log_statement_str.find("(")
        inner_log_statement = re.sub(
            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
        )
        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
            start_pos = log_statement.start()
            preceding_content = content[:start_pos]
            line_number = preceding_content.count("\n") + 1
            start_char = start_pos - preceding_content.rfind("\n") - 1
            positions.append((line_number, start_char))
    return positions


def main():
    """Scan Scala sources under the parent directory for logInfo/logWarning/
    logError calls that embed variables without the MDC wrapper, printing
    sbt-style ``[error]`` diagnostics to stderr.

    Exits 0 when every scanned file passes, 1 otherwise.
    """
    # A log call: method name through its closing ")" followed by a newline.
    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
    # Non-MDC message shapes: "...".format(...), s"...$var...", and plain
    # "+" string concatenation.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
    compiled_inner_log_pattern = re.compile(inner_log_pattern)

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        "[Tt]est",
        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
        "/CodeGenerator.scala",
        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
        "/SparkSQLCLIService.scala",
        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
    ]

    nonmigrated_files = {}

    for file in glob.glob(os.path.join("../", "**", "*.scala"), recursive=True):
        if os.path.isdir(file):
            continue
        if any(re.search(pattern, file) for pattern in excluded_file_patterns):
            continue
        with open(file, "r") as f:
            content = f.read()
        issues = _nonmigrated_positions(content, log_pattern, compiled_inner_log_pattern)
        # BUG FIX: the original tested `if log_statements:` on a finditer
        # iterator (always truthy) and recorded an empty entry for every
        # scanned file; only record files that actually have issues.
        if issues:
            nonmigrated_files[file] = issues

    if not nonmigrated_files:
        print("Structured logging style check passed.", file=sys.stderr)
        sys.exit(0)

    for file_path, issues in nonmigrated_files.items():
        for line_number, start_char in issues:
            print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
            # BUG FIX: this is a plain string, not an f-string, so the
            # original's doubled braces printed literally as ${{MDC(...)}}.
            print(
                "[error]\tPlease use the Structured Logging Framework for logging messages "
                'with variables. For example: log"...${MDC(TASK_ID, taskId)}..."'
                "\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
                file=sys.stderr,
            )

    # BUG FIX: exit with the conventional failure status 1; -1 is reported
    # by the shell as 255.
    sys.exit(1)


if __name__ == "__main__":
main()

0 comments on commit 41f67dd

Please sign in to comment.