Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48592][INFRA] Add structured logging style script and GitHub workflow #47239

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6ef59e6
add structured logging style script and github workflow
asl3 Jul 5, 2024
d640b94
improve error message
asl3 Jul 5, 2024
b862228
match scala build error message
asl3 Jul 5, 2024
42de4c5
account for line breaks in regex match
asl3 Jul 6, 2024
95e86cc
fix script name
asl3 Jul 6, 2024
667bc58
update regex
asl3 Jul 6, 2024
58e763d
separate mono regex
asl3 Jul 8, 2024
0f26de8
add java files check
asl3 Jul 8, 2024
8959688
reformat python
asl3 Jul 8, 2024
0bd4e8a
check only scala files
asl3 Jul 8, 2024
c8b1b27
add CodeGenerator to exclude list due to large code formatter variable
asl3 Jul 8, 2024
22940d7
Merge branch 'master' into structuredlogstylescript
asl3 Jul 9, 2024
ae67d94
check if script exists
asl3 Jul 9, 2024
9f17a40
fix inner regex
asl3 Jul 9, 2024
4bb0ccf
add exclude file
asl3 Jul 9, 2024
97c3a74
revise error message
asl3 Jul 10, 2024
1b3950d
update error message
asl3 Jul 10, 2024
6f47405
check if file is a directory
asl3 Jul 10, 2024
dd0fb47
update regex
asl3 Jul 10, 2024
d6206a9
rename script
asl3 Jul 11, 2024
62e3c0d
style
asl3 Jul 11, 2024
e6d88b2
update gha yml
asl3 Jul 11, 2024
72f0c1b
line char max
asl3 Jul 11, 2024
7bdf67c
stylize the error message
asl3 Jul 15, 2024
bc400cc
Merge branch 'master' into structuredlogstylescript
asl3 Jul 17, 2024
bed4256
modify + regex
asl3 Jul 17, 2024
44ef1a5
Merge branch 'master' into structuredlogstylescript
asl3 Jul 18, 2024
20dd905
Merge branch 'master' into structuredlogstylescript
asl3 Jul 18, 2024
a253657
run script with python3.9
asl3 Jul 19, 2024
0c22383
update success condition check
asl3 Jul 19, 2024
2333388
lint fix
asl3 Jul 20, 2024
5f78989
reformat
asl3 Jul 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,11 @@ jobs:
run: ./dev/mima
- name: Scala linter
run: ./dev/lint-scala
- name: Scala structured logging check
run: |
if [ -f ./dev/structured-logging-style.py ]; then
./dev/structured-logging-style.py
fi
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
Expand Down
92 changes: 92 additions & 0 deletions dev/structured-logging-style.py
asl3 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
asl3 marked this conversation as resolved.
Show resolved Hide resolved

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import re
import glob

from sparktestsupport import SPARK_HOME
asl3 marked this conversation as resolved.
Show resolved Hide resolved


def main(root=None):
    """Flag Scala logging calls that bypass the Structured Logging Framework.

    Recursively scans every ``*.scala`` file under ``root`` for
    ``logInfo``/``logWarning``/``logError`` calls whose message is built with
    ``String.format``, the ``s"...$x"`` interpolator, or ``+`` concatenation,
    and prints each offending call site as ``file:line:column``.

    Parameters
    ----------
    root : str, optional
        Directory to scan. Defaults to ``SPARK_HOME`` (the Spark source
        tree), preserving the original zero-argument behavior.

    Exits with status 0 when the check passes, -1 otherwise.
    """
    # A log call runs from the call name to the first ")" directly followed
    # by a newline; re.DOTALL lets the lazy ".*?" span multiple lines.
    log_pattern = re.compile(r"log(?:Info|Warning|Error)\(.*?\)\n", re.DOTALL)
    # Non-structured message shapes: "...".format(...), an s-interpolated or
    # concatenated literal, or "+" concatenation with a literal on the right.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
    compiled_inner_log_pattern = re.compile(inner_log_pattern)

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        "[Tt]est",
        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
    ]

    # file path -> list of (line, column) violations; only files with at
    # least one violation are recorded.
    nonmigrated_files = {}

    # SPARK_HOME is only evaluated when no explicit root is supplied.
    search_root = SPARK_HOME if root is None else root
    scala_files = glob.glob(os.path.join(search_root, "**", "*.scala"), recursive=True)

    for file in scala_files:
        if any(re.search(pattern, file) for pattern in excluded_file_patterns):
            continue
        if os.path.isdir(file):
            continue

        with open(file, "r") as f:
            content = f.read()

        issues = []
        for log_statement in log_pattern.finditer(content):
            log_statement_str = log_statement.group(0).strip()
            # Trim the call name and the outermost parentheses, then strip
            # all whitespace so multi-line messages match the one-line regex.
            first_paren_index = log_statement_str.find("(")
            inner_log_statement = re.sub(
                r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
            )

            if compiled_inner_log_pattern.fullmatch(inner_log_statement):
                start_pos = log_statement.start()
                preceding_content = content[:start_pos]
                line_number = preceding_content.count("\n") + 1
                start_char = start_pos - preceding_content.rfind("\n") - 1
                issues.append((line_number, start_char))

        # Bug fix: record a file only when it actually has violations. The
        # original guard "if log_statements:" was always true (finditer
        # returns an iterator object), so every scanned file received an
        # empty entry and the script exited -1 even with zero violations.
        if issues:
            nonmigrated_files[file] = issues

    if not nonmigrated_files:
        print("Structured logging style check passed.")
        sys.exit(0)

    for file_path, issues in nonmigrated_files.items():
        for line_number, start_char in issues:
            print(f"[error] {file_path}:{line_number}:{start_char}")
    # Bug fix: the example previously omitted the closing "}" of the MDC
    # interpolation.
    print(
        """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)}...".
Refer to the guidelines in the file `internal/Logging.scala`."""
    )

    sys.exit(-1)


if __name__ == "__main__":
    main()
91 changes: 91 additions & 0 deletions dev/structured_logging_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import re
import glob


def main(root="../"):
    """Flag Scala logging calls that bypass the Structured Logging Framework.

    Recursively scans every ``*.scala`` file under ``root`` for
    ``logInfo``/``logWarning``/``logError`` calls whose message is built with
    ``String.format``, the ``s"...$x"`` interpolator, or ``+`` concatenation,
    and prints each offending call site to stderr as ``file:line:column``.

    Parameters
    ----------
    root : str, optional
        Directory to scan. Defaults to ``"../"``, i.e. the repository root
        when the script is invoked from ``dev/`` (the original behavior).

    Exits with status 0 when the check passes, -1 otherwise.
    """
    # A log call runs from the call name to the first ")" directly followed
    # by a newline; re.DOTALL lets the lazy ".*?" span multiple lines.
    log_pattern = re.compile(r"log(?:Info|Warning|Error)\(.*?\)\n", re.DOTALL)
    # Non-structured message shapes: "...".format(...), an s-interpolated or
    # concatenated literal, or "+" concatenation with a literal on the right.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+).*|[^"]+\+\s*".*?"'
    compiled_inner_log_pattern = re.compile(inner_log_pattern)

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        "[Tt]est",
        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala",
        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala",
        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
    ]

    # file path -> list of (line, column) violations; only files with at
    # least one violation are recorded.
    nonmigrated_files = {}

    scala_files = glob.glob(os.path.join(root, "**", "*.scala"), recursive=True)

    for file in scala_files:
        if any(re.search(pattern, file) for pattern in excluded_file_patterns):
            continue
        if os.path.isdir(file):
            continue

        with open(file, "r") as f:
            content = f.read()

        issues = []
        for log_statement in log_pattern.finditer(content):
            log_statement_str = log_statement.group(0).strip()
            # Trim the call name and the outermost parentheses, then strip
            # all whitespace so multi-line messages match the one-line regex.
            first_paren_index = log_statement_str.find("(")
            inner_log_statement = re.sub(
                r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
            )

            if compiled_inner_log_pattern.fullmatch(inner_log_statement):
                start_pos = log_statement.start()
                preceding_content = content[:start_pos]
                line_number = preceding_content.count("\n") + 1
                start_char = start_pos - preceding_content.rfind("\n") - 1
                issues.append((line_number, start_char))

        # Bug fix: record a file only when it actually has violations. The
        # original guard "if log_statements:" was always true (finditer
        # returns an iterator object), so every scanned file received an
        # empty entry and the script exited -1 even with zero violations.
        if issues:
            nonmigrated_files[file] = issues

    if not nonmigrated_files:
        print("Structured logging style check passed.", file=sys.stderr)
        sys.exit(0)

    for file_path, issues in nonmigrated_files.items():
        for line_number, start_char in issues:
            print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
    # Bug fix: the example previously omitted the closing "}" of the MDC
    # interpolation.
    print(
        """[error]\tPlease use the Structured Logging Framework for logging messages with variables. For example: log"...${MDC(TASK_ID, taskId)}...".
Refer to the guidelines in the file `internal/Logging.scala`.""",
        file=sys.stderr,
    )

    sys.exit(-1)


if __name__ == "__main__":
    main()