Skip to content

Commit

Permalink
pythongh-101282: Apply BOLT optimizations to libpython for shared bui…
Browse files Browse the repository at this point in the history
…lds (python#104709)

Apply BOLT optimizations to libpython for shared builds. Most of the C
code is in libpython so it is critical to apply BOLT there to fully realize
BOLT benefits.

This change also reworks how BOLT instrumentation is applied. It
effectively removes the readelf based logic added in pythongh-101525 and
replaces it with a mechanism that saves a copy of the pre-bolt binary
and restores that copy when necessary. This allows us to perform BOLT
optimizations without having to manually delete the output binary to
force a new bolt run.

Also:
- add a clean-bolt target for purging BOLT files and hook that up to the
  clean target
- .gitignore BOLT related files

Before and after this refactor, `make` will no-op after a previous run.
Both versions should also share common make DAG deficiencies where
targets fail to trigger as often as they need to or can trigger
prematurely in certain scenarios. e.g. after this change you may need to
`rm profile-bolt-stamp` to force a BOLT run because there aren't
appropriate non-phony targets for BOLT's make target to depend on.

To make it easier to iterate on custom BOLT settings, the flags to pass
to instrumentation and application are now defined in configure and can
be overridden by passing BOLT_INSTRUMENT_FLAGS and BOLT_APPLY_FLAGS.
  • Loading branch information
indygreg committed May 22, 2023
1 parent 729b252 commit 5360cb3
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 130 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
*.gc??
*.profclang?
*.profraw
# Copies of binaries before BOLT optimizations.
*.prebolt
# BOLT profile data.
*.fdata
*.dyn
.gdb_history
.purify
Expand Down Expand Up @@ -124,6 +128,7 @@ Tools/unicode/data/
/platform
/profile-clean-stamp
/profile-run-stamp
/profile-bolt-stamp
/Python/deepfreeze/*.c
/pybuilddir.txt
/pyconfig.h
Expand Down
7 changes: 7 additions & 0 deletions Doc/using/configure.rst
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,13 @@ also be used to improve performance.
is dependent on a combination of the build environment + the other
optimization configure args + the CPU architecture, and not all combinations
are supported.
Versions of BOLT before LLVM 16 are known to crash under some scenarios.
Use of LLVM 16 or newer for BOLT optimization is strongly encouraged.

The :envvar:`!BOLT_INSTRUMENT_FLAGS` and :envvar:`!BOLT_APPLY_FLAGS`
:program:`configure` variables can be defined to override the default set of
arguments for :program:`llvm-bolt` to instrument and apply BOLT data to
binaries, respectively.

.. versionadded:: 3.12

Expand Down
65 changes: 50 additions & 15 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -672,21 +672,55 @@ profile-opt: profile-run-stamp
-rm -f profile-clean-stamp
$(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)"

.PHONY: bolt-opt
bolt-opt: @PREBOLT_RULE@
# List of binaries that BOLT runs on.
BOLT_BINARIES := @BOLT_BINARIES@

# Extra llvm-bolt arguments, substituted by configure: the first set is
# appended when instrumenting each binary, the second when producing the
# optimized binary from the merged profile data.
BOLT_INSTRUMENT_FLAGS := @BOLT_INSTRUMENT_FLAGS@
BOLT_APPLY_FLAGS := @BOLT_APPLY_FLAGS@

# clean-bolt: purge the files produced by a BOLT run from the build
# directory: profile data (*.fdata), saved pre-BOLT copies of the binaries
# (*.prebolt) and instrumented binaries (*.bolt_inst).
.PHONY: clean-bolt
clean-bolt:
# Profile data.
rm -f *.fdata
@if $(READELF) -p .note.bolt_info $(BUILDPYTHON) | grep BOLT > /dev/null; then\
echo "skip: $(BUILDPYTHON) is already BOLTed."; \
else \
@LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst; \
./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true; \
@MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata; \
@LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot; \
rm -f *.fdata; \
rm -f $(BUILDPYTHON).bolt_inst; \
mv $(BUILDPYTHON).bolt $(BUILDPYTHON); \
fi
# Pristine binaries before BOLT optimization.
rm -f *.prebolt
# BOLT instrumented binaries.
rm -f *.bolt_inst

# profile-bolt-stamp: run the whole BOLT pipeline over every binary listed in
# $(BOLT_BINARIES) and record completion by touching the stamp file.
#
# Steps: (1) save/restore a pristine ".prebolt" copy of each binary and drop
# stale profile data, (2) replace each binary with an instrumented build,
# (3) run $(PROFILE_TASK) to collect profiles, (4) merge the per-process
# profile files, (5) optimize the pristine copy with the merged profile and
# install the result over the original binary.
#
# Every multi-command loop starts with "set -e": without it a failing
# llvm-bolt/merge-fdata/cp/mv in a non-final iteration would be ignored by
# the shell and the stamp would still be created, leaving an instrumented or
# unoptimized binary behind while the build appears to have succeeded.
profile-bolt-stamp: $(BUILDPYTHON)
	# Ensure a pristine, pre-BOLT copy of the binary and no profile data from last run.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  prebolt="$${bin}.prebolt"; \
	  if [ -e "$${prebolt}" ]; then \
	    echo "Restoring pre-BOLT binary $${prebolt}"; \
	    mv "$${prebolt}" "$${bin}"; \
	  fi; \
	  cp "$${bin}" "$${prebolt}"; \
	  rm -f "$${bin}".bolt.*.fdata "$${bin}.fdata"; \
	done
	# Instrument each binary.
	# $(abspath ...) is expanded by make around the literal text "$${bin}.bolt";
	# the shell then substitutes $${bin} into the resulting absolute path.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @LLVM_BOLT@ "$${bin}" -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $${bin}.bolt) -o "$${bin}.bolt_inst" $(BOLT_INSTRUMENT_FLAGS); \
	  mv "$${bin}.bolt_inst" "$${bin}"; \
	done
	# Run instrumented binaries to collect data.
	# Best effort: a failing profile task must not abort the build.
	$(RUNSHARED) ./$(BUILDPYTHON) $(PROFILE_TASK) || true
	# Merge all the data files together.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @MERGE_FDATA@ "$${bin}".*.fdata > "$${bin}.fdata"; \
	  rm -f "$${bin}".*.fdata; \
	done
	# Run bolt against the merged data to produce an optimized binary.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @LLVM_BOLT@ "$${bin}.prebolt" -o "$${bin}.bolt" -data="$${bin}.fdata" $(BOLT_APPLY_FLAGS); \
	  mv "$${bin}.bolt" "$${bin}"; \
	done
	touch $@

# bolt-opt: entry point for a BOLT build. It first invokes the
# configure-selected @PREBOLT_RULE@ target and then the profile-bolt-stamp
# target, as two separate recursive make invocations run in that order.
.PHONY: bolt-opt
bolt-opt:
$(MAKE) @PREBOLT_RULE@
$(MAKE) profile-bolt-stamp

# Compile and run with gcov
.PHONY: coverage
Expand Down Expand Up @@ -2623,10 +2657,11 @@ profile-removal:
rm -f $(COVERAGE_INFO)
rm -rf $(COVERAGE_REPORT)
rm -f profile-run-stamp
rm -f profile-bolt-stamp

.PHONY: clean
clean: clean-retain-profile
@if test @DEF_MAKE_ALL_RULE@ = profile-opt; then \
clean: clean-retain-profile clean-bolt
@if test @DEF_MAKE_ALL_RULE@ = profile-opt -o @DEF_MAKE_ALL_RULE@ = bolt-opt; then \
rm -f profile-gen-stamp profile-clean-stamp; \
$(MAKE) profile-removal; \
fi
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
BOLT optimization is now applied to the libpython shared library if building
a shared library. BOLT instrumentation and application settings can now be
influenced via the ``BOLT_INSTRUMENT_FLAGS`` and ``BOLT_APPLY_FLAGS``
configure variables.
147 changes: 39 additions & 108 deletions configure

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 48 additions & 7 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -2028,13 +2028,6 @@ if test "$Py_BOLT" = 'true' ; then
DEF_MAKE_ALL_RULE="bolt-opt"
DEF_MAKE_RULE="build_all"

AC_SUBST(READELF)
AC_CHECK_TOOLS(READELF, [readelf], "notfound")
if test "$READELF" == "notfound"
then
AC_MSG_ERROR([readelf is required for a --enable-bolt build but could not be found.])
fi

# -fno-reorder-blocks-and-partition is required for bolt to work.
# Possibly GCC only.
AX_CHECK_COMPILE_FLAG([-fno-reorder-blocks-and-partition],[
Expand Down Expand Up @@ -2067,6 +2060,54 @@ if test "$Py_BOLT" = 'true' ; then
fi
fi

dnl Enable BOLT of libpython if built.
dnl BOLT_BINARIES is substituted into the Makefile and holds make variable
dnl references (the single quotes and the escaped dollar keep the shell from
dnl expanding them): always $(BUILDPYTHON), plus $(INSTSONAME) when
dnl enable_shared is "yes".
AC_SUBST(BOLT_BINARIES)
BOLT_BINARIES='$(BUILDPYTHON)'
AS_VAR_IF([enable_shared], [yes], [
BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"
])

dnl BOLT_INSTRUMENT_FLAGS: user-settable arguments for llvm-bolt when
dnl instrumenting binaries. No default flags are supplied here; when the
dnl variable is unset or empty it is simply left empty.
AC_ARG_VAR(
[BOLT_INSTRUMENT_FLAGS],
[Arguments to llvm-bolt when instrumenting binaries]
)
AC_MSG_CHECKING([BOLT_INSTRUMENT_FLAGS])
if test -z "${BOLT_INSTRUMENT_FLAGS}"
then
BOLT_INSTRUMENT_FLAGS=
fi
AC_MSG_RESULT([$BOLT_INSTRUMENT_FLAGS])

dnl BOLT_APPLY_FLAGS: user-settable arguments for llvm-bolt when creating the
dnl BOLT optimized binary. When the variable is unset or empty, the default
dnl flag list below is used.
AC_ARG_VAR(
[BOLT_APPLY_FLAGS],
[Arguments to llvm-bolt when creating a BOLT optimized binary]
)
AC_MSG_CHECKING([BOLT_APPLY_FLAGS])
if test -z "${BOLT_APPLY_FLAGS}"
then
dnl The joined flag list contains spaces, so it must be emitted inside shell
dnl double quotes: AS_VAR_SET with a literal variable name expands to a plain
dnl VAR=VALUE assignment, and an unquoted value would make the shell treat
dnl every flag after the first one as a command to execute.
dnl m4_join is closed directly after its last argument so that no trailing
dnl newline ends up inside the quoted value (it is substituted into the
dnl Makefile on a single line).
  AS_VAR_SET(
    [BOLT_APPLY_FLAGS],
    ["m4_join([ ],
      [-update-debug-sections],
      [-reorder-blocks=ext-tsp],
      [-reorder-functions=hfsort+],
      [-split-functions],
      [-icf=1],
      [-inline-all],
      [-split-eh],
      [-reorder-functions-use-hot-size],
      [-peepholes=none],
      [-jump-tables=aggressive],
      [-inline-ap],
      [-indirect-call-promotion=all],
      [-dyno-stats],
      [-use-gnu-stack],
      [-frame-opt=hot])"]
  )
fi
AC_MSG_RESULT([$BOLT_APPLY_FLAGS])

# XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
# merged with this chunk of code?

Expand Down

0 comments on commit 5360cb3

Please sign in to comment.