Skip to content

Commit

Permalink
[mypyc] Inline increfs and decrefs in commonly executed blocks (#11540)
Browse files Browse the repository at this point in the history
These operations are performance-critical, but inlining everywhere
can slow down compilation a lot, so we only inline them outside error
handlers (and other rarely executed code paths).

This can still slow down compilation by 10-15%, but I think that we just need
to live with it, since the performance gains are impressive. We can perhaps
claw back some of the loss by optimizing away redundant increfs/decrefs.
Also, parallel compilation would make this much less significant.

This can speed up the richards benchmark by 65% (!).

With this change:

```
running richards
...........
interpreted: 0.181880s (avg of 6 iterations; stdev 0.91%)
compiled:    0.005314s (avg of 6 iterations; stdev 1.2%)

compiled is 34.229x faster
```

Using master:

```
running richards
...........
interpreted: 0.182124s (avg of 6 iterations; stdev 2.1%)
compiled:    0.008794s (avg of 6 iterations; stdev 1.9%)

compiled is 20.710x faster
```

Also, this makes the int_list microbenchmark up to 80% faster.
Compiled mypy was also around 3% faster.
  • Loading branch information
JukkaL committed Nov 15, 2021
1 parent 053a1be commit 7a5c6f0
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 26 deletions.
32 changes: 32 additions & 0 deletions mypyc/analysis/blockfreq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Find basic blocks that are likely to be executed frequently.
For example, this would not include blocks that have exception handlers.
We can use different optimization heuristics for common and rare code. For
example, we can make IR fast to compile instead of fast to execute for rare
code.
"""

from typing import Set

from mypyc.ir.ops import BasicBlock, Goto, Branch


def frequently_executed_blocks(entry_point: BasicBlock) -> Set[BasicBlock]:
    """Collect the blocks reachable from entry_point along commonly taken edges.

    The control-flow graph is walked starting at entry_point. A Goto
    contributes its label. A Branch always contributes its false target;
    its true target is only followed when the branch is not marked rare
    and has no traceback entry. Any other terminator adds no successors.
    """
    hot: Set[BasicBlock] = set()
    pending = [entry_point]
    while pending:
        current = pending.pop()
        if current not in hot:
            hot.add(current)
            term = current.terminator
            if isinstance(term, Goto):
                pending.append(term.label)
            elif isinstance(term, Branch):
                # The true target of a rare branch (or of one that generates
                # a traceback entry) is treated as a cold path and skipped.
                if not term.rare and term.traceback_entry is None:
                    pending.append(term.true)
                pending.append(term.false)
    return hot
33 changes: 27 additions & 6 deletions mypyc/codegen/emit.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,35 +348,56 @@ def declare_tuple_struct(self, tuple_type: RTuple) -> None:
is_type=True,
)

def emit_inc_ref(self, dest: str, rtype: RType) -> None:
def emit_inc_ref(self, dest: str, rtype: RType, *, rare: bool = False) -> None:
"""Increment reference count of C expression `dest`.
For composite unboxed structures (e.g. tuples) recursively
increment reference counts for each component.
If rare is True, optimize for code size and compilation speed.
"""
if is_int_rprimitive(rtype):
self.emit_line('CPyTagged_IncRef(%s);' % dest)
if rare:
self.emit_line('CPyTagged_IncRef(%s);' % dest)
else:
self.emit_line('CPyTagged_INCREF(%s);' % dest)
elif isinstance(rtype, RTuple):
for i, item_type in enumerate(rtype.types):
self.emit_inc_ref('{}.f{}'.format(dest, i), item_type)
elif not rtype.is_unboxed:
# Always inline, since this is a simple op
self.emit_line('CPy_INCREF(%s);' % dest)
# Otherwise assume it's an unboxed, pointerless value and do nothing.

def emit_dec_ref(self, dest: str, rtype: RType, is_xdec: bool = False) -> None:
def emit_dec_ref(self,
dest: str,
rtype: RType,
*,
is_xdec: bool = False,
rare: bool = False) -> None:
"""Decrement reference count of C expression `dest`.
For composite unboxed structures (e.g. tuples) recursively
decrement reference counts for each component.
If rare is True, optimize for code size and compilation speed.
"""
x = 'X' if is_xdec else ''
if is_int_rprimitive(rtype):
self.emit_line('CPyTagged_%sDecRef(%s);' % (x, dest))
if rare:
self.emit_line('CPyTagged_%sDecRef(%s);' % (x, dest))
else:
# Inlined
self.emit_line('CPyTagged_%sDECREF(%s);' % (x, dest))
elif isinstance(rtype, RTuple):
for i, item_type in enumerate(rtype.types):
self.emit_dec_ref('{}.f{}'.format(dest, i), item_type, is_xdec)
self.emit_dec_ref('{}.f{}'.format(dest, i), item_type, is_xdec=is_xdec, rare=rare)
elif not rtype.is_unboxed:
self.emit_line('CPy_%sDecRef(%s);' % (x, dest))
if rare:
self.emit_line('CPy_%sDecRef(%s);' % (x, dest))
else:
# Inlined
self.emit_line('CPy_%sDECREF(%s);' % (x, dest))
# Otherwise assume it's an unboxed, pointerless value and do nothing.

def pretty_name(self, typ: RType) -> str:
Expand Down
11 changes: 8 additions & 3 deletions mypyc/codegen/emitfunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from mypyc.ir.func_ir import FuncIR, FuncDecl, FUNC_STATICMETHOD, FUNC_CLASSMETHOD, all_values
from mypyc.ir.class_ir import ClassIR
from mypyc.ir.pprint import generate_names_for_ir
from mypyc.analysis.blockfreq import frequently_executed_blocks

# Whether to insert debug asserts for all error handling, to quickly
# catch errors propagating without exceptions set.
Expand Down Expand Up @@ -77,8 +78,11 @@ def generate_native_function(fn: FuncIR,
for i, block in enumerate(blocks):
block.label = i

common = frequently_executed_blocks(fn.blocks[0])

for i in range(len(blocks)):
block = blocks[i]
visitor.rare = block not in common
next_block = None
if i + 1 < len(blocks):
next_block = blocks[i + 1]
Expand All @@ -105,6 +109,7 @@ def __init__(self,
self.source_path = source_path
self.module_name = module_name
self.literals = emitter.context.literals
self.rare = False
self.next_block: Optional[BasicBlock] = None

def temp_name(self) -> str:
Expand Down Expand Up @@ -416,7 +421,7 @@ def visit_inc_ref(self, op: IncRef) -> None:

def visit_dec_ref(self, op: DecRef) -> None:
src = self.reg(op.src)
self.emit_dec_ref(src, op.src.type, op.is_xdec)
self.emit_dec_ref(src, op.src.type, is_xdec=op.is_xdec)

def visit_box(self, op: Box) -> None:
    """Emit boxing code for `op` by delegating to the emitter's emit_box."""
    source = op.src
    src_expr = self.reg(source)
    dest_expr = self.reg(op)
    self.emitter.emit_box(src_expr, dest_expr, source.type, can_borrow=True)
Expand Down Expand Up @@ -574,10 +579,10 @@ def emit_lines(self, *lines: str) -> None:
self.emitter.emit_lines(*lines)

def emit_inc_ref(self, dest: str, rtype: RType) -> None:
self.emitter.emit_inc_ref(dest, rtype)
self.emitter.emit_inc_ref(dest, rtype, rare=self.rare)

def emit_dec_ref(self, dest: str, rtype: RType, is_xdec: bool) -> None:
self.emitter.emit_dec_ref(dest, rtype, is_xdec)
self.emitter.emit_dec_ref(dest, rtype, is_xdec=is_xdec, rare=self.rare)

def emit_declaration(self, line: str) -> None:
    """Forward `line` to the emit_line method of self.declarations."""
    emit = self.declarations.emit_line
    emit(line)
Expand Down
3 changes: 2 additions & 1 deletion mypyc/ir/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,8 @@ def __init__(self,
self.negated = False
# If not None, the true label should generate a traceback entry (func name, line number)
self.traceback_entry: Optional[Tuple[str, int]] = None
# If True, the condition is expected to be usually False (for optimization purposes)
# If True, we expect to usually take the false branch (for optimization purposes);
# this is implicitly treated as true if there is a traceback entry
self.rare = rare

def targets(self) -> Sequence[BasicBlock]:
Expand Down
27 changes: 22 additions & 5 deletions mypyc/lib-rt/CPy.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,24 @@ static inline int CPyTagged_CheckShort(CPyTagged x) {
return !CPyTagged_CheckLong(x);
}

/* Inline wrapper around CPyTagged_IncRef: the call is only made when
 * CPyTagged_CheckLong(x) is true, and that case is hinted as unlikely. */
static inline void CPyTagged_INCREF(CPyTagged x) {
    const int is_long = CPyTagged_CheckLong(x);
    if (unlikely(is_long)) {
        CPyTagged_IncRef(x);
    }
}

/* Inline wrapper around CPyTagged_DecRef: the call is only made when
 * CPyTagged_CheckLong(x) is true, and that case is hinted as unlikely. */
static inline void CPyTagged_DECREF(CPyTagged x) {
    const int is_long = CPyTagged_CheckLong(x);
    if (unlikely(is_long)) {
        CPyTagged_DecRef(x);
    }
}

/* Inline wrapper around CPyTagged_XDecRef: the call is only made when
 * CPyTagged_CheckLong(x) is true, and that case is hinted as unlikely. */
static inline void CPyTagged_XDECREF(CPyTagged x) {
    const int is_long = CPyTagged_CheckLong(x);
    if (unlikely(is_long)) {
        CPyTagged_XDecRef(x);
    }
}

static inline Py_ssize_t CPyTagged_ShortAsSsize_t(CPyTagged x) {
// NOTE: Assume that we sign extend.
return (Py_ssize_t)x >> 1;
Expand Down Expand Up @@ -253,11 +271,10 @@ static inline bool CPyTagged_IsLe(CPyTagged left, CPyTagged right) {
// Generic operations (that work with arbitrary types)


/* We use intentionally non-inlined decrefs since it pretty
* substantially speeds up compile time while only causing a ~1%
* performance degradation. We have our own copies both to avoid the
* null check in Py_DecRef and to avoid making an indirect PIC
* call. */
/* We use intentionally non-inlined decrefs in rarely executed code
* paths since it pretty substantially speeds up compile time. We have
* our own copies both to avoid the null check in Py_DecRef and to avoid
* making an indirect PIC call. */
CPy_NOINLINE
static void CPy_DecRef(PyObject *p) {
CPy_DECREF(p);
Expand Down
124 changes: 124 additions & 0 deletions mypyc/test-data/exceptions-freq.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
-- Test cases for basic block execution frequency analysis.
--
-- These test cases are using exception transform test machinery for convenience.
--
-- NOTE: These must all have the _freq suffix

[case testSimpleError_freq]
from typing import List
def f(x: List[int]) -> int:
return x[0]
[out]
def f(x):
x :: list
r0 :: object
r1, r2 :: int
L0:
r0 = CPyList_GetItemShort(x, 0)
if is_error(r0) goto L3 (error at f:3) else goto L1
L1:
r1 = unbox(int, r0)
dec_ref r0
if is_error(r1) goto L3 (error at f:3) else goto L2
L2:
return r1
L3:
r2 = <error> :: int
return r2
hot blocks: [0, 1, 2]

[case testHotBranch_freq]
from typing import List
def f(x: bool) -> None:
if x:
y = 1
else:
y = 2
[out]
def f(x):
x :: bool
y :: int
L0:
if x goto L1 else goto L2 :: bool
L1:
y = 2
dec_ref y :: int
goto L3
L2:
y = 4
dec_ref y :: int
L3:
return 1
hot blocks: [0, 1, 2, 3]

[case testGoto_freq]
from typing import List
def f(x: bool) -> int:
if x:
y = 1
else:
return 2
return y
[out]
def f(x):
x :: bool
y :: int
L0:
if x goto L1 else goto L2 :: bool
L1:
y = 2
goto L3
L2:
return 4
L3:
return y
hot blocks: [0, 1, 2, 3]

[case testFalseOnError_freq]
from typing import List
def f(x: List[int]) -> None:
x[0] = 1
[out]
def f(x):
x :: list
r0 :: object
r1 :: bit
r2 :: None
L0:
r0 = box(short_int, 2)
r1 = CPyList_SetItem(x, 0, r0)
if not r1 goto L2 (error at f:3) else goto L1 :: bool
L1:
return 1
L2:
r2 = <error> :: None
return r2
hot blocks: [0, 1]

[case testRareBranch_freq]
from typing_extensions import Final

x: Final = str()

def f() -> str:
return x
[out]
def f():
r0 :: str
r1 :: bool
r2 :: str
L0:
r0 = __main__.x :: static
if is_error(r0) goto L1 else goto L3
L1:
r1 = raise NameError('value for final name "x" was not set')
if not r1 goto L4 (error at f:6) else goto L2 :: bool
L2:
unreachable
L3:
inc_ref r0
return r0
L4:
r2 = <error> :: str
return r2
hot blocks: [0, 3]
Loading

0 comments on commit 7a5c6f0

Please sign in to comment.