Skip to content

gh-127022: Simplify PyStackRef_FromPyObjectSteal #127024

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions Include/internal/pycore_stackref.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ _PyStackRef_FromPyObjectSteal(PyObject *obj)
assert(obj != NULL);
// Make sure we don't take an already tagged value.
assert(((uintptr_t)obj & Py_TAG_BITS) == 0);
unsigned int tag = _Py_IsImmortal(obj) ? (Py_TAG_DEFERRED) : Py_TAG_PTR;
return ((_PyStackRef){.bits = ((uintptr_t)(obj)) | tag});
return (_PyStackRef){ .bits = (uintptr_t)obj };
}
# define PyStackRef_FromPyObjectSteal(obj) _PyStackRef_FromPyObjectSteal(_PyObject_CAST(obj))

Expand Down Expand Up @@ -190,9 +189,15 @@ static const _PyStackRef PyStackRef_NULL = { .bits = 0 };

#endif // Py_GIL_DISABLED

// Note: this is a macro because MSVC (Windows) has trouble inlining it.
// Check if a stackref is exactly the same as another stackref, including
// the deferred bit. This can only be used safely if you know that the deferred
// bits of `a` and `b` match.
#define PyStackRef_IsExactly(a, b) ((a).bits == (b).bits)

#define PyStackRef_Is(a, b) ((a).bits == (b).bits)
// Checks that mask out the deferred bit in the free threading build.
#define PyStackRef_IsNone(ref) (PyStackRef_AsPyObjectBorrow(ref) == Py_None)
#define PyStackRef_IsTrue(ref) (PyStackRef_AsPyObjectBorrow(ref) == Py_True)
#define PyStackRef_IsFalse(ref) (PyStackRef_AsPyObjectBorrow(ref) == Py_False)

// Converts a PyStackRef back to a PyObject *, converting the
// stackref to a new reference.
Expand Down
42 changes: 17 additions & 25 deletions Python/bytecodes.c
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ dummy_func(

pure inst(UNARY_NOT, (value -- res)) {
assert(PyStackRef_BoolCheck(value));
res = PyStackRef_Is(value, PyStackRef_False)
res = PyStackRef_IsExactly(value, PyStackRef_False)
? PyStackRef_True : PyStackRef_False;
DEAD(value);
}
Expand Down Expand Up @@ -441,7 +441,7 @@ dummy_func(

inst(TO_BOOL_NONE, (unused/1, unused/2, value -- res)) {
// This one is a bit weird, because we expect *some* failures:
EXIT_IF(!PyStackRef_Is(value, PyStackRef_None));
EXIT_IF(!PyStackRef_IsNone(value));
DEAD(value);
STAT_INC(TO_BOOL, hit);
res = PyStackRef_False;
Expand Down Expand Up @@ -651,9 +651,7 @@ dummy_func(
// specializations, but there is no output.
// At the end we just skip over the STORE_FAST.
op(_BINARY_OP_INPLACE_ADD_UNICODE, (left, right --)) {
#ifndef NDEBUG
PyObject *left_o = PyStackRef_AsPyObjectBorrow(left);
#endif
PyObject *right_o = PyStackRef_AsPyObjectBorrow(right);

int next_oparg;
Expand All @@ -664,7 +662,7 @@ dummy_func(
next_oparg = CURRENT_OPERAND0();
#endif
_PyStackRef *target_local = &GETLOCAL(next_oparg);
DEOPT_IF(!PyStackRef_Is(*target_local, left));
DEOPT_IF(PyStackRef_AsPyObjectBorrow(*target_local) != left_o);
STAT_INC(BINARY_OP, hit);
/* Handle `left = left + right` or `left += right` for str.
*
Expand Down Expand Up @@ -1141,7 +1139,7 @@ dummy_func(
gen_frame->previous = frame;
DISPATCH_INLINED(gen_frame);
}
if (PyStackRef_Is(v, PyStackRef_None) && PyIter_Check(receiver_o)) {
if (PyStackRef_IsNone(v) && PyIter_Check(receiver_o)) {
retval_o = Py_TYPE(receiver_o)->tp_iternext(receiver_o);
}
else {
Expand Down Expand Up @@ -1249,7 +1247,7 @@ dummy_func(
inst(POP_EXCEPT, (exc_value -- )) {
_PyErr_StackItem *exc_info = tstate->exc_info;
Py_XSETREF(exc_info->exc_value,
PyStackRef_Is(exc_value, PyStackRef_None)
PyStackRef_IsNone(exc_value)
? NULL : PyStackRef_AsPyObjectSteal(exc_value));
}

Expand Down Expand Up @@ -2481,13 +2479,7 @@ dummy_func(
}

inst(IS_OP, (left, right -- b)) {
#ifdef Py_GIL_DISABLED
// On free-threaded builds, objects are conditionally immortalized.
// So their bits don't always compare equally.
int res = Py_Is(PyStackRef_AsPyObjectBorrow(left), PyStackRef_AsPyObjectBorrow(right)) ^ oparg;
#else
int res = PyStackRef_Is(left, right) ^ oparg;
#endif
DECREF_INPUTS();
b = res ? PyStackRef_True : PyStackRef_False;
}
Expand Down Expand Up @@ -2693,22 +2685,22 @@ dummy_func(

replaced op(_POP_JUMP_IF_FALSE, (cond -- )) {
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int flag = PyStackRef_IsFalse(cond);
DEAD(cond);
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}

replaced op(_POP_JUMP_IF_TRUE, (cond -- )) {
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int flag = PyStackRef_IsExactly(cond, PyStackRef_True);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we use PyStackRef_IsExactly here (which doesn't mask out the deferred bit) but use PyStackRef_IsFalse (which does mask out the deferred bit) in _POP_JUMP_IF_FALSE above? Is this the rare case where it's safe?

Copy link
Contributor Author

@colesbury colesbury Nov 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our codegen ensures that these ops only see True or False. That's often by adding a TO_BOOL immediately before, which may be folded into COMPARE_OP. The preceding TO_BOOL, including in COMPARE_OP, ensures the canonical representation of PyStackRef_False or PyStackRef_True with the deferred bit set.

However, there are two places in codegen.c that omit the TO_BOOL because they have other reasons to know that the result is exactly a boolean:

cpython/Python/codegen.c

Lines 678 to 682 in 09c240f

ADDOP_I(c, loc, LOAD_FAST, 0);
ADDOP_LOAD_CONST(c, loc, _PyLong_GetOne());
ADDOP_I(c, loc, COMPARE_OP, (Py_NE << 5) | compare_masks[Py_NE]);
NEW_JUMP_TARGET_LABEL(c, body);
ADDOP_JUMP(c, loc, POP_JUMP_IF_FALSE, body);

cpython/Python/codegen.c

Lines 5746 to 5749 in 09c240f

ADDOP(c, LOC(p), GET_LEN);
ADDOP_LOAD_CONST_NEW(c, LOC(p), PyLong_FromSsize_t(size));
ADDOP_COMPARE(c, LOC(p), GtE);
RETURN_IF_ERROR(jump_to_fail_pop(c, LOC(p), pc, POP_JUMP_IF_FALSE));

The COMPARE_OPs here still generate bools, but not always in the canonical representation. So we can either:

  1. Modify COMPARE_OP to ensure the canonical representation like https://github.com/colesbury/cpython/blob/5583ac0c311132e36ef458842e087945898ffdec/Python/bytecodes.c#L2409-L2416
  2. Use PyStackRef_IsFalse (instead of PyStackRef_IsExactly) in the JUMP_IF_FALSE
  3. Modify the codegen by inserting TO_BOOL in those two spots.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thanks for the explanation. Since using PyStackRef_IsExactly safely is sensitive to code generation changes, I might suggest using it only when we're sure it actually matters for performance, and defaulting to the variants that mask out the deferred bits everywhere else, since those are always safe. I'd guess that this wouldn't affect the performance improvement of this change much, since it should come from avoiding the tagging in _PyStackRef_FromPyObjectSteal. I don't feel super strongly though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll switch to using PyStackRef_IsFalse and PyStackRef_IsTrue.

I'm no longer convinced that PyStackRef_IsExactly is actually a performance win (and I didn't see it in measurements). I think we have issues with code generation quality that we'll need to address later. Things like POP_JUMP_IF_NONE are composed of _IS_NONE and _POP_JUMP_IF_TRUE and we pack the intermediate result in a tagged _PyStackRef. Clang does a pretty good job of optimizing through it. GCC less so: https://gcc.godbolt.org/z/Ejs8c78qd.

DEAD(cond);
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}

op(_IS_NONE, (value -- b)) {
if (PyStackRef_Is(value, PyStackRef_None)) {
if (PyStackRef_IsNone(value)) {
b = PyStackRef_True;
DEAD(value);
}
Expand Down Expand Up @@ -3752,7 +3744,7 @@ dummy_func(

inst(EXIT_INIT_CHECK, (should_be_none -- )) {
assert(STACK_LEVEL() == 2);
if (!PyStackRef_Is(should_be_none, PyStackRef_None)) {
if (!PyStackRef_IsNone(should_be_none)) {
PyErr_Format(PyExc_TypeError,
"__init__() should return None, not '%.200s'",
Py_TYPE(PyStackRef_AsPyObjectBorrow(should_be_none))->tp_name);
Expand Down Expand Up @@ -4712,7 +4704,7 @@ dummy_func(
inst(INSTRUMENTED_POP_JUMP_IF_TRUE, (unused/1 -- )) {
_PyStackRef cond = POP();
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int flag = PyStackRef_IsExactly(cond, PyStackRef_True);
int offset = flag * oparg;
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
Expand All @@ -4721,15 +4713,15 @@ dummy_func(
inst(INSTRUMENTED_POP_JUMP_IF_FALSE, (unused/1 -- )) {
_PyStackRef cond = POP();
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int flag = PyStackRef_IsFalse(cond);
int offset = flag * oparg;
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}

inst(INSTRUMENTED_POP_JUMP_IF_NONE, (unused/1 -- )) {
_PyStackRef value_stackref = POP();
int flag = PyStackRef_Is(value_stackref, PyStackRef_None);
int flag = PyStackRef_IsNone(value_stackref);
int offset;
if (flag) {
offset = oparg;
Expand All @@ -4745,7 +4737,7 @@ dummy_func(
inst(INSTRUMENTED_POP_JUMP_IF_NOT_NONE, (unused/1 -- )) {
_PyStackRef value_stackref = POP();
int offset;
int nflag = PyStackRef_Is(value_stackref, PyStackRef_None);
int nflag = PyStackRef_IsNone(value_stackref);
if (nflag) {
offset = 0;
}
Expand Down Expand Up @@ -4780,21 +4772,21 @@ dummy_func(
///////// Tier-2 only opcodes /////////

op (_GUARD_IS_TRUE_POP, (flag -- )) {
int is_true = PyStackRef_Is(flag, PyStackRef_True);
int is_true = PyStackRef_IsTrue(flag);
DEAD(flag);
SYNC_SP();
EXIT_IF(!is_true);
}

op (_GUARD_IS_FALSE_POP, (flag -- )) {
int is_false = PyStackRef_Is(flag, PyStackRef_False);
int is_false = PyStackRef_IsFalse(flag);
DEAD(flag);
SYNC_SP();
EXIT_IF(!is_false);
}

op (_GUARD_IS_NONE_POP, (val -- )) {
int is_none = PyStackRef_Is(val, PyStackRef_None);
int is_none = PyStackRef_IsNone(val);
if (!is_none) {
PyStackRef_CLOSE(val);
SYNC_SP();
Expand All @@ -4804,7 +4796,7 @@ dummy_func(
}

op (_GUARD_IS_NOT_NONE_POP, (val -- )) {
int is_none = PyStackRef_Is(val, PyStackRef_None);
int is_none = PyStackRef_IsNone(val);
PyStackRef_CLOSE(val);
SYNC_SP();
EXIT_IF(is_none);
Expand Down
28 changes: 10 additions & 18 deletions Python/executor_cases.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading