From e3cfa51e9782eb78286b4802b73d7372beb951fe Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 10 Apr 2026 03:57:40 +0800 Subject: [PATCH 1/6] Move stackref buffer to per-interpreter. Check for C recursion limit in testcapi --- Modules/_testcapi/vectorcall.c | 10 +++++++++- Python/ceval.c | 7 +++++++ Python/ceval_macros.h | 3 +-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Modules/_testcapi/vectorcall.c b/Modules/_testcapi/vectorcall.c index f89dcb6c4cf03c..60dd4b423fa82c 100644 --- a/Modules/_testcapi/vectorcall.c +++ b/Modules/_testcapi/vectorcall.c @@ -98,7 +98,15 @@ _testcapi_pyobject_vectorcall_impl(PyObject *module, PyObject *func, PyErr_SetString(PyExc_TypeError, "kwnames must be None or a tuple"); return NULL; } - return PyObject_Vectorcall(func, stack, nargs, kwnames); + PyObject *res; + // The CPython interpreter does not guarantee that vectorcalls are + // checked for recursion limit. It's thus up to the C extension themselves to check. + if (Py_EnterRecursiveCall("in _testcapi.pyobject_vectorcall")) { + return NULL; + } + res = PyObject_Vectorcall(func, stack, nargs, kwnames); + Py_LeaveRecursiveCall(); + return res; } static PyObject * diff --git a/Python/ceval.c b/Python/ceval.c index 377b4644eddd2a..e11f75a85790f1 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1168,6 +1168,13 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int return NULL; } + /* +1 because vectorcall might use -1 to write self */ + /* gh-138115: This must not be in individual cases for + non-tail-call interpreters, as it results in excessive + stack usage in some compilers. + */ + PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; + /* Local "register" variables. * These are cached values from the frame and code object. 
*/ _Py_CODEUNIT *next_instr; diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 4a878d6dff4353..e45590c4efdb31 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -428,8 +428,7 @@ do { \ #define MAX_STACKREF_SCRATCH 10 #define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \ - /* +1 because vectorcall might use -1 to write self */ \ - PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \ + PyObject **NAME##_temp = (PyObject **)&STACKREF_SCRATCH; \ PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1); #define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \ From e630cf2f2cf975ffe2f9397cde94b6916058542e Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 10 Apr 2026 03:59:58 +0800 Subject: [PATCH 2/6] fix for TC interpreter --- Python/ceval_macros.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index e45590c4efdb31..3fc7589dda03b5 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -427,9 +427,16 @@ do { \ /* How much scratch space to give stackref to PyObject* conversion. 
*/ #define MAX_STACKREF_SCRATCH 10 +#if Py_TAIL_CALL_INTERP +#define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \ + /* +1 because vectorcall might use -1 to write self */ \ + PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \ + PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1); +#else #define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \ PyObject **NAME##_temp = (PyObject **)&STACKREF_SCRATCH; \ PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1); +#endif #define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \ /* +1 because we +1 previously */ \ From 9da7756b83f53df650636e0e7d869991f1e98865 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 20:02:11 +0000 Subject: [PATCH 3/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst new file mode 100644 index 00000000000000..93bc7119d02b8e --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst @@ -0,0 +1 @@ +Reduce C stack usage in the Python interpreter on recent versions of Clang. 
From d108567665591904577a5913e4f3706d3b1dcd4e Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 10 Apr 2026 05:11:05 +0800 Subject: [PATCH 4/6] Address Sam's review, fix root cause --- Modules/_testcapi/vectorcall.c | 10 +--------- Python/ceval.c | 8 +++++++- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Modules/_testcapi/vectorcall.c b/Modules/_testcapi/vectorcall.c index 60dd4b423fa82c..f89dcb6c4cf03c 100644 --- a/Modules/_testcapi/vectorcall.c +++ b/Modules/_testcapi/vectorcall.c @@ -98,15 +98,7 @@ _testcapi_pyobject_vectorcall_impl(PyObject *module, PyObject *func, PyErr_SetString(PyExc_TypeError, "kwnames must be None or a tuple"); return NULL; } - PyObject *res; - // The CPython interpreter does not guarantee that vectorcalls are - // checked for recursion limit. It's thus up to the C extension themselves to check. - if (Py_EnterRecursiveCall("in _testcapi.pyobject_vectorcall")) { - return NULL; - } - res = PyObject_Vectorcall(func, stack, nargs, kwnames); - Py_LeaveRecursiveCall(); - return res; + return PyObject_Vectorcall(func, stack, nargs, kwnames); } static PyObject * diff --git a/Python/ceval.c b/Python/ceval.c index e11f75a85790f1..5e3d839a2b23c4 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1141,7 +1141,11 @@ typedef struct { _PyStackRef stack[1]; } _PyEntryFrame; -PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE +/* gh-148284: *Do not* mark this function as _Py_HOT_FUNCTION. + * On certain compilers (Clang-22 and above), this overrides PGO information + * leading possibly to mis-optimization and over-inlining. + */ +PyObject* DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) { _Py_EnsureTstateNotNULL(tstate); @@ -1173,7 +1177,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int non-tail-call interpreters, as it results in excessive stack usage in some compilers. 
*/ +#if !Py_TAIL_CALL_INTERP PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; +#endif /* Local "register" variables. * These are cached values from the frame and code object. */ From 71c91ecf0722784f84176940b136fe6a81ca087f Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 10 Apr 2026 07:08:47 +0800 Subject: [PATCH 5/6] Fix for real this time --- Python/ceval.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 5e3d839a2b23c4..710caf141405bd 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1142,8 +1142,9 @@ typedef struct { } _PyEntryFrame; /* gh-148284: *Do not* mark this function as _Py_HOT_FUNCTION. - * On certain compilers (Clang-22 and above), this overrides PGO information + * On certain compilers (Clang), this overrides PGO information * leading possibly to mis-optimization and over-inlining. + * On GCC, _Py_HOT_FUNCTION is ignored when PGO is enabled. */ PyObject* DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) @@ -1151,6 +1152,17 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _Py_EnsureTstateNotNULL(tstate); CALL_STAT_INC(pyeval_calls); + /* +1 because vectorcall might use -1 to write self */ + /* gh-138115: This must not be in individual cases for + non-tail-call interpreters, as it results in excessive + stack usage in some compilers. + This must also be placed before any branches to avoid + interaction with other optimization passes. 
+ */ +#if !Py_TAIL_CALL_INTERP + PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; +#endif + #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP /* Import the static jump table */ #include "opcode_targets.h" @@ -1172,14 +1184,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int return NULL; } - /* +1 because vectorcall might use -1 to write self */ - /* gh-138115: This must not be in individual cases for - non-tail-call interpreters, as it results in excessive - stack usage in some compilers. - */ -#if !Py_TAIL_CALL_INTERP - PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; -#endif /* Local "register" variables. * These are cached values from the frame and code object. */ From 924193a94e9dc8227255d826a2fb936bec2591c0 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 10 Apr 2026 07:13:49 +0800 Subject: [PATCH 6/6] make things more robust --- Python/ceval.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 710caf141405bd..e6e2ee78216a20 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1149,9 +1149,6 @@ typedef struct { PyObject* DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) { - _Py_EnsureTstateNotNULL(tstate); - CALL_STAT_INC(pyeval_calls); - /* +1 because vectorcall might use -1 to write self */ /* gh-138115: This must not be in individual cases for non-tail-call interpreters, as it results in excessive @@ -1163,6 +1160,10 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; #endif + _Py_EnsureTstateNotNULL(tstate); + CALL_STAT_INC(pyeval_calls); + + #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP /* Import the static jump table */ #include "opcode_targets.h"