diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst new file mode 100644 index 00000000000000..93bc7119d02b8e --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-09-20-02-06.gh-issue-148284.DTBhaX.rst @@ -0,0 +1 @@ +Reduce C stack usage in the Python interpreter on recent versions of Clang. diff --git a/Python/ceval.c b/Python/ceval.c index 377b4644eddd2a..e6e2ee78216a20 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1141,12 +1141,29 @@ typedef struct { _PyStackRef stack[1]; } _PyEntryFrame; -PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE +/* gh-148284: *Do not* mark this function as _Py_HOT_FUNCTION. + * On certain compilers (Clang), this overrides PGO information + * possibly leading to misoptimization and over-inlining. + * On GCC, _Py_HOT_FUNCTION is ignored when PGO is enabled. + */ +PyObject* DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) { + /* +1 because vectorcall might use -1 to write self */ + /* gh-138115: This must not be in individual cases for + non-tail-call interpreters, as it results in excessive + stack usage in some compilers. + This must also be placed before any branches to avoid + interaction with other optimization passes. + */ +#if !Py_TAIL_CALL_INTERP + PyObject *STACKREF_SCRATCH[MAX_STACKREF_SCRATCH+1]; +#endif + _Py_EnsureTstateNotNULL(tstate); CALL_STAT_INC(pyeval_calls); + #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP /* Import the static jump table */ #include "opcode_targets.h" @@ -1168,6 +1185,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int return NULL; } + /* Local "register" variables. * These are cached values from the frame and code object. 
*/ _Py_CODEUNIT *next_instr; diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 4a878d6dff4353..3fc7589dda03b5 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -427,10 +427,16 @@ do { \ /* How much scratch space to give stackref to PyObject* conversion. */ #define MAX_STACKREF_SCRATCH 10 +#if Py_TAIL_CALL_INTERP #define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \ /* +1 because vectorcall might use -1 to write self */ \ PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \ PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1); +#else +#define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \ + PyObject **NAME##_temp = (PyObject **)&STACKREF_SCRATCH; \ + PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1); +#endif #define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \ /* +1 because we +1 previously */ \