diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst index 76a1e9f528dc70..bd05e628faaaa1 100644 --- a/Doc/c-api/perfmaps.rst +++ b/Doc/c-api/perfmaps.rst @@ -31,7 +31,7 @@ Note that holding an :term:`attached thread state` is not required for these API or ``-2`` on failure to create a lock. Check ``errno`` for more information about the cause of a failure. -.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name) +.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, size_t code_size, const char *entry_name) Write one single entry to the ``/tmp/perf-$pid.map`` file. This function is thread safe. Here is what an example entry looks like:: diff --git a/Include/cpython/ceval.h b/Include/cpython/ceval.h index bbab8d35b75cb2..5b66fa1040d738 100644 --- a/Include/cpython/ceval.h +++ b/Include/cpython/ceval.h @@ -38,7 +38,7 @@ typedef struct { PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void); PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name); PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void); PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename); diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index f9507fda1606db..fd4221f0816d24 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -94,7 +94,7 @@ typedef struct { void* (*init_state)(void); // Callback to register every trampoline being created void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); // Callback to free the trampoline state int (*free_state)(void* state); } _PyPerf_Callbacks; @@ -108,6 +108,10 @@ extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); #ifdef PY_HAVE_PERF_TRAMPOLINE extern _PyPerf_Callbacks _Py_perfmap_callbacks; extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks; +extern void _PyPerfJit_WriteNamedCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename); #endif static inline PyObject* diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index cccfe3565db6e0..86f018e328656e 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -69,7 +69,7 @@ struct code_arena_st; struct trampoline_api_st { void* (*init_state)(void); void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); int (*free_state)(void* state); void *state; Py_ssize_t code_padding; diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h index b3cadcce8247d0..2f97cc26eaf115 100644 --- a/Include/internal/pycore_jit.h +++ b/Include/internal/pycore_jit.h @@ -23,7 +23,7 @@ typedef _Py_CODEUNIT *(*jit_func)( _PyStackRef _tos_cache0, _PyStackRef _tos_cache1, _PyStackRef _tos_cache2 ); -_Py_CODEUNIT *_PyJIT( +_Py_CODEUNIT *_PyJIT_Entry( _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ); diff --git a/Include/internal/pycore_jit_unwind.h b/Include/internal/pycore_jit_unwind.h new file mode 100644 index 00000000000000..2d325ad9562286 --- /dev/null +++ b/Include/internal/pycore_jit_unwind.h @@ -0,0 +1,68 @@ +#ifndef Py_INTERNAL_JIT_UNWIND_H +#define Py_INTERNAL_JIT_UNWIND_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include +#include + +#if defined(_Py_JIT) && defined(__linux__) && defined(__ELF__) +# define PY_HAVE_JIT_GDB_UNWIND +#endif + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +extern PyMutex _Py_jit_debug_mutex; +#endif + +/* DWARF exception-handling pointer encodings shared by JIT unwind users. */ +enum { + DWRF_EH_PE_absptr = 0x00, + DWRF_EH_PE_omit = 0xff, + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, + DWRF_EH_PE_udata2 = 0x02, + DWRF_EH_PE_udata4 = 0x03, + DWRF_EH_PE_udata8 = 0x04, + DWRF_EH_PE_sleb128 = 0x09, + DWRF_EH_PE_sdata2 = 0x0a, + DWRF_EH_PE_sdata4 = 0x0b, + DWRF_EH_PE_sdata8 = 0x0c, + DWRF_EH_PE_signed = 0x08, + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, + DWRF_EH_PE_textrel = 0x20, + DWRF_EH_PE_datarel = 0x30, + DWRF_EH_PE_funcrel = 0x40, + DWRF_EH_PE_aligned = 0x50, + DWRF_EH_PE_indirect = 0x80 +}; + +/* Return the size of the generated .eh_frame data for the given encoding. */ +size_t _PyJitUnwind_EhFrameSize(int absolute_addr); + +/* + * Build DWARF .eh_frame data for JIT code; returns size written or 0 on error. + * absolute_addr selects the FDE address encoding: + * - 0: PC-relative offsets (perf jitdump synthesized DSO). + * - nonzero: absolute addresses (GDB JIT in-memory ELF). + */ +size_t _PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr); + +void *_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename); + +void _PyJitUnwind_GdbUnregisterCode(void *handle); + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#endif // Py_INTERNAL_JIT_UNWIND_H diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index f356d60ae5c7a7..a0727c045e51ec 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -198,6 +198,7 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; + void *jit_gdb_handle; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Lib/test/test_gdb/gdb_jit_sample.py b/Lib/test/test_gdb/gdb_jit_sample.py new file mode 100644 index 00000000000000..b439e82e8b312f --- /dev/null +++ b/Lib/test/test_gdb/gdb_jit_sample.py @@ -0,0 +1,27 @@ +# Sample script for use by test_gdb.test_jit + +import _testinternalcapi +import operator + + +WARMUP_ITERATIONS = _testinternalcapi.TIER2_THRESHOLD + 10 + + +def jit_bt_hot(depth, warming_up_caller=False): + if depth == 0: + if not warming_up_caller: + id(42) + return + + for iteration in range(WARMUP_ITERATIONS): + operator.call( + jit_bt_hot, + depth - 1, + warming_up_caller or iteration + 1 != WARMUP_ITERATIONS, + ) + + +# Warm the shared shim once without hitting builtin_id so the real run uses +# the steady-state shim path when GDB breaks inside id(42). +jit_bt_hot(1, warming_up_caller=True) +jit_bt_hot(1) diff --git a/Lib/test/test_gdb/test_jit.py b/Lib/test/test_gdb/test_jit.py new file mode 100644 index 00000000000000..ea88d7b0a1fe75 --- /dev/null +++ b/Lib/test/test_gdb/test_jit.py @@ -0,0 +1,201 @@ +import os +import platform +import re +import sys +import unittest + +from .util import setup_module, DebuggerTests + + +JIT_SAMPLE_SCRIPT = os.path.join(os.path.dirname(__file__), "gdb_jit_sample.py") +# In batch GDB, break in builtin_id() while it is running under JIT, +# then repeatedly "finish" until the selected frame is the JIT executor. +# That gives a deterministic backtrace starting with py::jit:executor. +# +# builtin_id() sits only a few helper frames above the JIT entry on this path. +# This bound is just a generous upper limit so the test fails clearly if the +# expected stack shape changes. +MAX_FINISH_STEPS = 20 +# After landing on the JIT entry frame, single-step a bounded number of +# instructions further into the blob so the backtrace is taken from JIT code +# itself rather than the immediate helper-return site. The exact number of +# steps is not significant: each step is cross-checked against the selected +# frame's symbol so the test fails loudly if stepping escapes the registered +# JIT region, instead of asserting against a misleading backtrace. +MAX_JIT_ENTRY_STEPS = 4 +EVAL_FRAME_RE = r"(_PyEval_EvalFrameDefault|_PyEval_Vector)" +JIT_EXECUTOR_FRAME = "py::jit:executor" +JIT_ENTRY_SYMBOL = "_PyJIT_Entry" +BACKTRACE_FRAME_RE = re.compile(r"^#\d+\s+.*$", re.MULTILINE) + +FINISH_TO_JIT_EXECUTOR = ( + "python exec(\"import gdb\\n" + f"target = {JIT_EXECUTOR_FRAME!r}\\n" + f"for _ in range({MAX_FINISH_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is not None and frame.name() == target:\\n" + " break\\n" + " gdb.execute('finish')\\n" + "else:\\n" + " raise RuntimeError('did not reach %s' % target)\\n\")" +) +STEP_INSIDE_JIT_EXECUTOR = ( + "python exec(\"import gdb\\n" + f"target = {JIT_EXECUTOR_FRAME!r}\\n" + f"for _ in range({MAX_JIT_ENTRY_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is None or frame.name() != target:\\n" + " raise RuntimeError('left JIT region during stepping: '\\n" + " + repr(frame and frame.name()))\\n" + " gdb.execute('si')\\n" + "frame = gdb.selected_frame()\\n" + "if frame is None or frame.name() != target:\\n" + " raise RuntimeError('stepped out of JIT region after si')\\n\")" +) + + +def setUpModule(): + setup_module() + + +# The GDB JIT interface registration is gated on __linux__ && __ELF__ in +# Python/jit_unwind.c, and the synthetic EH-frame is only implemented for +# x86_64 and AArch64 (a #error fires otherwise). Skip cleanly on other +# platforms or architectures instead of producing timeouts / empty backtraces. +# is_enabled() implies is_available() and also implies that the runtime has +# JIT execution active; interpreter-only tier 2 builds don't hit this path. +@unittest.skipUnless(sys.platform == "linux", + "GDB JIT interface is only implemented for Linux + ELF") +@unittest.skipUnless(platform.machine() in ("x86_64", "aarch64"), + "GDB JIT CFI emitter only supports x86_64 and AArch64") +@unittest.skipUnless(hasattr(sys, "_jit") and sys._jit.is_enabled(), + "requires a JIT-enabled build with JIT execution active") +class JitBacktraceTests(DebuggerTests): + def get_stack_trace(self, **kwargs): + # These tests validate the JIT-relevant part of the backtrace via + # _assert_jit_backtrace_shape, so an unrelated "?? ()" frame below + # the JIT/eval segment (e.g. libc without debug info) is tolerable. + kwargs.setdefault("skip_on_truncation", False) + return super().get_stack_trace(**kwargs) + + def _extract_backtrace_frames(self, gdb_output): + frames = BACKTRACE_FRAME_RE.findall(gdb_output) + self.assertGreater( + len(frames), 0, + f"expected at least one GDB backtrace frame in output:\n{gdb_output}", + ) + return frames + + def _assert_jit_backtrace_shape(self, gdb_output, *, anchor_at_top): + # Shape assertions applied to every JIT backtrace we produce: + # 1. The synthetic JIT symbol appears exactly once. A second + # py::jit:executor frame would mean the unwinder is + # materializing two native frames for a single logical JIT + # region, or failing to unwind out of the region entirely. + # 2. The unwinder must climb directly back out of the JIT region + # into the eval loop. _PyJIT_Entry only exists to establish the + # physical frame; the synthetic executor FDE collapses it away. + # 3. For tests that assert a specific entry PC, the JIT frame + # is also at #0. + frames = self._extract_backtrace_frames(gdb_output) + backtrace = "\n".join(frames) + + jit_frames = [frame for frame in frames if JIT_EXECUTOR_FRAME in frame] + jit_count = len(jit_frames) + self.assertEqual( + jit_count, 1, + f"expected exactly 1 {JIT_EXECUTOR_FRAME} frame, got {jit_count}\n" + f"backtrace:\n{backtrace}", + ) + eval_frames = [frame for frame in frames if re.search(EVAL_FRAME_RE, frame)] + eval_count = len(eval_frames) + self.assertGreaterEqual( + eval_count, 1, + f"expected at least one _PyEval_* frame, got {eval_count}\n" + f"backtrace:\n{backtrace}", + ) + jit_frame_index = next( + i for i, frame in enumerate(frames) if JIT_EXECUTOR_FRAME in frame + ) + frames_after_jit = frames[jit_frame_index + 1:] + first_eval_offset = next( + ( + i for i, frame in enumerate(frames_after_jit) + if re.search(EVAL_FRAME_RE, frame) + ), + None, + ) + self.assertIsNotNone( + first_eval_offset, + f"expected an eval frame after the JIT frame\n" + f"backtrace:\n{backtrace}", + ) + unexpected_between = frames_after_jit[:first_eval_offset] + self.assertFalse( + unexpected_between, + "expected the executor frame to unwind directly into eval\n" + f"backtrace:\n{backtrace}", + ) + relevant_end = max( + i + for i, frame in enumerate(frames) + if ( + JIT_EXECUTOR_FRAME in frame + or re.search(EVAL_FRAME_RE, frame) + ) + ) + truncated_frames = [ + frame for frame in frames[: relevant_end + 1] + if " ?? ()" in frame + ] + self.assertFalse( + truncated_frames, + "unexpected truncated frame before the validated JIT/eval segment\n" + f"backtrace:\n{backtrace}", + ) + if anchor_at_top: + self.assertRegex( + frames[0], + re.compile(rf"^#0\s+{re.escape(JIT_EXECUTOR_FRAME)}"), + ) + + def test_bt_unwinds_through_jit_frames(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=["bt"], + PYTHON_JIT="1", + ) + # The executor should appear as a named JIT frame and unwind back into + # the eval loop. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False) + + def test_bt_handoff_from_jit_entry_to_executor(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + breakpoint=JIT_ENTRY_SYMBOL, + cmds_after_breakpoint=[ + "delete 1", + "tbreak builtin_id", + "continue", + "bt", + ], + PYTHON_JIT="1", + ) + # If we stop first in the shim and then continue into the real JIT + # workload, the final backtrace should match the architecture's + # executor unwind contract. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False) + + def test_bt_unwinds_from_inside_jit_executor(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=[ + FINISH_TO_JIT_EXECUTOR, + STEP_INSIDE_JIT_EXECUTOR, + "bt", + ], + PYTHON_JIT="1", + ) + # Once the selected PC is inside the JIT executor, we require that GDB + # identifies the JIT frame at #0 and keeps unwinding into _PyEval_*. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True) diff --git a/Lib/test/test_gdb/util.py b/Lib/test/test_gdb/util.py index 8097fd52ababe6..d903adcf2903f3 100644 --- a/Lib/test/test_gdb/util.py +++ b/Lib/test/test_gdb/util.py @@ -20,6 +20,27 @@ PYTHONHASHSEED = '123' +# gh-91960, bpo-40019: gdb reports these when the optimizer has dropped +# python-frame debug info; the test can't read what's not there. +_OPTIMIZED_OUT_PATTERNS = ( + '(frame information optimized out)', + 'Unable to read information on python frame', + '(unable to read python frame information)', +) +# gdb prints this when the unwinder genuinely failed to walk a frame — +# i.e. the CFI (ours or a library's) is wrong. Treat as a hard failure, +# not a skip, so regressions in our own unwind info don't hide. +_UNWIND_FAILURE_PATTERNS = ( + 'Backtrace stopped: frame did not save the PC', +) +# gh-104736: " ?? ()" in the bt usually means the unwinder bailed early, +# but can also be unrelated frames without debug info (e.g. libc). Tests +# that validate the JIT-relevant part of the backtrace themselves can +# opt out via skip_on_truncation=False. +_TRUNCATED_BACKTRACE_PATTERNS = ( + ' ?? ()', +) + def clean_environment(): # Remove PYTHON* environment variables such as PYTHONHOME @@ -160,7 +181,9 @@ def get_stack_trace(self, source=None, script=None, breakpoint=BREAKPOINT_FN, cmds_after_breakpoint=None, import_site=False, - ignore_stderr=False): + ignore_stderr=False, + skip_on_truncation=True, + **env_vars): ''' Run 'python -c SOURCE' under gdb with a breakpoint. @@ -239,7 +262,7 @@ def get_stack_trace(self, source=None, script=None, args += [script] # Use "args" to invoke gdb, capturing stdout, stderr: - out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED) + out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED, **env_vars) if not ignore_stderr: for line in err.splitlines(): @@ -255,26 +278,20 @@ def get_stack_trace(self, source=None, script=None, " because the Program Counter is" " not present") + for pattern in _UNWIND_FAILURE_PATTERNS: + if pattern in out: + raise AssertionError( + f"gdb unwinder failed ({pattern!r}) — CFI bug in our " + f"generated code or in a linked library.\n" + f"Full gdb output:\n{out}" + ) + # bpo-40019: Skip the test if gdb failed to read debug information # because the Python binary is optimized. - for pattern in ( - '(frame information optimized out)', - 'Unable to read information on python frame', - - # gh-91960: On Python built with "clang -Og", gdb gets - # "frame=" for _PyEval_EvalFrameDefault() parameter - '(unable to read python frame information)', - - # gh-104736: On Python built with "clang -Og" on ppc64le, - # "py-bt" displays a truncated or not traceback, but "where" - # logs this error message: - 'Backtrace stopped: frame did not save the PC', - - # gh-104736: When "bt" command displays something like: - # "#1 0x0000000000000000 in ?? ()", the traceback is likely - # truncated or wrong. - ' ?? ()', - ): + patterns = _OPTIMIZED_OUT_PATTERNS + if skip_on_truncation: + patterns = patterns + _TRUNCATED_BACKTRACE_PATTERNS + for pattern in patterns: if pattern in out: raise unittest.SkipTest(f"{pattern!r} found in gdb output") diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py index 647c32656abd6d..ee4eb50033c470 100644 --- a/Lib/test/test_perfmaps.py +++ b/Lib/test/test_perfmaps.py @@ -1,4 +1,5 @@ import os +import sys import sysconfig import unittest @@ -17,6 +18,9 @@ def supports_trampoline_profiling(): raise unittest.SkipTest("perf trampoline profiling not supported") class TestPerfMapWriting(unittest.TestCase): + def tearDown(self): + perf_map_state_teardown() + def test_write_perf_map_entry(self): self.assertEqual(write_perf_map_entry(0x1234, 5678, "entry1"), 0) self.assertEqual(write_perf_map_entry(0x2345, 6789, "entry2"), 0) @@ -24,4 +28,15 @@ def test_write_perf_map_entry(self): perf_file_contents = f.read() self.assertIn("1234 162e entry1", perf_file_contents) self.assertIn("2345 1a85 entry2", perf_file_contents) - perf_map_state_teardown() + + @unittest.skipIf(sys.maxsize <= 2**32, "requires size_t wider than unsigned int") + def test_write_perf_map_entry_large_size(self): + code_addr = 0x3456 + code_size = 1 << 33 + entry_name = "entry_big" + + self.assertEqual(write_perf_map_entry(code_addr, code_size, entry_name), 0) + with open(f"/tmp/perf-{os.getpid()}.map") as f: + perf_file_contents = f.read() + self.assertIn(f"{code_addr:x} {code_size:x} {entry_name}", + perf_file_contents) diff --git a/Makefile.pre.in b/Makefile.pre.in index 0edf55d991a05e..fc44399434fe95 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -514,6 +514,7 @@ PYTHON_OBJS= \ Python/suggestions.o \ Python/perf_trampoline.o \ Python/perf_jit_trampoline.o \ + Python/jit_unwind.o \ Python/remote_debugging.o \ Python/$(DYNLOADFILE) \ $(LIBOBJS) \ @@ -3209,7 +3210,8 @@ Python/emscripten_trampoline_wasm.c: Python/emscripten_trampoline_inner.wasm $(PYTHON_FOR_REGEN) $(srcdir)/Platforms/emscripten/prepare_external_wasm.py $< $@ getWasmTrampolineModule JIT_SHIM_BUILD_OBJS= @JIT_SHIM_BUILD_O@ -JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_SHIM_BUILD_OBJS) +JIT_UNWIND_INFO_H= $(if $(JIT_OBJS),jit_unwind_info.h $(patsubst jit_stencils-%.h,jit_unwind_info-%.h,@JIT_STENCILS_H@)) +JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_UNWIND_INFO_H) $(JIT_SHIM_BUILD_OBJS) JIT_TARGETS= $(JIT_BUILD_TARGETS) $(filter-out $(JIT_SHIM_BUILD_OBJS),$(JIT_OBJS)) JIT_GENERATED_STAMP= .jit-stamp @@ -3237,6 +3239,9 @@ jit_shim-universal2-apple-darwin.o: jit_shim-aarch64-apple-darwin.o jit_shim-x86 Python/jit.o: $(srcdir)/Python/jit.c @JIT_STENCILS_H@ $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< +Python/jit_unwind.o: $(srcdir)/Python/jit_unwind.c $(JIT_UNWIND_INFO_H) + $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< + .PHONY: regen-jit regen-jit: $(JIT_TARGETS) @@ -3362,7 +3367,7 @@ clean-profile: clean-retain-profile clean-bolt # gh-141808: The JIT stencils are deliberately kept in clean-profile .PHONY: clean-jit-stencils clean-jit-stencils: - -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_shim*.o + -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_unwind_info*.h jit_shim*.o .PHONY: clean clean: clean-profile clean-jit-stencils diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst new file mode 100644 index 00000000000000..4d2634d0dd1e81 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst @@ -0,0 +1 @@ +Add support for unwinding JIT frames using GDB. Patch by Diego Russo and Pablo Galindo. diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5319d9c7a4819b..a07675bb66d8cc 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1256,16 +1256,22 @@ write_perf_map_entry(PyObject *self, PyObject *args) { PyObject *code_addr_v; const void *code_addr; - unsigned int code_size; + PyObject *code_size_s; + size_t code_size; const char *entry_name; - if (!PyArg_ParseTuple(args, "OIs", &code_addr_v, &code_size, &entry_name)) + if (!PyArg_ParseTuple(args, "OOs", &code_addr_v, &code_size_s, &entry_name)) return NULL; code_addr = PyLong_AsVoidPtr(code_addr_v); if (code_addr == NULL) { return NULL; } + code_size = PyLong_AsSize_t(code_size_s); + if (code_size == (size_t)-1 && PyErr_Occurred()) { + return NULL; + } + int ret = PyUnstable_WritePerfMapEntry(code_addr, code_size, entry_name); if (ret < 0) { PyErr_SetFromErrno(PyExc_OSError); diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 3bdbf2ef81679e..5bd53c2146a822 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -20,6 +20,7 @@ #include "pycore_fileutils.h" // _Py_closerange() #include "pycore_import.h" // _PyImport_AcquireLock() #include "pycore_initconfig.h" // _PyStatus_EXCEPTION() +#include "pycore_jit_unwind.h" // _Py_jit_debug_mutex #include "pycore_long.h" // _PyLong_IsNegative() #include "pycore_moduleobject.h" // _PyModule_GetState() #include "pycore_object.h" // _PyObject_LookupSpecial() @@ -758,6 +759,13 @@ PyOS_AfterFork_Child(void) goto fatal_error; } +#if defined(PY_HAVE_JIT_GDB_UNWIND) + // The child can inherit this mutex locked if another thread held it at + // fork(), but the child itself cannot be inside gdb_jit_register_code(). + // Reinitialize it before any executor cleanup can unregister JIT code. + _Py_jit_debug_mutex = (PyMutex){0}; +#endif + reset_remotedebug_data(tstate); reset_asyncio_state((_PyThreadStateImpl *)tstate); diff --git a/Python/ceval.c b/Python/ceval.c index 0d7f8a62e246ae..28087ba58d4855 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1305,7 +1305,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } #ifdef _Py_TIER2 #ifdef _Py_JIT -_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT; +_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT_Entry; #else _PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter; #endif diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index a7d63fd3b82211..a4e9980589e4a3 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -168,7 +168,6 @@ #define STOP_TRACING() ((void)(0)); #endif - /* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op */ #ifdef Py_DEBUG #define PRE_DISPATCH_GOTO() if (frame->lltrace >= 5) { \ diff --git a/Python/jit.c b/Python/jit.c index 26e01b25d48c04..8b555105129a9f 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -15,6 +15,7 @@ #include "pycore_interpframe.h" #include "pycore_interpolation.h" #include "pycore_intrinsics.h" +#include "pycore_jit_unwind.h" #include "pycore_lazyimportobject.h" #include "pycore_list.h" #include "pycore_long.h" @@ -60,6 +61,40 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } +/* + * Publish JIT code to optional tooling backends. + * + * The return value is a backend-specific deregistration handle, not a + * success/failure indicator. NULL means there is nothing to unregister later: + * perf does not need a handle, and GDB registration failures are intentionally + * non-fatal because tooling support must not make JIT compilation fail. + */ +static void * +jit_record_code(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + _PyPerf_Callbacks callbacks; + _PyPerfTrampoline_GetCallbacks(&callbacks); + if (callbacks.write_state == _Py_perfmap_jit_callbacks.write_state) { + _PyPerfJit_WriteNamedCode( + code_addr, code_size, entry, filename); + return NULL; + } +#endif + +#if defined(PY_HAVE_JIT_GDB_UNWIND) + return _PyJitUnwind_GdbRegisterCode( + code_addr, code_size, entry, filename); +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + return NULL; +#endif +} + static int address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) { @@ -715,6 +750,10 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz } executor->jit_code = memory; executor->jit_size = total_size; + executor->jit_gdb_handle = jit_record_code(memory, + code_size + state.trampolines.size, + "jit", + "executor"); return 0; } @@ -727,6 +766,12 @@ _PyJIT_Free(_PyExecutorObject *executor) if (memory) { executor->jit_code = NULL; executor->jit_size = 0; +#if defined(PY_HAVE_JIT_GDB_UNWIND) + if (executor->jit_gdb_handle != NULL) { + _PyJitUnwind_GdbUnregisterCode(executor->jit_gdb_handle); + executor->jit_gdb_handle = NULL; + } +#endif if (jit_free(memory, size)) { PyErr_FormatUnraisable("Exception ignored while " "freeing JIT memory"); diff --git a/Python/jit_unwind.c b/Python/jit_unwind.c new file mode 100644 index 00000000000000..09002feafec17c --- /dev/null +++ b/Python/jit_unwind.c @@ -0,0 +1,986 @@ +/* + * Python JIT - DWARF .eh_frame builder + * + * This file contains the DWARF CFI generator used to build .eh_frame + * data for JIT code (perf jitdump and other unwinders). + */ + +#include "Python.h" +#include "pycore_jit_unwind.h" +#include "pycore_lock.h" + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +# include "jit_unwind_info.h" +# if !JIT_UNWIND_INFO_SUPPORTED +# error "JIT unwind info was not generated for this target" +# endif +#endif + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +# include +#endif +#include +#include + +// ============================================================================= +// DWARF CONSTANTS +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ + +/* DWARF Call Frame Information version */ +#define DWRF_CIE_VERSION 1 + +/* DWARF CFA (Call Frame Address) opcodes */ +enum { + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_register = 0xd, // Define CFA register + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80, // Simple offset instruction + DWRF_CFA_restore = 0xc0 // Restore register +}; + +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ +enum { +#ifdef __x86_64__ + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer +#else +# error "Unsupported target architecture" +#endif +}; + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) + uintptr_t code_addr; // Address of the code section + size_t code_size; // Size of the code section +} ELFObjectContext; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer. + * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { + uint8_t* p = ctx->p; + uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ + do { + *p++ = (uint8_t)*str; + } while (*str++); + + ctx->p = p; + return ofs; +} + +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. + * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; v >= 0x80; v >>= 7) { + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (char)v; // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. + */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ + } + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + } + +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +static void elf_init_ehframe_perf(ELFObjectContext* ctx); +#if defined(PY_HAVE_JIT_GDB_UNWIND) +static void elf_init_ehframe_gdb(ELFObjectContext* ctx); +#endif + +static inline void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) { + if (absolute_addr) { +#if defined(PY_HAVE_JIT_GDB_UNWIND) + elf_init_ehframe_gdb(ctx); +#else + Py_UNREACHABLE(); +#endif + } + else { + elf_init_ehframe_perf(ctx); + } +} + +size_t +_PyJitUnwind_EhFrameSize(int absolute_addr) +{ + /* The .eh_frame we emit is small and bounded; keep a generous buffer. */ + uint8_t scratch[512]; + _Static_assert(sizeof(scratch) >= 256, + "scratch buffer may be too small for elf_init_ehframe"); + ELFObjectContext ctx; + ctx.code_size = 1; + ctx.code_addr = 0; + ctx.startp = ctx.p = scratch; + ctx.fde_p = NULL; + /* Generate once into scratch to learn the required size. */ + elf_init_ehframe(&ctx, absolute_addr); + ptrdiff_t size = ctx.p - ctx.startp; + assert(size <= (ptrdiff_t)sizeof(scratch)); + return (size_t)size; +} + +size_t +_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr) +{ + if (buffer == NULL || code_addr == NULL || code_size == 0) { + return 0; + } + /* Generate the frame twice: once to size-check, once to write. */ + size_t required = _PyJitUnwind_EhFrameSize(absolute_addr); + if (required == 0 || required > buffer_size) { + return 0; + } + ELFObjectContext ctx; + ctx.code_size = code_size; + ctx.code_addr = (uintptr_t)code_addr; + ctx.startp = ctx.p = buffer; + ctx.fde_p = NULL; + elf_init_ehframe(&ctx, absolute_addr); + size_t written = (size_t)(ctx.p - ctx.startp); + /* The frame size is independent of code_addr/code_size (fixed-width fields). */ + assert(written == required); + return written; +} + +/* + * Generate a minimal .eh_frame for a single JIT code region. + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * unwinding through JIT-generated code. + * + * The generated data contains: + * 1. A CIE (Common Information Entry) describing the calling convention. + * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame. + * + * Two flavors are emitted, dispatched on the absolute_addr flag: + * + * - absolute_addr == 0 (elf_init_ehframe_perf): PC-relative FDE address + * encoding for perf's synthesized DSO layout. The CIE describes the + * trampoline's entry state and the FDE walks through the prologue and + * epilogue with advance_loc instructions. This matches the pre-existing + * perf_jit_trampoline behavior byte-for-byte. + * + * - absolute_addr == 1 (elf_init_ehframe_gdb): absolute FDE address + * encoding for the GDB JIT in-memory ELF. The CIE describes the + * steady-state frame layout (CFA = %rbp+16 / x29+16, with saved fp and + * return-address column at fixed offsets) and the FDE emits no further + * CFI. The same rule applies at every PC in the registered region, + * which is correct for executor stencils (they pin the frame pointer + * across the region). This is the GDB-side fix; see elf_init_ehframe_gdb + * for details. + */ +static void elf_init_ehframe_perf(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4; + uint8_t* p = ctx->p; + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. Create a trampoline source file (e.g., `trampoline.c`): + * + * #include + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. + * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) +#ifdef __x86_64__ + DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) +#endif + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ +#ifdef __x86_64__ + /* x86_64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) + // No initial register saves in AArch64 CIE +#endif + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + /* + * Emit DWARF EH FDE (Frame Description Entry) + * + * The FDE describes unwinding information specific to this function. + * It references the CIE and provides function-specific CFI instructions. + * + * The PC-relative offset is calculated after the entire EH frame is built + * to ensure accurate positioning relative to the synthesized DSO layout. + */ + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + /* + * In perf jitdump mode the FDE PC field is encoded PC-relative and + * points back to code_start. Record where that field lives so we can + * patch in the final offset after the rest of the synthetic DSO + * layout is known. + */ + ctx->fde_p = p; // Remember where PC offset field is located for later calculation + DWRF_U32(0); // Placeholder for PC-relative offset (calculated below) + DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) + DWRF_U8(0); // Augmentation data length (none) + + /* + * Architecture-specific CFI instructions + * + * These instructions describe how registers are saved and restored + * during function calls. Each architecture has different calling + * conventions and register usage patterns. + */ +#ifdef __x86_64__ + /* x86_64 calling convention unwinding rules */ +# if defined(__CET__) && (__CET__ & 1) + DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) +# endif + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) + DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 + DWRF_UV(16); // New offset: SP + 16 + DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 + DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 + DWRF_UV(DWRF_REG_BP); // Use base pointer register + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 + DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 + DWRF_UV(DWRF_REG_SP); // Use stack pointer register + DWRF_UV(8); // New offset: SP + 8 +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) + DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + DWRF_UV(16); // Stack pointer moved by 16 bytes + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved + DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved + DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // CFA = FP (x29) + 16 + DWRF_UV(DWRF_REG_FP); + DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_def_cfa); // CFA = SP + 0 (stack restored) + DWRF_UV(DWRF_REG_SP); + DWRF_UV(0); + +#else +# error "Unsupported target architecture" +#endif + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->p = p; // Update context pointer to end of generated data + + /* Calculate and update the PC-relative offset in the FDE + * + * When perf processes the jitdump, it creates a synthesized DSO with this layout: + * + * Synthesized DSO Memory Layout: + * ┌─────────────────────────────────────────────────────────────┐ < code_start + * │ Code Section │ + * │ (round_up(code_size, 8) bytes) │ + * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data + * │ EH Frame Data │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ CIE data │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ FDE Header: │ │ + * │ │ - CIE offset (4 bytes) │ │ + * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start + * │ │ - address range (4 bytes) │ │ (this specific field) + * │ │ CFI Instructions... │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * ├─────────────────────────────────────────────────────────────┤ < reference_point + * │ EhFrameHeader │ + * │ (navigation metadata) │ + * └─────────────────────────────────────────────────────────────┘ + * + * The PC offset field in the FDE must contain the distance from itself to code_start: + * + * distance = code_start - fde_pc_field + * + * Where: + * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame + * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) + * + * Therefore: + * distance = code_start_location - fde_pc_field_location + * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) + * = -rounded_code_size - fde_offset_in_frame + * = -(round_up(code_size, 8) + fde_offset_in_frame) + * + * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field. + * + */ + int32_t rounded_code_size = + (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8); + int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep); + *(int32_t *)ctx->fde_p = -(rounded_code_size + fde_offset_in_frame); +} + +/* + * Build .eh_frame data for the GDB JIT interface. + * + * The executor runs inside the frame established by _PyJIT_Entry, but the + * synthetic executor FDE collapses that state into a single logical JIT frame + * that unwinds directly into _PyEval_*. Executor stencils never touch the + * frame pointer - enforced by Tools/jit/_optimizers.py _validate() and + * -mframe-pointer=reserved - so the steady-state rule is valid at every PC + * and the FDE body is empty. Tools/jit/_targets.py derives the initial CFI + * rules from the row active at the executor call in the compiled shim object. + */ +#if defined(PY_HAVE_JIT_GDB_UNWIND) +static void elf_init_ehframe_gdb(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_absptr; + uint8_t* p = ctx->p; + uint8_t* framep = p; + + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID + DWRF_U8(DWRF_CIE_VERSION); + DWRF_STR("zR"); // aug data length + FDE ptr encoding follow + DWRF_UV(JIT_UNWIND_CODE_ALIGNMENT_FACTOR); + DWRF_SV(JIT_UNWIND_DATA_ALIGNMENT_FACTOR); + DWRF_U8(JIT_UNWIND_RA_REG); + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Executor steady-state rule (our invariant, not the compiler's). */ + DWRF_U8(DWRF_CFA_def_cfa); + DWRF_UV(JIT_UNWIND_CFA_REG); + DWRF_UV(JIT_UNWIND_CFA_OFFSET); + DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_FP_REG); + DWRF_UV(JIT_UNWIND_FP_OFFSET); + DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_RA_REG); + DWRF_UV(JIT_UNWIND_RA_OFFSET); + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + DWRF_ADDR(ctx->code_addr); // Absolute code start + DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered + DWRF_U8(0); // Augmentation data length (none) + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + ctx->p = p; +} +#endif + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN = 1, + JIT_UNREGISTER_FN = 2, +}; + +struct jit_code_entry { + struct jit_code_entry *next; + struct jit_code_entry *prev; + const char *symfile_addr; + uint64_t symfile_size; + const void *code_addr; +}; + +struct jit_descriptor { + uint32_t version; + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; + +PyMutex _Py_jit_debug_mutex = {0}; + +Py_EXPORTED_SYMBOL volatile struct jit_descriptor __jit_debug_descriptor = { + 1, JIT_NOACTION, NULL, NULL +}; + +Py_EXPORTED_SYMBOL void __attribute__((noinline)) +__jit_debug_register_code(void) +{ + /* Keep this call visible to debuggers and not optimized away. */ + (void)__jit_debug_descriptor.action_flag; +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static uint16_t +gdb_jit_machine_id(void) +{ + /* Map the current target to ELF e_machine; return 0 to skip registration. */ +#if defined(__x86_64__) || defined(_M_X64) + return EM_X86_64; +#elif defined(__aarch64__) && !defined(__ILP32__) + return EM_AARCH64; +#else + return 0; +#endif +} + +static struct jit_code_entry * +gdb_jit_register_code( + const void *code_addr, + size_t code_size, + const char *symname, + const uint8_t *eh_frame, + size_t eh_frame_size +) +{ + /* + * Build a minimal in-memory ELF for GDB's JIT interface and link it into + * __jit_debug_descriptor so debuggers can resolve JIT code. + */ + if (code_addr == NULL || code_size == 0 || symname == NULL) { + return NULL; + } + + const uint16_t machine = gdb_jit_machine_id(); + if (machine == 0) { + return NULL; + } + + enum { + SH_NULL = 0, + SH_TEXT, + SH_EH_FRAME, + SH_SHSTRTAB, + SH_STRTAB, + SH_SYMTAB, + SH_NUM, + }; + static const char shstrtab[] = + "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab"; + _Static_assert(sizeof(shstrtab) == + 1 + sizeof(".text") + sizeof(".eh_frame") + + sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"), + "shstrtab size mismatch"); + const size_t shstrtab_size = sizeof(shstrtab); + const size_t sh_text = 1; + const size_t sh_eh_frame = sh_text + sizeof(".text"); + const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame"); + const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab"); + const size_t sh_symtab = sh_strtab + sizeof(".strtab"); + const size_t text_size = code_size; + const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8); + const size_t strtab_size = 1 + strlen(symname) + 1; + const size_t symtab_size = 3 * sizeof(Elf64_Sym); + + size_t offset = sizeof(Elf64_Ehdr); + offset = _Py_SIZE_ROUND_UP(offset, 16); + const size_t text_off = offset; + const size_t eh_off = text_off + text_padded; + offset = eh_off + eh_frame_size; + const size_t shstr_off = offset; + offset += shstrtab_size; + const size_t str_off = offset; + offset += strtab_size; + /* Elf64_Sym requires 8-byte alignment for st_value/st_size. */ + offset = _Py_SIZE_ROUND_UP(offset, 8); + const size_t sym_off = offset; + offset += symtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr)); + const size_t sh_off = offset; + + const size_t shnum = SH_NUM; + const size_t total_size = sh_off + shnum * sizeof(Elf64_Shdr); + uint8_t *buf = (uint8_t *)PyMem_RawMalloc(total_size); + if (buf == NULL) { + return NULL; + } + memset(buf, 0, total_size); + + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf; + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE; + ehdr->e_type = ET_DYN; + ehdr->e_machine = machine; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = 0; + ehdr->e_shoff = sh_off; + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_shentsize = sizeof(Elf64_Shdr); + ehdr->e_shnum = shnum; + ehdr->e_shstrndx = SH_SHSTRTAB; + + memcpy(buf + text_off, code_addr, text_size); + memcpy(buf + eh_off, eh_frame, eh_frame_size); + + char *shstr = (char *)(buf + shstr_off); + memcpy(shstr, shstrtab, shstrtab_size); + + char *strtab = (char *)(buf + str_off); + strtab[0] = '\0'; + memcpy(strtab + 1, symname, strlen(symname)); + strtab[strtab_size - 1] = '\0'; + + Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off); + memset(syms, 0, symtab_size); + /* Section symbol for .text (local) */ + syms[1].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION); + syms[1].st_shndx = 1; + /* Function symbol */ + syms[2].st_name = 1; + syms[2].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC); + syms[2].st_other = STV_DEFAULT; + syms[2].st_shndx = 1; + /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. */ + syms[2].st_value = (Elf64_Addr)(uintptr_t)code_addr; + syms[2].st_size = code_size; + + Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off); + memset(shdrs, 0, shnum * sizeof(Elf64_Shdr)); + + shdrs[SH_TEXT].sh_name = sh_text; + shdrs[SH_TEXT].sh_type = SHT_PROGBITS; + shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR; + shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr; + shdrs[SH_TEXT].sh_offset = text_off; + shdrs[SH_TEXT].sh_size = text_size; + shdrs[SH_TEXT].sh_addralign = 16; + + shdrs[SH_EH_FRAME].sh_name = sh_eh_frame; + shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS; + shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC; + shdrs[SH_EH_FRAME].sh_addr = + (Elf64_Addr)((uintptr_t)code_addr + text_padded); + shdrs[SH_EH_FRAME].sh_offset = eh_off; + shdrs[SH_EH_FRAME].sh_size = eh_frame_size; + shdrs[SH_EH_FRAME].sh_addralign = 8; + + shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab; + shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB; + shdrs[SH_SHSTRTAB].sh_offset = shstr_off; + shdrs[SH_SHSTRTAB].sh_size = shstrtab_size; + shdrs[SH_SHSTRTAB].sh_addralign = 1; + + shdrs[SH_STRTAB].sh_name = sh_strtab; + shdrs[SH_STRTAB].sh_type = SHT_STRTAB; + shdrs[SH_STRTAB].sh_offset = str_off; + shdrs[SH_STRTAB].sh_size = strtab_size; + shdrs[SH_STRTAB].sh_addralign = 1; + + shdrs[SH_SYMTAB].sh_name = sh_symtab; + shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB; + shdrs[SH_SYMTAB].sh_offset = sym_off; + shdrs[SH_SYMTAB].sh_size = symtab_size; + shdrs[SH_SYMTAB].sh_link = SH_STRTAB; + shdrs[SH_SYMTAB].sh_info = 2; + shdrs[SH_SYMTAB].sh_addralign = 8; + shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym); + + struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry)); + if (entry == NULL) { + PyMem_RawFree(buf); + return NULL; + } + entry->symfile_addr = (const char *)buf; + entry->symfile_size = total_size; + entry->code_addr = code_addr; + + PyMutex_Lock(&_Py_jit_debug_mutex); + entry->prev = NULL; + entry->next = __jit_debug_descriptor.first_entry; + if (entry->next != NULL) { + entry->next->prev = entry; + } + __jit_debug_descriptor.first_entry = entry; + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + PyMutex_Unlock(&_Py_jit_debug_mutex); + return entry; +} +#endif // defined(PY_HAVE_JIT_GDB_UNWIND) + +void * +_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename) +{ +#if defined(PY_HAVE_JIT_GDB_UNWIND) + /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */ + if (entry == NULL) { + entry = ""; + } + if (filename == NULL) { + filename = ""; + } + size_t name_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; + char *name = (char *)PyMem_RawMalloc(name_size); + if (name == NULL) { + return NULL; + } + snprintf(name, name_size, "py::%s:%s", entry, filename); + + uint8_t buffer[1024]; + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 1); + if (eh_frame_size == 0) { + PyMem_RawFree(name); + return NULL; + } + + void *handle = gdb_jit_register_code(code_addr, code_size, name, + buffer, eh_frame_size); + PyMem_RawFree(name); + return handle; +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + return NULL; +#endif +} + +void +_PyJitUnwind_GdbUnregisterCode(void *handle) +{ +#if defined(PY_HAVE_JIT_GDB_UNWIND) + struct jit_code_entry *entry = (struct jit_code_entry *)handle; + if (entry == NULL) { + return; + } + + PyMutex_Lock(&_Py_jit_debug_mutex); + if (entry->prev != NULL) { + entry->prev->next = entry->next; + } + else { + __jit_debug_descriptor.first_entry = entry->next; + } + if (entry->next != NULL) { + entry->next->prev = entry->prev; + } + + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + + PyMutex_Unlock(&_Py_jit_debug_mutex); + + PyMem_RawFree((void *)entry->symfile_addr); + PyMem_RawFree(entry); +#else + (void)handle; +#endif +} + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) diff --git a/Python/optimizer.c b/Python/optimizer.c index 820a0771a2cb0f..11658fca0da595 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1448,6 +1448,7 @@ allocate_executor(int exit_count, int length) res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; + res->jit_gdb_handle = NULL; return res; } diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 0ba856ea610e59..0c460282feceef 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -62,6 +62,7 @@ #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_mmap.h" // _PyAnnotateMemoryMap() +#include "pycore_jit_unwind.h" #include "pycore_runtime.h" // _PyRuntime #ifdef PY_HAVE_PERF_TRAMPOLINE @@ -73,6 +74,7 @@ #include // File control operations #include // Standard I/O operations #include // Standard library functions +#include // memcpy, strlen #include // Memory mapping functions (mmap) #include // System data types #include // System calls (sysconf, getpid) @@ -246,6 +248,25 @@ typedef struct { */ } CodeUnwindingInfoEvent; +/* + * EH Frame Header structure for DWARF unwinding + * + * This header provides metadata about the .eh_frame data that follows. + * It uses PC-relative and data-relative encodings to keep the synthesized + * DSO self-contained when perf injects it. + */ +typedef struct __attribute__((packed)) { + uint8_t version; + uint8_t eh_frame_ptr_enc; + uint8_t fde_count_enc; + uint8_t table_enc; + int32_t eh_frame_ptr; + uint32_t eh_fde_count; + int32_t from; + int32_t to; +} EhFrameHeader; +_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch"); + // ============================================================================= // GLOBAL STATE MANAGEMENT // ============================================================================= @@ -259,10 +280,11 @@ typedef struct { */ typedef struct { FILE* perf_map; // File handle for the jitdump file - PyThread_type_lock map_lock; // Thread synchronization lock + PyMutex map_lock; // Thread synchronization lock void* mapped_buffer; // Memory-mapped region (signals perf we're active) size_t mapped_size; // Size of the mapped region - int code_id; // Counter for unique code region identifiers + uint32_t code_id; // Counter for unique code region identifiers + uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs } PerfMapJitState; /* Global singleton instance */ @@ -316,40 +338,6 @@ static int64_t get_current_time_microseconds(void) { return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } -// ============================================================================= -// UTILITY FUNCTIONS -// ============================================================================= - -/* - * Round up a value to the next multiple of a given number - * - * This is essential for maintaining proper alignment requirements in the - * jitdump format. Many structures need to be aligned to specific boundaries - * (typically 8 or 16 bytes) for efficient processing by perf. - * - * Args: - * value: The value to round up - * multiple: The multiple to round up to - * - * Returns: The smallest value >= input that is a multiple of 'multiple' - */ -static size_t round_up(int64_t value, int64_t multiple) { - if (multiple == 0) { - return value; // Avoid division by zero - } - - int64_t remainder = value % multiple; - if (remainder == 0) { - return value; // Already aligned - } - - /* Calculate how much to add to reach the next multiple */ - int64_t difference = multiple - remainder; - int64_t rounded_up_value = value + difference; - - return rounded_up_value; -} - // ============================================================================= // FILE I/O UTILITIES // ============================================================================= @@ -406,623 +394,6 @@ static void perf_map_jit_write_header(int pid, FILE* out_file) { perf_map_jit_write_fully(&header, sizeof(header)); } -// ============================================================================= -// DWARF CONSTANTS AND UTILITIES -// ============================================================================= - -/* - * DWARF (Debug With Arbitrary Record Formats) constants - * - * DWARF is a debugging data format used to provide stack unwinding information. - * These constants define the various encoding types and opcodes used in - * DWARF Call Frame Information (CFI) records. - */ - -/* DWARF Call Frame Information version */ -#define DWRF_CIE_VERSION 1 - -/* DWARF CFA (Call Frame Address) opcodes */ -enum { - DWRF_CFA_nop = 0x0, // No operation - DWRF_CFA_offset_extended = 0x5, // Extended offset instruction - DWRF_CFA_def_cfa = 0xc, // Define CFA rule - DWRF_CFA_def_cfa_register = 0xd, // Define CFA register - DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset - DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset - DWRF_CFA_advance_loc = 0x40, // Advance location counter - DWRF_CFA_offset = 0x80, // Simple offset instruction - DWRF_CFA_restore = 0xc0 // Restore register -}; - -/* DWARF Exception Handling pointer encodings */ -enum { - DWRF_EH_PE_absptr = 0x00, // Absolute pointer - DWRF_EH_PE_omit = 0xff, // Omitted value - - /* Data type encodings */ - DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 - DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte - DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte - DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte - DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 - DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte - DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte - DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte - DWRF_EH_PE_signed = 0x08, // Signed flag - - /* Reference type encodings */ - DWRF_EH_PE_pcrel = 0x10, // PC-relative - DWRF_EH_PE_textrel = 0x20, // Text-relative - DWRF_EH_PE_datarel = 0x30, // Data-relative - DWRF_EH_PE_funcrel = 0x40, // Function-relative - DWRF_EH_PE_aligned = 0x50, // Aligned - DWRF_EH_PE_indirect = 0x80 // Indirect -}; - -/* Additional DWARF constants for debug information */ -enum { DWRF_TAG_compile_unit = 0x11 }; -enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; -enum { - DWRF_AT_name = 0x03, // Name attribute - DWRF_AT_stmt_list = 0x10, // Statement list - DWRF_AT_low_pc = 0x11, // Low PC address - DWRF_AT_high_pc = 0x12 // High PC address -}; -enum { - DWRF_FORM_addr = 0x01, // Address form - DWRF_FORM_data4 = 0x06, // 4-byte data - DWRF_FORM_string = 0x08 // String form -}; - -/* Line number program opcodes */ -enum { - DWRF_LNS_extended_op = 0, // Extended opcode - DWRF_LNS_copy = 1, // Copy operation - DWRF_LNS_advance_pc = 2, // Advance program counter - DWRF_LNS_advance_line = 3 // Advance line number -}; - -/* Line number extended opcodes */ -enum { - DWRF_LNE_end_sequence = 1, // End of sequence - DWRF_LNE_set_address = 2 // Set address -}; - -/* - * Architecture-specific DWARF register numbers - * - * These constants define the register numbering scheme used by DWARF - * for each supported architecture. The numbers must match the ABI - * specification for proper stack unwinding. - */ -enum { -#ifdef __x86_64__ - /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ - DWRF_REG_AX, // RAX - DWRF_REG_DX, // RDX - DWRF_REG_CX, // RCX - DWRF_REG_BX, // RBX - DWRF_REG_SI, // RSI - DWRF_REG_DI, // RDI - DWRF_REG_BP, // RBP - DWRF_REG_SP, // RSP - DWRF_REG_8, // R8 - DWRF_REG_9, // R9 - DWRF_REG_10, // R10 - DWRF_REG_11, // R11 - DWRF_REG_12, // R12 - DWRF_REG_13, // R13 - DWRF_REG_14, // R14 - DWRF_REG_15, // R15 - DWRF_REG_RA, // Return address (RIP) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 register numbering */ - DWRF_REG_FP = 29, // Frame Pointer - DWRF_REG_RA = 30, // Link register (return address) - DWRF_REG_SP = 31, // Stack pointer -#else -# error "Unsupported target architecture" -#endif -}; - -/* DWARF encoding constants used in EH frame headers */ -static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data -static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data -static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding -static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding - -// ============================================================================= -// ELF OBJECT CONTEXT -// ============================================================================= - -/* - * Context for building ELF/DWARF structures - * - * This structure maintains state while constructing DWARF unwind information. - * It acts as a simple buffer manager with pointers to track current position - * and important landmarks within the buffer. - */ -typedef struct ELFObjectContext { - uint8_t* p; // Current write position in buffer - uint8_t* startp; // Start of buffer (for offset calculations) - uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) - uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) - uint32_t code_size; // Size of the code being described -} ELFObjectContext; - -/* - * EH Frame Header structure for DWARF unwinding - * - * This structure provides metadata about the DWARF unwinding information - * that follows. It's required by the perf jitdump format to enable proper - * stack unwinding during profiling. - */ -typedef struct { - unsigned char version; // EH frame version (always 1) - unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer - unsigned char fde_count_enc; // Encoding of FDE count - unsigned char table_enc; // Encoding of table entries - int32_t eh_frame_ptr; // Pointer to EH frame data - int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) - int32_t from; // Start address of code range - int32_t to; // End address of code range -} EhFrameHeader; - -// ============================================================================= -// DWARF GENERATION UTILITIES -// ============================================================================= - -/* - * Append a null-terminated string to the ELF context buffer - * - * Args: - * ctx: ELF object context - * str: String to append (must be null-terminated) - * - * Returns: Offset from start of buffer where string was written - */ -static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { - uint8_t* p = ctx->p; - uint32_t ofs = (uint32_t)(p - ctx->startp); - - /* Copy string including null terminator */ - do { - *p++ = (uint8_t)*str; - } while (*str++); - - ctx->p = p; - return ofs; -} - -/* - * Append a SLEB128 (Signed Little Endian Base 128) value - * - * SLEB128 is a variable-length encoding used extensively in DWARF. - * It efficiently encodes small numbers in fewer bytes. - * - * Args: - * ctx: ELF object context - * v: Signed value to encode - */ -static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Append a ULEB128 (Unsigned Little Endian Base 128) value - * - * Similar to SLEB128 but for unsigned values. - * - * Args: - * ctx: ELF object context - * v: Unsigned value to encode - */ -static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (char)v; // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Macros for generating DWARF structures - * - * These macros provide a convenient way to write various data types - * to the DWARF buffer while automatically advancing the pointer. - */ -#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string - -/* Align to specified boundary with NOP instructions */ -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ - } - -/* Write a DWARF section with automatic size calculation */ -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ - } - -// ============================================================================= -// DWARF EH FRAME GENERATION -// ============================================================================= - -static void elf_init_ehframe(ELFObjectContext* ctx); - -/* - * Initialize DWARF .eh_frame section for a code region - * - * The .eh_frame section contains Call Frame Information (CFI) that describes - * how to unwind the stack at any point in the code. This is essential for - * proper profiling as it allows perf to generate accurate call graphs. - * - * The function generates two main components: - * 1. CIE (Common Information Entry) - describes calling conventions - * 2. FDE (Frame Description Entry) - describes specific function unwinding - * - * Args: - * ctx: ELF object context containing code size and buffer pointers - */ -static size_t calculate_eh_frame_size(void) { - /* Calculate the EH frame size for the trampoline function */ - extern void *_Py_trampoline_func_start; - extern void *_Py_trampoline_func_end; - - size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; - - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; - - elf_init_ehframe(&ctx); - return ctx.p - ctx.startp; -} - -static void elf_init_ehframe(ELFObjectContext* ctx) { - uint8_t* p = ctx->p; - uint8_t* framep = p; // Remember start of frame data - - /* - * DWARF Unwind Table for Trampoline Function - * - * This section defines DWARF Call Frame Information (CFI) using encoded macros - * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function - * preserves and restores registers. This is used by profiling tools (e.g., `perf`) - * and debuggers for stack unwinding in JIT-compiled code. - * - * ------------------------------------------------- - * TO REGENERATE THIS TABLE FROM GCC OBJECTS: - * ------------------------------------------------- - * - * 1. Create a trampoline source file (e.g., `trampoline.c`): - * - * #include - * typedef PyObject* (*py_evaluator)(void*, void*, int); - * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { - * return evaluator(ts, f, throwflag); - * } - * - * 2. Compile to an object file with frame pointer preservation: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * 3. Extract DWARF unwind info from the object file: - * - * readelf -w trampoline.o - * - * Example output from `.eh_frame`: - * - * 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * 00000014 FDE cie=00000000 pc=0..14 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. - * - * ---------------------------------- - * HOW TO TRANSLATE TO DWRF_* MACROS: - * ---------------------------------- - * - * After compiling your trampoline with: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * run: - * - * readelf -w trampoline.o - * - * to inspect the generated `.eh_frame` data. You will see two main components: - * - * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. - * 2. An FDE (Frame Description Entry): function-specific unwind instructions. - * - * --------------------- - * Translating the CIE: - * --------------------- - * From `readelf -w`, you might see: - * - * 00000000 0000000000000010 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * Augmentation data: 1b - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * Map this to: - * - * DWRF_SECTION(CIE, - * DWRF_U32(0); // CIE ID (always 0 for CIEs) - * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 - * DWRF_STR("zR"); // Augmentation string "zR" - * DWRF_UV(4); // Code alignment factor = 4 - * DWRF_SV(-8); // Data alignment factor = -8 - * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) - * DWRF_UV(1); // Augmentation data length = 1 - * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers - * - * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa - * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) - * DWRF_UV(0); // Offset = 0 - * - * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary - * ) - * - * Notes: - * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. - * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. - * - * --------------------- - * Translating the FDE: - * --------------------- - * From `readelf -w`: - * - * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * Map the FDE header and instructions to: - * - * DWRF_SECTION(FDE, - * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) - * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) - * DWRF_U32(ctx->code_size); // Code range covered by this FDE - * DWRF_U8(0); // Augmentation data length (none) - * - * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - * DWRF_UV(16); - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) - * DWRF_UV(2); // At offset 2 * 8 = 16 bytes - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) - * DWRF_UV(1); // At offset 1 * 8 = 8 bytes - * - * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 - * - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP - * DWRF_UV(0); - * ) - * - * To regenerate: - * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. - * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as - * the code is in a different address space every time. - * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: - * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) - * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) - * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset - * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) - * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. - * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. - */ - - /* - * Emit DWARF EH CIE (Common Information Entry) - * - * The CIE describes the calling conventions and basic unwinding rules - * that apply to all functions in this compilation unit. - */ - DWRF_SECTION(CIE, - DWRF_U32(0); // CIE ID (0 indicates this is a CIE) - DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) - DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) -#ifdef __x86_64__ - DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) -#endif - DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) - DWRF_U8(DWRF_REG_RA); // Return address register number - DWRF_UV(1); // Augmentation data length - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding - - /* Initial CFI instructions - describe default calling convention */ -#ifdef __x86_64__ - /* x86_64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved - DWRF_UV(1); // At offset 1 from CFA -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) - // No initial register saves in AArch64 CIE -#endif - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->eh_frame_p = p; // Remember start of FDE data - - /* - * Emit DWARF EH FDE (Frame Description Entry) - * - * The FDE describes unwinding information specific to this function. - * It references the CIE and provides function-specific CFI instructions. - * - * The PC-relative offset is calculated after the entire EH frame is built - * to ensure accurate positioning relative to the synthesized DSO layout. - */ - DWRF_SECTION(FDE, - DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) - ctx->fde_p = p; // Remember where PC offset field is located for later calculation - DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) - DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) - DWRF_U8(0); // Augmentation data length (none) - - /* - * Architecture-specific CFI instructions - * - * These instructions describe how registers are saved and restored - * during function calls. Each architecture has different calling - * conventions and register usage patterns. - */ -#ifdef __x86_64__ - /* x86_64 calling convention unwinding rules with frame pointer */ -# if defined(__CET__) && (__CET__ & 1) - DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) -# endif - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) - DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 - DWRF_UV(16); // New offset: SP + 16 - DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 - DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) - DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 - DWRF_UV(DWRF_REG_BP); // Use base pointer register - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 - DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 - DWRF_UV(DWRF_REG_SP); // Use stack pointer register - DWRF_UV(8); // New offset: SP + 8 -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 calling convention unwinding rules */ - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - DWRF_UV(16); // Stack pointer moved by 16 bytes - DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved - DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) - DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved - DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) - DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) - DWRF_UV(0); // Back to original stack position -#else -# error "Unsupported target architecture" -#endif - - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->p = p; // Update context pointer to end of generated data - - /* Calculate and update the PC-relative offset in the FDE - * - * When perf processes the jitdump, it creates a synthesized DSO with this layout: - * - * Synthesized DSO Memory Layout: - * ┌─────────────────────────────────────────────────────────────┐ < code_start - * │ Code Section │ - * │ (round_up(code_size, 8) bytes) │ - * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data - * │ EH Frame Data │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ CIE data │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ FDE Header: │ │ - * │ │ - CIE offset (4 bytes) │ │ - * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start - * │ │ - address range (4 bytes) │ │ (this specific field) - * │ │ CFI Instructions... │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * ├─────────────────────────────────────────────────────────────┤ < reference_point - * │ EhFrameHeader │ - * │ (navigation metadata) │ - * └─────────────────────────────────────────────────────────────┘ - * - * The PC offset field in the FDE must contain the distance from itself to code_start: - * - * distance = code_start - fde_pc_field - * - * Where: - * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame - * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) - * - * Therefore: - * distance = code_start_location - fde_pc_field_location - * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) - * = -rounded_code_size - fde_offset_in_frame - * = -(round_up(code_size, 8) + fde_offset_in_frame) - * - * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field, - * - */ - if (ctx->fde_p != NULL) { - int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); - int32_t rounded_code_size = round_up(ctx->code_size, 8); - int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); - - - // Update the PC-relative offset in the FDE - *(int32_t*)ctx->fde_p = pc_relative_offset; - } -} - // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= @@ -1042,6 +413,12 @@ static void elf_init_ehframe(ELFObjectContext* ctx) { * Returns: Pointer to initialized state, or NULL on failure */ static void* perf_map_jit_init(void) { + PyMutex_Lock(&perf_jit_map_state.map_lock); + if (perf_jit_map_state.perf_map != NULL) { + PyMutex_Unlock(&perf_jit_map_state.map_lock); + return &perf_jit_map_state; + } + char filename[100]; int pid = getpid(); @@ -1051,6 +428,7 @@ static void* perf_map_jit_init(void) { /* Create/open the jitdump file with appropriate permissions */ const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); if (fd == -1) { + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create file } @@ -1058,6 +436,7 @@ static void* perf_map_jit_init(void) { const long page_size = sysconf(_SC_PAGESIZE); if (page_size == -1) { close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to get page size } @@ -1086,6 +465,7 @@ static void* perf_map_jit_init(void) { if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { perf_jit_map_state.mapped_buffer = NULL; close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Memory mapping failed } (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size, @@ -1098,6 +478,7 @@ static void* perf_map_jit_init(void) { perf_jit_map_state.perf_map = fdopen(fd, "w+"); if (perf_jit_map_state.perf_map == NULL) { close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create FILE* } @@ -1113,28 +494,18 @@ static void* perf_map_jit_init(void) { /* Write the jitdump file header */ perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); - /* - * Initialize thread synchronization lock - * - * Multiple threads may attempt to write to the jitdump file - * simultaneously. This lock ensures thread-safe access to the - * global jitdump state. - */ - perf_jit_map_state.map_lock = PyThread_allocate_lock(); - if (perf_jit_map_state.map_lock == NULL) { - fclose(perf_jit_map_state.perf_map); - return NULL; // Failed to create lock - } - /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; + perf_jit_map_state.build_id_salt = + ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); /* Calculate padding size based on actual unwind info requirements */ - size_t eh_frame_size = calculate_eh_frame_size(); + size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - trampoline_api.code_padding = round_up(unwind_data_size, 16); + trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); trampoline_api.code_alignment = 32; + PyMutex_Unlock(&perf_jit_map_state.map_lock); return &perf_jit_map_state; } @@ -1143,54 +514,31 @@ static void* perf_map_jit_init(void) { // ============================================================================= /* - * Write a complete jitdump entry for a Python function - * - * This is the main function called by Python's trampoline system whenever - * a new piece of JIT-compiled code needs to be recorded. It writes both - * the unwinding information and the code load event to the jitdump file. - * - * The function performs these steps: - * 1. Initialize jitdump system if not already done - * 2. Extract function name and filename from Python code object - * 3. Generate DWARF unwinding information - * 4. Write unwinding info event to jitdump file - * 5. Write code load event to jitdump file - * - * Args: - * state: Jitdump state (currently unused, uses global state) - * code_addr: Address where the compiled code resides - * code_size: Size of the compiled code in bytes - * co: Python code object containing metadata + * Write a complete jitdump entry for a code region with a provided name. * - * IMPORTANT: This function signature is part of Python's internal API - * and must not be changed without coordinating with core Python development. + * This shares the same implementation as the trampoline callback, but + * allows callers that don't have a PyCodeObject to reuse the jitdump + * infrastructure. */ -static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) +static void perf_map_jit_write_entry_with_name( + void *state, + const void *code_addr, + size_t code_size, + const char *entry, + const char *filename +) { /* Initialize jitdump system on first use */ - if (perf_jit_map_state.perf_map == NULL) { - void* ret = perf_map_jit_init(); - if(ret == NULL){ - return; // Initialization failed, silently abort - } + void* ret = perf_map_jit_init(); + if (ret == NULL) { + return; // Initialization failed, silently abort } - /* - * Extract function information from Python code object - * - * We create a human-readable function name by combining the qualified - * name (includes class/module context) with the filename. This helps - * developers identify functions in perf reports. - */ - const char *entry = ""; - if (co->co_qualname != NULL) { - entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + entry = ""; } - - const char *filename = ""; - if (co->co_filename != NULL) { - filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + filename = ""; } /* @@ -1218,15 +566,20 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. */ - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written + uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 0); + if (eh_frame_size == 0) { + PyMem_RawFree(perf_map_entry); + return; + } - /* Generate EH frame (Exception Handling frame) data */ - elf_init_ehframe(&ctx); - int eh_frame_size = ctx.p - ctx.startp; + /* + * A logical jitdump entry is written as multiple records and also consumes + * a process-global code_id. Serialize the whole sequence so concurrent JIT + * compilation cannot interleave records or reuse an ID. + */ + PyMutex_Lock(&perf_jit_map_state.map_lock); /* * Write Code Unwinding Information Event @@ -1244,12 +597,12 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ - int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; // 8-byte align - ev2.base.size = content_size + padding_size; + int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); + int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align + ev2.base.size = (uint32_t)(content_size + padding_size); /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); @@ -1263,20 +616,21 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, */ EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte - f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count - f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; + f.fde_count_enc = DWRF_EH_PE_udata4; + f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; /* Calculate relative offsets for EH frame navigation */ - f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); + f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); f.eh_fde_count = 1; // We generate exactly one FDE per function - f.from = -(round_up(code_size, 8) + eh_frame_size); - - int cie_size = ctx.eh_frame_p - ctx.startp; - f.to = -(eh_frame_size - cie_size); + f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); + uint32_t cie_payload_size; + memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); + int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); + f.to = -(int32_t)(eh_frame_size - cie_size); /* Write EH frame data and header */ - perf_map_jit_write_fully(ctx.startp, eh_frame_size); + perf_map_jit_write_fully(buffer, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ @@ -1313,12 +667,86 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator - perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + /* + * Ensure each synthetic DSO has unique .text bytes. + * + * perf merges DSOs that share a build-id. Since trampolines can share + * identical code and unwind bytes, perf may resolve all JIT frames to + * the first symbol it saw (including entries from previous runs when + * build-id caching is enabled). Patch a small marker in the emitted + * bytes to make the build-id depend on a per-process salt and code id + * without modifying the live code. + */ + uint64_t marker = perf_jit_map_state.build_id_salt ^ + ((uint64_t)perf_jit_map_state.code_id << 32) ^ + (uint64_t)code_size; + if (size >= sizeof(marker)) { + size_t prefix = size - sizeof(marker); + perf_map_jit_write_fully((void *)(base), prefix); + perf_map_jit_write_fully(&marker, sizeof(marker)); + } + else if (size > 0) { + uint8_t tmp[sizeof(marker)]; + memcpy(tmp, (void *)(base), size); + for (size_t i = 0; i < size; i++) { + tmp[i] ^= (uint8_t)(marker >> (i * 8)); + } + perf_map_jit_write_fully(tmp, size); + } /* Clean up allocated memory */ + PyMutex_Unlock(&perf_jit_map_state.map_lock); PyMem_RawFree(perf_map_entry); } +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ +static void perf_map_jit_write_entry(void *state, const void *code_addr, + size_t code_size, PyCodeObject *co) +{ + const char *entry = ""; + const char *filename = ""; + if (co != NULL) { + if (co->co_qualname != NULL) { + entry = PyUnicode_AsUTF8(co->co_qualname); + } + if (co->co_filename != NULL) { + filename = PyUnicode_AsUTF8(co->co_filename); + } + } + perf_map_jit_write_entry_with_name(state, code_addr, code_size, + entry, filename); +} + +void +_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ + perf_map_jit_write_entry_with_name( + NULL, code_addr, code_size, entry, filename); +} + // ============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= @@ -1346,15 +774,12 @@ static int perf_map_jit_fini(void* state) { * writing to the file when we close it. This prevents corruption * and ensures all data is properly flushed. */ + PyMutex_Lock(&perf_jit_map_state.map_lock); if (perf_jit_map_state.perf_map != NULL) { - PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); fclose(perf_jit_map_state.perf_map); // This also flushes buffers - PyThread_release_lock(perf_jit_map_state.map_lock); - - /* Clean up synchronization primitive */ - PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } + PyMutex_Unlock(&perf_jit_map_state.map_lock); /* * Unmap the memory region diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c index 0d835f3b7f56a9..58c61e64bfc4e9 100644 --- a/Python/perf_trampoline.c +++ b/Python/perf_trampoline.c @@ -243,7 +243,7 @@ perf_trampoline_code_watcher(PyCodeEvent event, PyCodeObject *co) static void perf_map_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + size_t code_size, PyCodeObject *co) { const char *entry = ""; if (co->co_qualname != NULL) { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 1ee0b3bec684f9..c6447d03369a94 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2707,7 +2707,7 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void) { PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name ) { #ifndef MS_WINDOWS @@ -2718,7 +2718,7 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( } } PyThread_acquire_lock(perf_map_state.map_lock, 1); - fprintf(perf_map_state.perf_map, "%" PRIxPTR " %x %s\n", (uintptr_t) code_addr, code_size, entry_name); + fprintf(perf_map_state.perf_map, "%" PRIxPTR " %zx %s\n", (uintptr_t) code_addr, code_size, entry_name); fflush(perf_map_state.perf_map); PyThread_release_lock(perf_map_state.map_lock); #endif diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py index 7197d70bc8bd0c..17547d4d916e9f 100755 --- a/Tools/build/smelly.py +++ b/Tools/build/smelly.py @@ -25,6 +25,8 @@ # "Legacy": some old symbols are prefixed by "PY_". EXCEPTIONS = frozenset({ 'PY_TIMEOUT_MAX', + '__jit_debug_descriptor', + '__jit_debug_register_code', }) IGNORED_EXTENSION = "_ctypes_test" diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index a251a045b91144..a16d5773d5544c 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -324,8 +324,10 @@ def format_tsv_lines(lines): _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (380_000, 13_000), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/jit_unwind.c'): (20_000, 300), _abs('Python/optimizer.c'): (100_000, 5_000), _abs('Python/parking_lot.c'): (40_000, 1000), + _abs('Python/perf_jit_trampoline.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (750_000, 5000), _abs('Python/pystate.c'): (750_000, 5000), _abs('Python/initconfig.c'): (50_000, 500), diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index d2489387f46caa..aa89e312b62482 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -386,6 +386,8 @@ Python/intrinsics.c - _PyIntrinsics_UnaryFunctions - Python/intrinsics.c - _PyIntrinsics_BinaryFunctions - Python/lock.c - TIME_TO_BE_FAIR_NS - Python/opcode_targets.h - opcode_targets - +Python/jit_unwind.c - __jit_debug_descriptor - +Python/jit_unwind.c - _Py_jit_debug_mutex - Python/perf_trampoline.c - _Py_perfmap_callbacks - Python/perf_jit_trampoline.c - _Py_perfmap_jit_callbacks - Python/perf_jit_trampoline.c - perf_jit_map_state - diff --git a/Tools/jit/README.md b/Tools/jit/README.md index 9361f39dcc64f2..2a687bb9e899e7 100644 --- a/Tools/jit/README.md +++ b/Tools/jit/README.md @@ -9,7 +9,7 @@ Python 3.11 or newer is required to build the JIT. The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). -LLVM version 21 is the officially supported version. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-21`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. +LLVM version 21 is the officially supported version. The tools `clang`, `llvm-readobj`, `llvm-objdump`, and `llvm-dwarfdump` need to be installed and discoverable (version suffixes, like `clang-21`, are okay). You can customize the LLVM configuration using environment variables before running configure: @@ -85,7 +85,7 @@ If you're looking for information on how to update the JIT build dependencies, s [^pep-744]: [PEP 744](https://peps.python.org/pep-0744/) -[^why-llvm]: Clang is specifically needed because it's the only C compiler with support for guaranteed tail calls (`musttail`), which are required by CPython's continuation-passing-style approach to JIT compilation. Since LLVM also includes other functionalities we need (namely, object file parsing and disassembly), it's convenient to only support one toolchain at this time. +[^why-llvm]: Clang is specifically needed because it's the only C compiler with support for guaranteed tail calls (`musttail`), which are required by CPython's continuation-passing-style approach to JIT compilation. Since LLVM also includes other functionalities we need (namely, object file parsing, disassembly, and DWARF inspection), it's convenient to only support one toolchain at this time. ### Understanding JIT behavior diff --git a/Tools/jit/_dwarf.py b/Tools/jit/_dwarf.py new file mode 100644 index 00000000000000..5b6b148562e109 --- /dev/null +++ b/Tools/jit/_dwarf.py @@ -0,0 +1,236 @@ +"""Utilities for deriving JIT unwind information from DWARF CFI.""" + +import dataclasses +import pathlib +import re +import typing + +_LLVMRun = typing.Callable[..., typing.Awaitable[str]] + + +@dataclasses.dataclass(frozen=True) +class UnwindInfo: + code_alignment_factor: int + data_alignment_factor: int + return_address_register: int + cfa_register: int + cfa_offset: int + frame_pointer_register: int + frame_pointer_offset: int + return_address_offset: int + + +@dataclasses.dataclass(frozen=True) +class ELFUnwindConfig: + frame_pointer: str + return_address: str + register_numbers: typing.Mapping[str, int] + call_instruction_prefixes: tuple[str, ...] + + def is_call_instruction(self, instruction: str) -> bool: + return instruction.startswith(self.call_instruction_prefixes) + + +@dataclasses.dataclass(frozen=True) +class _UnwindRow: + pc: int + cfa_register: str + cfa_offset: int + saved_registers: dict[str, int] + + +class ELFUnwindInfo: + def __init__( + self, + target_name: str, + *, + config: ELFUnwindConfig, + verbose: bool = False, + llvm_version: str, + llvm_tools_install_dir: str | None = None, + llvm_run: _LLVMRun, + ) -> None: + self.target_name = target_name + self.config = config + self.verbose = verbose + self.llvm_version = llvm_version + self.llvm_tools_install_dir = llvm_tools_install_dir + self.llvm_run = llvm_run + + @staticmethod + def _parse_dwarfdump_int( + dump: str, field: str, *, required: bool = True + ) -> int | None: + match = re.search(rf"^\s*{field}:\s+(-?\d+)$", dump, re.MULTILINE) + if match is None: + if required: + raise ValueError(f"missing {field} in llvm-dwarfdump output") + return None + return int(match.group(1)) + + @staticmethod + def _parse_dwarfdump_rows(dump: str) -> list[_UnwindRow]: + row_pattern = re.compile( + r"^\s*0x(?P[0-9a-f]+):\s+" + r"CFA=(?P[A-Z][A-Z0-9]*)" + r"(?P[+-]\d+)?" + r"(?::\s*(?P.*))?$" + ) + saved_pattern = re.compile( + r"(?P[A-Z][A-Z0-9]*)=\[CFA(?P[+-]\d+)?\]" + ) + rows = [] + for line in dump.splitlines(): + row_match = row_pattern.match(line) + if row_match is None: + continue + saved_registers = {} + saved = row_match["saved"] + if saved: + for saved_match in saved_pattern.finditer(saved): + offset = saved_match["offset"] + saved_registers[saved_match["register"]] = ( + int(offset) if offset is not None else 0 + ) + cfa_offset = row_match["cfa_offset"] + rows.append( + _UnwindRow( + pc=int(row_match["pc"], 16), + cfa_register=row_match["cfa_register"], + cfa_offset=int(cfa_offset) if cfa_offset is not None else 0, + saved_registers=saved_registers, + ) + ) + if not rows: + raise ValueError("missing interpreted CFI rows in llvm-dwarfdump output") + return rows + + @staticmethod + def _parse_objdump_instructions(dump: str) -> list[tuple[int, str]]: + instructions = [] + for line in dump.splitlines(): + match = re.match( + r"^\s*(?P[0-9a-f]+):\s+" + r"(?:(?:[0-9a-f]{2}|[0-9a-f]{8})\s+)+" + r"(?P.+)$", + line, + ) + if match: + instructions.append( + ( + int(match["pc"], 16), + re.sub(r"\s+", " ", match["instruction"].strip()), + ) + ) + if not instructions: + raise ValueError("missing instructions in llvm-objdump output") + return instructions + + def _reg_number(self, register: str) -> int: + try: + return self.config.register_numbers[register] + except KeyError as exc: + raise ValueError( + f"unsupported register {register!r} in llvm-dwarfdump output" + ) from exc + + @staticmethod + def _encoded_cfa_offset(byte_offset: int, data_alignment_factor: int) -> int: + if data_alignment_factor == 0: + raise ValueError("DWARF data alignment factor must not be zero") + if byte_offset % data_alignment_factor: + raise ValueError( + f"offset {byte_offset} is not a multiple of " + f"data alignment factor {data_alignment_factor}" + ) + return byte_offset // data_alignment_factor + + async def _read_objdump(self, output: pathlib.Path) -> str: + return await self.llvm_run( + "llvm-objdump", + ["-d", f"{output}"], + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + + async def _read_eh_frame(self, output: pathlib.Path) -> str: + return await self.llvm_run( + "llvm-dwarfdump", + ["--eh-frame", f"{output}"], + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + + def _executor_call_pc(self, disassembly: str) -> int: + calls = [ + pc + for pc, instruction in self._parse_objdump_instructions(disassembly) + if self.config.is_call_instruction(instruction) + ] + if len(calls) != 1: + raise ValueError( + f"{self.target_name} JIT shim should contain exactly one executor call" + ) + call_pc = calls[0] + return call_pc + + def _active_row(self, eh_frame: str, call_pc: int) -> _UnwindRow: + rows = self._parse_dwarfdump_rows(eh_frame) + active_rows = [row for row in rows if row.pc <= call_pc] + if not active_rows: + raise ValueError( + f"{self.target_name} JIT shim has no CFI row for executor call " + f"at 0x{call_pc:x}" + ) + return max(active_rows, key=lambda row: row.pc) + + def _check_saved_registers(self, row: _UnwindRow) -> None: + if ( + self.config.frame_pointer not in row.saved_registers + or self.config.return_address not in row.saved_registers + ): + raise ValueError( + f"{self.target_name} JIT shim CFI row at 0x{row.pc:x} " + f"does not save {self.config.frame_pointer} and " + f"{self.config.return_address}" + ) + + def _build_unwind_info(self, eh_frame: str, active_row: _UnwindRow) -> UnwindInfo: + code_alignment_factor = self._parse_dwarfdump_int( + eh_frame, "Code alignment factor" + ) + data_alignment_factor = self._parse_dwarfdump_int( + eh_frame, "Data alignment factor" + ) + return_address_register = self._parse_dwarfdump_int( + eh_frame, "Return address column" + ) + assert code_alignment_factor is not None + assert data_alignment_factor is not None + assert return_address_register is not None + return UnwindInfo( + code_alignment_factor=code_alignment_factor, + data_alignment_factor=data_alignment_factor, + return_address_register=return_address_register, + cfa_register=self._reg_number(active_row.cfa_register), + cfa_offset=active_row.cfa_offset, + frame_pointer_register=self._reg_number(self.config.frame_pointer), + frame_pointer_offset=self._encoded_cfa_offset( + active_row.saved_registers[self.config.frame_pointer], + data_alignment_factor, + ), + return_address_offset=self._encoded_cfa_offset( + active_row.saved_registers[self.config.return_address], + data_alignment_factor, + ), + ) + + async def extract(self, output: pathlib.Path) -> UnwindInfo: + disassembly = await self._read_objdump(output) + call_pc = self._executor_call_pc(disassembly) + eh_frame = await self._read_eh_frame(output) + active_row = self._active_row(eh_frame, call_pc) + self._check_saved_registers(active_row) + return self._build_unwind_info(eh_frame, active_row) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 15cac3de3fe11f..ceee383ea680d7 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -12,6 +12,7 @@ import typing import shlex +import _dwarf import _llvm import _optimizers import _schema @@ -37,6 +38,27 @@ ) +_ELF_UNWIND_AARCH64 = _dwarf.ELFUnwindConfig( + frame_pointer="W29", + return_address="W30", + register_numbers={ + "W29": 29, + "W30": 30, + }, + call_instruction_prefixes=("blr ",), +) + +_ELF_UNWIND_X86_64 = _dwarf.ELFUnwindConfig( + frame_pointer="RBP", + return_address="RIP", + register_numbers={ + "RBP": 6, + "RIP": 16, + }, + call_instruction_prefixes=("callq ", "call "), +) + + @dataclasses.dataclass class _Target(typing.Generic[_S, _R]): triple: str @@ -52,6 +74,7 @@ class _Target(typing.Generic[_S, _R]): verbose: bool = False cflags: str = "" frame_pointers: bool = False + unwind: _dwarf.ELFUnwindConfig | None = None llvm_version: str = _llvm._LLVM_VERSION llvm_tools_install_dir: str | None = None known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) @@ -88,6 +111,32 @@ def _compute_digest(self) -> str: hasher.update(pathlib.Path(dirpath, filename).read_bytes()) return hasher.hexdigest() + def _write_generated_header( + self, + output: pathlib.Path, + *, + digest: str, + comment: str, + lines: typing.Iterable[str], + ) -> None: + output_new = output.with_name(f"{output.name}.new") + try: + with output_new.open("w") as file: + file.write(digest) + if comment: + file.write(f"// {comment}\n") + file.write("\n") + for line in lines: + file.write(f"{line}\n") + try: + output_new.replace(output) + except FileNotFoundError: + # another process probably already moved the file + if not output.is_file(): + raise + finally: + output_new.unlink(missing_ok=True) + async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: group = _stencils.StencilGroup() args = ["--disassemble", "--reloc", f"{path}"] @@ -234,7 +283,7 @@ async def _build_shim_object(self, output: pathlib.Path) -> None: args_o += self._shim_compile_args() args_o += [ "-c", - # The linked shim is a real function in the final binary, so + # The shim is a real function in the final binary, so # keep unwind info for debuggers and stack walkers. "-fasynchronous-unwind-tables", ] @@ -251,6 +300,11 @@ async def _build_shim_object(self, output: pathlib.Path) -> None: llvm_tools_install_dir=self.llvm_tools_install_dir, ) + async def _get_shim_unwind_info( + self, output: pathlib.Path + ) -> _dwarf.UnwindInfo | None: + return None + async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text() cases_and_opnames = sorted( @@ -289,6 +343,7 @@ def build( force: bool = False, jit_stencils: pathlib.Path, jit_shim_object: pathlib.Path, + jit_unwind_info: pathlib.Path, ) -> None: """Build jit_stencils.h and the shim object in the given directory.""" jit_stencils.parent.mkdir(parents=True, exist_ok=True) @@ -300,39 +355,42 @@ def build( outline = "=" * len(warning) print("\n".join(["", outline, warning, request, outline, ""])) digest = f"// {self._compute_digest()}\n" + # The generated headers include the input digest as their first line. + # If every generated artifact is current, skip the expensive rebuild. if ( not force and jit_stencils.exists() and jit_stencils.read_text().startswith(digest) and jit_shim_object.exists() + and jit_unwind_info.exists() + and jit_unwind_info.read_text().startswith(digest) ): return + # Build the shim first so its compiled DWARF CFI can be used to derive + # the unwind rules emitted into jit_unwind_info-.h. ASYNCIO_RUNNER.run(self._build_shim_object(jit_shim_object)) + unwind_info = ASYNCIO_RUNNER.run(self._get_shim_unwind_info(jit_shim_object)) + self._write_generated_header( + jit_unwind_info, + digest=digest, + comment=comment, + lines=_writer.dump_unwind_info(unwind_info), + ) + # Build the uop stencils after the shim metadata has been emitted. stencil_groups = ASYNCIO_RUNNER.run(self._build_stencils()) - jit_stencils_new = jit_stencils.parent / "jit_stencils.h.new" - try: - with jit_stencils_new.open("w") as file: - file.write(digest) - if comment: - file.write(f"// {comment}\n") - file.write("\n") - for line in _writer.dump(stencil_groups, self.known_symbols): - file.write(f"{line}\n") - try: - jit_stencils_new.replace(jit_stencils) - except FileNotFoundError: - # another process probably already moved the file - if not jit_stencils.is_file(): - raise - finally: - jit_stencils_new.unlink(missing_ok=True) + self._write_generated_header( + jit_stencils, + digest=digest, + comment=comment, + lines=_writer.dump(stencil_groups, self.known_symbols), + ) class _COFF( _Target[_schema.COFFSection, _schema.COFFRelocation] ): # pylint: disable = too-few-public-methods def _shim_compile_args(self) -> list[str]: - # The linked shim is part of pythoncore, not a shared extension. + # The shim is part of pythoncore, not a shared extension. # On Windows, Py_BUILD_CORE_MODULE makes public APIs import from # pythonXY.lib, which creates a self-dependency when linking # pythoncore.dll. Build the shim with builtin/core semantics. @@ -450,6 +508,19 @@ class _ELF( symbol_prefix = "" re_global = re.compile(r'\s*\.globl\s+(?P