diff --git a/Doc/library/profiling.sampling.rst b/Doc/library/profiling.sampling.rst index 1f60e2cb578c4d..9026b443a0e3a0 100644 --- a/Doc/library/profiling.sampling.rst +++ b/Doc/library/profiling.sampling.rst @@ -200,6 +200,36 @@ On most systems, attaching to another process requires appropriate permissions. See :ref:`profiling-permissions` for platform-specific requirements. +.. _replay-command: + +The ``replay`` command +---------------------- + +The ``replay`` command converts binary profile files to other output formats:: + + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + +This command is useful when you have captured profiling data in binary format +and want to analyze it later or convert it to a visualization format. Binary +profiles can be replayed multiple times to different formats without +re-profiling. + +:: + + # Convert binary to pstats (default, prints to stdout) + python -m profiling.sampling replay profile.bin + + # Convert binary to flame graph + python -m profiling.sampling replay --flamegraph -o output.html profile.bin + + # Convert binary to gecko format for Firefox Profiler + python -m profiling.sampling replay --gecko -o profile.json profile.bin + + # Convert binary to heatmap + python -m profiling.sampling replay --heatmap -o my_heatmap profile.bin + + Profiling in production ----------------------- @@ -1041,6 +1071,57 @@ intuitive view that shows exactly where time is spent without requiring interpretation of hierarchical visualizations. +Binary format +------------- + +Binary format (:option:`--binary`) produces a compact binary file for efficient +storage of profiling data:: + + python -m profiling.sampling run --binary -o profile.bin script.py + python -m profiling.sampling attach --binary -o profile.bin 12345 + +The :option:`--compression` option controls data compression: + +- ``auto`` (default): Use zstd compression if available, otherwise no + compression +- ``zstd``: Force zstd compression (requires zstd support) +- ``none``: Disable compression + +:: + + python -m profiling.sampling run --binary --compression=zstd -o profile.bin script.py + +To analyze binary profiles, use the :ref:`replay-command` to convert them to +other formats like flame graphs or pstats output. + + +Record and replay workflow +========================== + +The binary format combined with the replay command enables a record-and-replay +workflow that separates data capture from analysis. Rather than generating +visualizations during profiling, you capture raw data to a compact binary file +and convert it to different formats later. + +This approach has three main benefits. First, sampling runs faster because the +work of building data structures for visualization is deferred until replay. +Second, a single binary capture can be converted to multiple output formats +without re-profiling---pstats for a quick overview, flame graph for visual +exploration, heatmap for line-level detail. Third, binary files are compact +and easy to share with colleagues who can convert them to their preferred +format. + +A typical workflow:: + + # Capture profile in production or during tests + python -m profiling.sampling attach --binary -o profile.bin 12345 + + # Later, analyze with different formats + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + python -m profiling.sampling replay --heatmap -o heatmap profile.bin + + Live mode ========= @@ -1252,6 +1333,10 @@ Global options Attach to and profile a running process by PID. +.. option:: replay + + Convert a binary profile file to another output format. + Sampling options ---------------- @@ -1335,12 +1420,22 @@ Output options Generate HTML heatmap with line-level sample counts. +.. option:: --binary + + Generate high-performance binary format for later conversion with the + ``replay`` command. + +.. option:: --compression + + Compression for binary format: ``auto`` (use zstd if available, default), + ``zstd``, or ``none``. + .. option:: -o , --output Output file or directory path. Default behavior varies by format: - ``--pstats`` writes to stdout, ``--flamegraph`` and ``--gecko`` generate - files like ``flamegraph.PID.html``, and ``--heatmap`` creates a directory - named ``heatmap_PID``. + ``--pstats`` writes to stdout, ``--flamegraph``, ``--gecko``, and + ``--binary`` generate files like ``flamegraph.PID.html``, and ``--heatmap`` + creates a directory named ``heatmap_PID``. pstats display options diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 56bc003ac3e246..e625bf2fef1912 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1653,9 +1653,11 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(collector)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compression)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(config)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context)); @@ -1718,7 +1720,9 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_tb)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_val)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception)); @@ -1974,6 +1978,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_callback)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol)); @@ -2014,6 +2019,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rounding)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sample_interval_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(script)); @@ -2053,8 +2059,10 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stack_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start_time_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); @@ -2095,6 +2103,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timespec)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeunit)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 8be948b92ec8f9..771f0f8cb4ad87 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -376,9 +376,11 @@ struct _Py_global_strings { STRUCT_FOR_ID(co_varnames) STRUCT_FOR_ID(code) STRUCT_FOR_ID(col_offset) + STRUCT_FOR_ID(collector) STRUCT_FOR_ID(command) STRUCT_FOR_ID(comment_factory) STRUCT_FOR_ID(compile_mode) + STRUCT_FOR_ID(compression) STRUCT_FOR_ID(config) STRUCT_FOR_ID(consts) STRUCT_FOR_ID(context) @@ -441,7 +443,9 @@ struct _Py_global_strings { STRUCT_FOR_ID(event) STRUCT_FOR_ID(eventmask) STRUCT_FOR_ID(exc) + STRUCT_FOR_ID(exc_tb) STRUCT_FOR_ID(exc_type) + STRUCT_FOR_ID(exc_val) STRUCT_FOR_ID(exc_value) STRUCT_FOR_ID(excepthook) STRUCT_FOR_ID(exception) @@ -697,6 +701,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(print_file_and_line) STRUCT_FOR_ID(priority) STRUCT_FOR_ID(progress) + STRUCT_FOR_ID(progress_callback) STRUCT_FOR_ID(progress_routine) STRUCT_FOR_ID(proto) STRUCT_FOR_ID(protocol) @@ -737,6 +742,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(reversed) STRUCT_FOR_ID(rounding) STRUCT_FOR_ID(salt) + STRUCT_FOR_ID(sample_interval_us) STRUCT_FOR_ID(sched_priority) STRUCT_FOR_ID(scheduler) STRUCT_FOR_ID(script) @@ -776,8 +782,10 @@ struct _Py_global_strings { STRUCT_FOR_ID(spam) STRUCT_FOR_ID(src) STRUCT_FOR_ID(src_dir_fd) + STRUCT_FOR_ID(stack_frames) STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) + STRUCT_FOR_ID(start_time_us) STRUCT_FOR_ID(statement) STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) @@ -818,6 +826,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(times) STRUCT_FOR_ID(timespec) STRUCT_FOR_ID(timestamp) + STRUCT_FOR_ID(timestamp_us) STRUCT_FOR_ID(timetuple) STRUCT_FOR_ID(timeunit) STRUCT_FOR_ID(top) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index d381fb9d2d42a3..499a2569b9a06c 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1651,9 +1651,11 @@ extern "C" { INIT_ID(co_varnames), \ INIT_ID(code), \ INIT_ID(col_offset), \ + INIT_ID(collector), \ INIT_ID(command), \ INIT_ID(comment_factory), \ INIT_ID(compile_mode), \ + INIT_ID(compression), \ INIT_ID(config), \ INIT_ID(consts), \ INIT_ID(context), \ @@ -1716,7 +1718,9 @@ extern "C" { INIT_ID(event), \ INIT_ID(eventmask), \ INIT_ID(exc), \ + INIT_ID(exc_tb), \ INIT_ID(exc_type), \ + INIT_ID(exc_val), \ INIT_ID(exc_value), \ INIT_ID(excepthook), \ INIT_ID(exception), \ @@ -1972,6 +1976,7 @@ extern "C" { INIT_ID(print_file_and_line), \ INIT_ID(priority), \ INIT_ID(progress), \ + INIT_ID(progress_callback), \ INIT_ID(progress_routine), \ INIT_ID(proto), \ INIT_ID(protocol), \ @@ -2012,6 +2017,7 @@ extern "C" { INIT_ID(reversed), \ INIT_ID(rounding), \ INIT_ID(salt), \ + INIT_ID(sample_interval_us), \ INIT_ID(sched_priority), \ INIT_ID(scheduler), \ INIT_ID(script), \ @@ -2051,8 +2057,10 @@ extern "C" { INIT_ID(spam), \ INIT_ID(src), \ INIT_ID(src_dir_fd), \ + INIT_ID(stack_frames), \ INIT_ID(stacklevel), \ INIT_ID(start), \ + INIT_ID(start_time_us), \ INIT_ID(statement), \ INIT_ID(stats), \ INIT_ID(status), \ @@ -2093,6 +2101,7 @@ extern "C" { INIT_ID(times), \ INIT_ID(timespec), \ INIT_ID(timestamp), \ + INIT_ID(timestamp_us), \ INIT_ID(timetuple), \ INIT_ID(timeunit), \ INIT_ID(top), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 24e50828935106..1375f46018f943 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1284,6 +1284,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(collector); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(command); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1296,6 +1300,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(compression); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(config); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1544,10 +1552,18 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_tb); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(exc_type); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_val); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(exc_value); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2568,6 +2584,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress_callback); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(progress_routine); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2728,6 +2748,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sample_interval_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(sched_priority); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2884,6 +2908,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stack_frames); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(stacklevel); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2892,6 +2920,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(start_time_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(statement); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -3052,6 +3084,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(timestamp_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(timetuple); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/InternalDocs/profiling_binary_format.md b/InternalDocs/profiling_binary_format.md new file mode 100644 index 00000000000000..b4ec2b39323d32 --- /dev/null +++ b/InternalDocs/profiling_binary_format.md @@ -0,0 +1,442 @@ +# Profiling Binary Format + +The profiling module includes a binary file format for storing sampling +profiler data. This document describes the format's structure and the +design decisions behind it. + +The implementation is in +[`Modules/_remote_debugging/binary_io.c`](../Modules/_remote_debugging/binary_io.c) +with declarations in +[`Modules/_remote_debugging/binary_io.h`](../Modules/_remote_debugging/binary_io.h). + +## Overview + +The sampling profiler can generate enormous amounts of data. A typical +profiling session sampling at 1000 Hz for 60 seconds produces 60,000 samples. +Each sample contains a full call stack, often 20-50 frames deep, and each +frame includes a filename, function name, and line number. In a text-based +format like collapsed stacks, this would mean repeating the same long file +paths and function names thousands of times. + +The binary format addresses this through two key strategies: + +1. **Deduplication**: Strings and frames are stored once in lookup tables, + then referenced by small integer indices. A 100-character file path that + appears in 50,000 samples is stored once, not 50,000 times. + +2. **Compact encoding**: Variable-length integers (varints) encode small + values in fewer bytes. Since most indices are small (under 128), they + typically need only one byte instead of four. + +Together with optional zstd compression, these techniques reduce file sizes +by 10-50x compared to text formats while also enabling faster I/O. + +## File Layout + +The file consists of five sections: + +``` ++------------------+ Offset 0 +| Header | 64 bytes (fixed) ++------------------+ Offset 64 +| | +| Sample Data | Variable size (optionally compressed) +| | ++------------------+ string_table_offset +| String Table | Variable size ++------------------+ frame_table_offset +| Frame Table | Variable size ++------------------+ file_size - 32 +| Footer | 32 bytes (fixed) ++------------------+ file_size +``` + +The layout is designed for streaming writes during profiling. The profiler +cannot know in advance how many unique strings or frames will be encountered, +so these tables must be built incrementally and written at the end. + +The header comes first so readers can quickly validate the file and locate +the metadata tables. The sample data follows immediately, allowing the writer +to stream samples directly to disk (or through a compression stream) without +buffering the entire dataset in memory. + +The string and frame tables are placed after sample data because they grow +as new unique entries are discovered during profiling. By deferring their +output until finalization, the writer avoids the complexity of reserving +space or rewriting portions of the file. + +The footer at the end contains counts needed to allocate arrays before +parsing the tables. Placing it at a fixed offset from the end (rather than +at a variable offset recorded in the header) means readers can locate it +with a single seek to `file_size - 32`, without first reading the header. + +## Header + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | Magic number (0x54414348 = "TACH") | +| 4 | 4 | uint32 | Format version (currently 2) | +| 8 | 8 | uint64 | Start timestamp (microseconds) | +| 16 | 8 | uint64 | Sample interval (microseconds) | +| 24 | 4 | uint32 | Total sample count | +| 28 | 4 | uint32 | Thread count | +| 32 | 8 | uint64 | String table offset | +| 40 | 8 | uint64 | Frame table offset | +| 48 | 4 | uint32 | Compression type (0=none, 1=zstd) | +| 52 | 12 | bytes | Reserved (zero-filled) | ++--------+------+---------+----------------------------------------+ +``` + +The header is written as zeros initially, then overwritten with actual values +during finalization. This requires the output stream to be seekable, which +is acceptable since the format targets regular files rather than pipes or +network streams. + +## Sample Data + +Sample data begins at offset 64 and extends to `string_table_offset`. Samples +use delta compression to minimize redundancy when consecutive samples from the +same thread have identical or similar call stacks. + +### Stack Encoding Types + +Each sample record begins with thread identification, then an encoding byte: + +| Code | Name | Description | +|------|------|-------------| +| 0x00 | REPEAT | RLE: identical stack repeated N times | +| 0x01 | FULL | Complete stack (first sample or no match) | +| 0x02 | SUFFIX | Shares N frames from bottom of previous stack | +| 0x03 | POP_PUSH | Remove M frames from top, add N new frames | + +### Record Formats + +**REPEAT (0x00) - Run-Length Encoded Identical Stacks:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x00 (REPEAT) | +| count | varint | Number of samples in this RLE group | +| samples | varies | Interleaved: [delta: varint, status: 1]| +| | | repeated count times | ++-----------------+-----------+----------------------------------------+ +``` +The stack is inherited from this thread's previous sample. Each sample in the +group gets its own timestamp delta and status byte, stored as interleaved pairs +(delta1, status1, delta2, status2, ...) rather than separate arrays. + +**FULL (0x01) - Complete Stack:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x01 (FULL) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| stack_depth | varint | Number of frames in call stack | +| frame_indices | varint[] | Array of frame table indices | ++-----------------+-----------+----------------------------------------+ +``` +Used for the first sample from a thread, or when delta encoding would not +provide savings. + +**SUFFIX (0x02) - Shared Suffix Match:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x02 (SUFFIX) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| shared_count | varint | Frames shared from bottom of prev stack| +| new_count | varint | New frames at top of stack | +| new_frames | varint[] | Array of new_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when a function call added frames to the top of the stack. The shared +frames from the previous stack are kept, and new frames are prepended. + +**POP_PUSH (0x03) - Pop and Push:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x03 (POP_PUSH) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| pop_count | varint | Frames to remove from top of prev stack| +| push_count | varint | New frames to add at top | +| new_frames | varint[] | Array of push_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when the code path changed: some frames were popped (function returns) +and new frames were pushed (different function calls). + +### Thread and Interpreter Identification + +Thread IDs are 64-bit values that can be large (memory addresses on some +platforms) and vary unpredictably. Using a fixed 8-byte encoding avoids +the overhead of varint encoding for large values and simplifies parsing +since the reader knows exactly where each field begins. + +The interpreter ID identifies which Python sub-interpreter the thread +belongs to, allowing analysis tools to separate activity across interpreters +in processes using multiple sub-interpreters. + +### Status Byte + +The status byte is a bitfield encoding thread state at sample time: + +| Bit | Flag | Meaning | +|-----|-----------------------|--------------------------------------------| +| 0 | THREAD_STATUS_HAS_GIL | Thread holds the GIL (Global Interpreter Lock) | +| 1 | THREAD_STATUS_ON_CPU | Thread is actively running on a CPU core | +| 2 | THREAD_STATUS_UNKNOWN | Thread state could not be determined | +| 3 | THREAD_STATUS_GIL_REQUESTED | Thread is waiting to acquire the GIL | +| 4 | THREAD_STATUS_HAS_EXCEPTION | Thread has a pending exception | + +Multiple flags can be set simultaneously (e.g., a thread can hold the GIL +while also running on CPU). Analysis tools use these to filter samples or +visualize thread states over time. + +### Timestamp Delta Encoding + +Timestamps use delta encoding rather than absolute values. Absolute +timestamps in microseconds require 8 bytes each, but consecutive samples +from the same thread are typically separated by the sampling interval +(e.g., 1000 microseconds), so the delta between them is small and fits +in 1-2 varint bytes. The writer tracks the previous timestamp for each +thread separately. The first sample from a thread encodes its delta from +the profiling start time; subsequent samples encode the delta from that +thread's previous sample. This per-thread tracking is necessary because +samples are interleaved across threads in arrival order, not grouped by +thread. + +For REPEAT (RLE) records, timestamp deltas and status bytes are stored as +interleaved pairs (delta, status, delta, status, ...) - one pair per +repeated sample - allowing efficient batching while preserving the exact +timing and state of each sample. + +### Frame Indexing + +Each frame in a call stack is represented by an index into the frame table +rather than inline data. This provides massive space savings because call +stacks are highly repetitive: the same function appears in many samples +(hot functions), call stacks often share common prefixes (main -> app -> +handler -> ...), and recursive functions create repeated frame sequences. +A frame index is typically 1-2 varint bytes. Inline frame data would be +20-200+ bytes (two strings plus a line number). For a profile with 100,000 +samples averaging 30 frames each, this reduces frame data from potentially +gigabytes to tens of megabytes. + +Frame indices are written innermost-first (the currently executing frame +has index 0 in the array). This ordering works well with delta compression: +function calls typically add frames at the top (index 0), while shared +frames remain at the bottom. + +## String Table + +The string table stores deduplicated UTF-8 strings (filenames and function +names). It begins at `string_table_offset` and contains entries in order of +their assignment during writing: + +``` ++----------------+ +| length: varint | +| data: bytes | ++----------------+ (repeated for each string) +``` + +Strings are stored in the order they were first encountered during writing. +The first unique filename gets index 0, the second gets index 1, and so on. +Length-prefixing (rather than null-termination) allows strings containing +null bytes and enables readers to allocate exact-sized buffers. The varint +length encoding means short strings (under 128 bytes) need only one length +byte. + +## Frame Table + +The frame table stores deduplicated frame entries: + +``` ++----------------------+ +| filename_idx: varint | +| funcname_idx: varint | +| lineno: svarint | ++----------------------+ (repeated for each frame) +``` + +Each unique (filename, funcname, lineno) combination gets one entry. Two +calls to the same function at different line numbers produce different +frame entries; two calls at the same line number share one entry. + +Strings and frames are deduplicated separately because they have different +cardinalities and reference patterns. A codebase might have hundreds of +unique source files but thousands of unique functions. Many functions share +the same filename, so storing the filename index in each frame entry (rather +than the full string) provides an additional layer of deduplication. A frame +entry is just three varints (typically 3-6 bytes) rather than two full +strings plus a line number. + +Line numbers use signed varint (zigzag encoding) rather than unsigned to +handle edge cases. Synthetic frames—generated frames that don't correspond +directly to Python source code, such as C extension boundaries or internal +interpreter frames—use line number 0 or -1 to indicate the absence of a +source location. Zigzag encoding ensures these small negative values encode +efficiently (−1 becomes 1, which is one byte) rather than requiring the +maximum varint length. + +## Footer + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | String count | +| 4 | 4 | uint32 | Frame count | +| 8 | 8 | uint64 | Total file size | +| 16 | 16 | bytes | Checksum (reserved, currently zeros) | ++--------+------+---------+----------------------------------------+ +``` + +The string and frame counts allow readers to pre-allocate arrays of the +correct size before parsing the tables. Without these counts, readers would +need to either scan the tables twice (once to count, once to parse) or use +dynamically-growing arrays. + +The file size field provides a consistency check: if the actual file size +does not match, the file may be truncated or corrupted. + +The checksum field is reserved for future use. A checksum would allow +detection of corruption but adds complexity and computation cost. The +current implementation leaves this as zeros. + +## Variable-Length Integer Encoding + +The format uses LEB128 (Little Endian Base 128) for unsigned integers and +zigzag + LEB128 for signed integers. These encodings are widely used +(Protocol Buffers, DWARF debug info, WebAssembly) and well-understood. + +### Unsigned Varint (LEB128) + +Each byte stores 7 bits of data. The high bit indicates whether more bytes +follow: + +``` +Value Encoded bytes +0-127 [0xxxxxxx] (1 byte) +128-16383 [1xxxxxxx] [0xxxxxxx] (2 bytes) +16384+ [1xxxxxxx] [1xxxxxxx] ... (3+ bytes) +``` + +Most indices in profiling data are small. A profile with 1000 unique frames +needs at most 2 bytes per frame index. The common case (indices under 128) +needs only 1 byte. + +### Signed Varint (Zigzag) + +Standard LEB128 encodes −1 as a very large unsigned value, requiring many +bytes. Zigzag encoding interleaves positive and negative values: + +``` + 0 -> 0 -1 -> 1 1 -> 2 -2 -> 3 2 -> 4 +``` + +This ensures small-magnitude values (whether positive or negative) encode +in few bytes. + +## Compression + +When compression is enabled, the sample data region contains a zstd stream. +The string table, frame table, and footer remain uncompressed so readers can +access metadata without decompressing the entire file. A tool that only needs +to report "this file contains 50,000 samples of 3 threads" can read the header +and footer without touching the compressed sample data. This also simplifies +the format: the header's offset fields point directly to the tables rather +than to positions within a decompressed stream. + +Zstd provides an excellent balance of compression ratio and speed. Profiling +data compresses very well (often 5-10x) due to repetitive patterns: the same +small set of frame indices appears repeatedly, and delta-encoded timestamps +cluster around the sampling interval. Zstd's streaming API allows compression +without buffering the entire dataset. The writer feeds sample data through +the compressor incrementally, flushing compressed chunks to disk as they +become available. + +Level 5 compression is used as a default. Lower levels (1-3) are faster but +compress less; higher levels (6+) compress more but slow down writing. Level +5 provides good compression with minimal impact on profiling overhead. + +## Reading and Writing + +### Writing + +1. Open the output file and write 64 zero bytes as a placeholder header +2. Initialize empty string and frame dictionaries for deduplication +3. For each sample: + - Intern any new strings, assigning sequential indices + - Intern any new frames, assigning sequential indices + - Encode the sample record and write to the buffer + - Flush the buffer through compression (if enabled) when full +4. Flush remaining buffered data and finalize compression +5. Write the string table (length-prefixed strings in index order) +6. Write the frame table (varint-encoded entries in index order) +7. Write the footer with final counts +8. Seek to offset 0 and write the header with actual values + +The writer maintains two dictionaries: one mapping strings to indices, one +mapping (filename_idx, funcname_idx, lineno) tuples to frame indices. These +enable O(1) lookup during interning. + +### Reading + +1. Read the header and validate magic/version +2. Seek to end − 32 and read the footer +3. Allocate string array of `string_count` elements +4. Parse the string table, populating the array +5. Allocate frame array of `frame_count * 3` uint32 elements +6. Parse the frame table, populating the array +7. If compressed, decompress the sample data region +8. Iterate through samples, resolving indices to strings/frames + +The reader builds lookup arrays rather than dictionaries since it only needs +index-to-value mapping, not value-to-index. + +## Platform Considerations + +On Unix systems (Linux, macOS), the reader uses `mmap()` to map the file +into the process address space. The kernel handles paging data in and out +as needed, no explicit read() calls or buffer management are required, +multiple readers can share the same physical pages, and sequential access +patterns benefit from kernel read-ahead. + +The implementation uses `madvise()` to hint the access pattern to the kernel: +`MADV_SEQUENTIAL` indicates the file will be read linearly, enabling +aggressive read-ahead. `MADV_WILLNEED` requests pre-faulting of pages. +On Linux, `MAP_POPULATE` pre-faults all pages at mmap time rather than on +first access, moving page fault overhead from the parsing loop to the +initial mapping for more predictable performance. For large files (over +32 MB), `MADV_HUGEPAGE` requests transparent huge pages (2 MB instead of +4 KB) to reduce TLB pressure when accessing large amounts of data. + +On Windows, the implementation falls back to standard file I/O with full +file buffering. Profiling data files are typically small enough (tens to +hundreds of megabytes) that this is acceptable. + +The writer uses a 512 KB buffer to batch small writes. Each sample record +is typically tens of bytes; writing these individually would incur excessive +syscall overhead. The buffer accumulates data until full, then flushes in +one write() call (or feeds through the compression stream). + +## Future Considerations + +The format reserves space for future extensions. The 12 reserved bytes in +the header could hold additional metadata. The 16-byte checksum field in +the footer is currently unused. The version field allows incompatible +changes with graceful rejection. New compression types could be added +(compression_type > 1). + +Any changes that alter the meaning of existing fields or the parsing logic +should increment the version number to prevent older readers from +misinterpreting new files. diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py new file mode 100644 index 00000000000000..d8d38f4c078927 --- /dev/null +++ b/Lib/profiling/sampling/binary_collector.py @@ -0,0 +1,121 @@ +"""Thin Python wrapper around C binary writer for profiling data.""" + +import time + +from .collector import Collector + +# Compression type constants (must match binary_io.h) +COMPRESSION_NONE = 0 +COMPRESSION_ZSTD = 1 + + +def _resolve_compression(compression): + """Resolve compression type from string or int. + + Args: + compression: 'auto', 'zstd', 'none', or int (0/1) + + Returns: + int: Compression type constant + """ + if isinstance(compression, int): + return compression + + compression = compression.lower() + if compression == 'none': + return COMPRESSION_NONE + elif compression == 'zstd': + return COMPRESSION_ZSTD + elif compression == 'auto': + # Auto: use zstd if available, otherwise none + import _remote_debugging + if _remote_debugging.zstd_available(): + return COMPRESSION_ZSTD + return COMPRESSION_NONE + else: + raise ValueError(f"Unknown compression type: {compression}") + + +class BinaryCollector(Collector): + """High-performance binary collector using C implementation. + + This collector writes profiling data directly to a binary file format + with optional zstd compression. All I/O is performed in C for maximum + throughput. + + The binary format uses string/frame deduplication and varint encoding + for efficient storage. + """ + + def __init__(self, filename, sample_interval_usec, *, skip_idle=False, + compression='auto'): + """Create a new binary collector. + + Args: + filename: Path to output binary file + sample_interval_usec: Sampling interval in microseconds + skip_idle: If True, skip idle threads (not used in binary format) + compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd) + """ + import _remote_debugging + + self.filename = filename + self.sample_interval_usec = sample_interval_usec + self.skip_idle = skip_idle + + compression_type = _resolve_compression(compression) + start_time_us = int(time.monotonic() * 1_000_000) + self._writer = _remote_debugging.BinaryWriter( + filename, sample_interval_usec, start_time_us, compression=compression_type + ) + + def collect(self, stack_frames, timestamp_us=None): + """Collect profiling data from stack frames. + + This passes stack_frames directly to the C writer which handles + all encoding and buffering. + + Args: + stack_frames: List of InterpreterInfo objects from _remote_debugging + timestamp_us: Optional timestamp in microseconds. If not provided, + uses time.monotonic() to generate one. + """ + if timestamp_us is None: + timestamp_us = int(time.monotonic() * 1_000_000) + self._writer.write_sample(stack_frames, timestamp_us) + + def collect_failed_sample(self): + """Record a failed sample attempt (no-op for binary format).""" + pass + + def export(self, filename=None): + """Finalize and close the binary file. + + Args: + filename: Ignored (binary files are written incrementally) + """ + self._writer.finalize() + + @property + def total_samples(self): + return self._writer.total_samples + + def get_stats(self): + """Get encoding statistics. + + Returns: + Dict with encoding statistics including repeat/full/suffix/pop-push + record counts, frames written/saved, and compression ratio. + """ + return self._writer.get_stats() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - finalize unless there was an error.""" + if exc_type is None: + self._writer.finalize() + else: + self._writer.close() + return False diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py new file mode 100644 index 00000000000000..50c96668cc585b --- /dev/null +++ b/Lib/profiling/sampling/binary_reader.py @@ -0,0 +1,128 @@ +"""Thin Python wrapper around C binary reader for profiling data.""" + + +class BinaryReader: + """High-performance binary reader using C implementation. + + This reader uses memory-mapped I/O (on Unix) for fast replay of + profiling data from binary files. + + Use as a context manager: + with BinaryReader('profile.bin') as reader: + info = reader.get_info() + reader.replay_samples(collector, progress_callback) + """ + + def __init__(self, filename): + """Create a new binary reader. + + Args: + filename: Path to input binary file + """ + self.filename = filename + self._reader = None + + def __enter__(self): + import _remote_debugging + self._reader = _remote_debugging.BinaryReader(self.filename) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._reader is not None: + self._reader.close() + self._reader = None + return False + + def get_info(self): + """Get metadata about the binary file. + + Returns: + dict: File metadata including: + - sample_count: Number of samples in the file + - sample_interval_us: Sampling interval in microseconds + - start_time_us: Start timestamp in microseconds + - string_count: Number of unique strings + - frame_count: Number of unique frames + - compression: Compression type used + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_info() + + def replay_samples(self, collector, progress_callback=None): + """Replay samples from binary file through a collector. + + This allows converting binary profiling data to other formats + (e.g., flamegraph, pstats) by replaying through the appropriate + collector. + + Args: + collector: A Collector instance with a collect() method + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples replayed + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.replay(collector, progress_callback) + + @property + def sample_count(self): + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_info()['sample_count'] + + def get_stats(self): + """Get reconstruction statistics from replay. + + Returns: + dict: Statistics about record types decoded and samples + reconstructed during replay. + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_stats() + + +def convert_binary_to_format(input_file, output_file, output_format, + sample_interval_usec=None, progress_callback=None): + """Convert a binary profiling file to another format. + + Args: + input_file: Path to input binary file + output_file: Path to output file + output_format: Target format ('flamegraph', 'collapsed', 'pstats', etc.) + sample_interval_usec: Override sample interval (uses file's if None) + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples converted + """ + from .gecko_collector import GeckoCollector + from .stack_collector import FlamegraphCollector, CollapsedStackCollector + from .pstats_collector import PStatsCollector + + with BinaryReader(input_file) as reader: + info = reader.get_info() + interval = sample_interval_usec or info['sample_interval_us'] + + # Create appropriate collector based on format + if output_format == 'flamegraph': + collector = FlamegraphCollector(interval) + elif output_format == 'collapsed': + collector = CollapsedStackCollector(interval) + elif output_format == 'pstats': + collector = PStatsCollector(interval) + elif output_format == 'gecko': + collector = GeckoCollector(interval) + else: + raise ValueError(f"Unknown output format: {output_format}") + + # Replay samples through collector + count = reader.replay_samples(collector, progress_callback) + + # Export to target format + collector.export(output_file) + + return count diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index e1ff3758c0d341..266a490476936d 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -15,6 +15,8 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .binary_collector import BinaryCollector +from .binary_reader import BinaryReader, convert_binary_to_format from .constants import ( PROFILING_MODE_ALL, PROFILING_MODE_WALL, @@ -74,6 +76,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", + "binary": "bin", } COLLECTOR_MAP = { @@ -82,6 +85,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "binary": BinaryCollector, } def _setup_child_monitor(args, parent_pid): @@ -179,7 +183,7 @@ def _parse_mode(mode_string): def _check_process_died(process): """Check if process died and raise an error with stderr if available.""" if process.poll() is None: - return # Process still running + return # Process died - try to get stderr for error message stderr_msg = "" @@ -364,7 +368,7 @@ def _add_mode_options(parser): ) -def _add_format_options(parser): +def _add_format_options(parser, include_compression=True): """Add output format options to a parser.""" output_group = parser.add_argument_group("Output options") format_group = output_group.add_mutually_exclusive_group() @@ -403,8 +407,23 @@ def _add_format_options(parser): dest="format", help="Generate interactive HTML heatmap visualization with line-level sample counts", ) + format_group.add_argument( + "--binary", + action="store_const", + const="binary", + dest="format", + help="Generate high-performance binary format (use 'replay' command to convert)", + ) parser.set_defaults(format="pstats") + if include_compression: + output_group.add_argument( + "--compression", + choices=["auto", "zstd", "none"], + default="auto", + help="Compression for binary format: auto (use zstd if available), zstd, none", + ) + output_group.add_argument( "-o", "--output", @@ -459,15 +478,18 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) -def _create_collector(format_type, interval, skip_idle, opcodes=False): +def _create_collector(format_type, interval, skip_idle, opcodes=False, + output_file=None, compression='auto'): """Create the appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary') interval: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + output_file: Output file path (required for binary format) + compression: Compression type for binary format ('auto', 'zstd', 'none') Returns: A collector instance of the appropriate type @@ -476,6 +498,13 @@ def _create_collector(format_type, interval, skip_idle, opcodes=False): if collector_class is None: raise ValueError(f"Unknown format: {format_type}") + # Binary format requires output file and compression + if format_type == "binary": + if output_file is None: + raise ValueError("Binary format requires an output file") + return collector_class(output_file, interval, skip_idle=skip_idle, + compression=compression) + # Gecko format never skips idle (it needs both GIL and CPU data) # and is the only format that uses opcodes for interval markers if format_type == "gecko": @@ -511,7 +540,12 @@ def _handle_output(collector, args, pid, mode): pid: Process ID (for generating filenames) mode: Profiling mode used """ - if args.format == "pstats": + if args.format == "binary": + # Binary format already wrote to file incrementally, just finalize + collector.export(None) + filename = collector.filename + print(f"Binary profile written to {filename} ({collector.total_samples} samples)") + elif args.format == "pstats": if args.outfile: # If outfile is a directory, generate filename inside it if os.path.isdir(args.outfile): @@ -544,6 +578,13 @@ def _validate_args(args, parser): args: Parsed command-line arguments parser: ArgumentParser instance for error reporting """ + # Replay command has minimal validation + if args.command == "replay": + # Can't replay to binary format + if args.format == "binary": + parser.error("Cannot replay to binary format. Use a different output format.") + return + # Check if live mode is available if hasattr(args, 'live') and args.live and LiveStatsCollector is None: parser.error( @@ -556,7 +597,7 @@ def _validate_args(args, parser): parser.error("--subprocesses is incompatible with --live mode.") # Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads - if args.async_aware: + if getattr(args, 'async_aware', False): issues = [] if args.native: issues.append("--native") @@ -573,7 +614,7 @@ def _validate_args(args, parser): ) # --async-mode requires --async-aware - if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware: + if hasattr(args, 'async_mode') and args.async_mode != "running" and not getattr(args, 'async_aware', False): parser.error("--async-mode requires --async-aware to be enabled.") # Live mode is incompatible with format options @@ -601,7 +642,7 @@ def _validate_args(args, parser): return # Validate gecko mode doesn't use non-wall mode - if args.format == "gecko" and args.mode != "wall": + if args.format == "gecko" and getattr(args, 'mode', 'wall') != "wall": parser.error( "--mode option is incompatible with --gecko. " "Gecko format automatically includes both GIL-holding and CPU status analysis." @@ -609,7 +650,7 @@ def _validate_args(args, parser): # Validate --opcodes is only used with compatible formats opcodes_compatible_formats = ("live", "gecko", "flamegraph", "heatmap") - if args.opcodes and args.format not in opcodes_compatible_formats: + if getattr(args, 'opcodes', False) and args.format not in opcodes_compatible_formats: parser.error( f"--opcodes is only compatible with {', '.join('--' + f for f in opcodes_compatible_formats)}." ) @@ -721,6 +762,30 @@ def main(): _add_format_options(attach_parser) _add_pstats_options(attach_parser) + # === REPLAY COMMAND === + replay_parser = subparsers.add_parser( + "replay", + help="Replay a binary profile and convert to another format", + formatter_class=CustomFormatter, + description="""Replay a binary profile file and convert to another format + +Examples: + # Convert binary to flamegraph + `python -m profiling.sampling replay --flamegraph -o output.html profile.bin` + + # Convert binary to pstats and print to stdout + `python -m profiling.sampling replay profile.bin` + + # Convert binary to gecko format + `python -m profiling.sampling replay --gecko -o profile.json profile.bin`""", + ) + replay_parser.add_argument( + "input_file", + help="Binary profile file to replay", + ) + _add_format_options(replay_parser, include_compression=False) + _add_pstats_options(replay_parser) + # Parse arguments args = parser.parse_args() @@ -731,6 +796,7 @@ def main(): command_handlers = { "run": _handle_run, "attach": _handle_attach, + "replay": _handle_replay, } # Execute the appropriate command @@ -760,8 +826,16 @@ def _handle_attach(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + output_file = None + if args.format == "binary": + output_file = args.outfile or _generate_output_filename(args.format, args.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) with _get_child_monitor_context(args, args.pid): collector = sample( @@ -829,8 +903,16 @@ def _handle_run(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + output_file = None + if args.format == "binary": + output_file = args.outfile or _generate_output_filename(args.format, process.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) with _get_child_monitor_context(args, process.pid): try: @@ -949,5 +1031,52 @@ def _handle_live_run(args): process.wait() +def _handle_replay(args): + """Handle the 'replay' command - convert binary profile to another format.""" + import os + + if not os.path.exists(args.input_file): + sys.exit(f"Error: Input file not found: {args.input_file}") + + # Can't replay to binary format + if args.format == "binary": + sys.exit("Error: Cannot replay to binary format. Use a different output format.") + + with BinaryReader(args.input_file) as reader: + info = reader.get_info() + interval = info['sample_interval_us'] + + print(f"Replaying {info['sample_count']} samples from {args.input_file}") + print(f" Sample interval: {interval} us") + print(f" Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}") + + collector = _create_collector(args.format, interval, skip_idle=False) + + def progress_callback(current, total): + if total > 0: + pct = current / total + bar_width = 40 + filled = int(bar_width * pct) + bar = '█' * filled + '░' * (bar_width - filled) + print(f"\r [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True) + + count = reader.replay_samples(collector, progress_callback) + print() + + if args.format == "pstats": + if args.outfile: + collector.export(args.outfile) + else: + sort_choice = args.sort if args.sort is not None else "nsamples" + limit = args.limit if args.limit is not None else 15 + sort_mode = _sort_to_mode(sort_choice) + collector.print_stats(sort_mode, limit, not args.no_summary, PROFILING_MODE_WALL) + else: + filename = args.outfile or _generate_output_filename(args.format, os.getpid()) + collector.export(filename) + + print(f"Replayed {count} samples") + + if __name__ == "__main__": main() diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index a1f6ec190f6556..0b485bbbb4c240 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -44,8 +44,15 @@ def extract_lineno(location): class Collector(ABC): @abstractmethod - def collect(self, stack_frames): - """Collect profiling data from stack frames.""" + def collect(self, stack_frames, timestamp_us=None): + """Collect profiling data from stack frames. + + Args: + stack_frames: List of InterpreterInfo objects + timestamp_us: Optional timestamp in microseconds. If provided (from + binary replay), use this instead of current time. If None, + collectors should use time.monotonic() or similar. + """ def collect_failed_sample(self): """Collect data about a failed sample attempt.""" @@ -79,6 +86,17 @@ def _iter_async_frames(self, awaited_info_list): # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!) yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent) + def _iter_stacks(self, stack_frames, skip_idle=False): + """Yield (frames, thread_id) for all stacks, handling both sync and async modes.""" + if stack_frames and hasattr(stack_frames[0], "awaited_by"): + for frames, thread_id, _ in self._iter_async_frames(stack_frames): + if frames: + yield frames, thread_id + else: + for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): + if frames: + yield frames, thread_id + def _build_task_graph(self, awaited_info_list): task_map = {} child_to_parent = {} # Maps child_id -> (selected_parent_id, parent_count) diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py index 608a15da483729..c1c9cfcf3b93a9 100644 --- a/Lib/profiling/sampling/gecko_collector.py +++ b/Lib/profiling/sampling/gecko_collector.py @@ -66,7 +66,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle self.opcodes_enabled = opcodes - self.start_time = time.time() * 1000 # milliseconds since epoch + self.start_time = time.monotonic() * 1000 # milliseconds since start # Global string table (shared across all threads) self.global_strings = ["(root)"] # Start with root @@ -103,6 +103,9 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False): # Opcode state tracking per thread: tid -> (opcode, lineno, col_offset, funcname, filename, start_time) self.opcode_state = {} + # For binary replay: track base timestamp (first sample's timestamp) + self._replay_base_timestamp_us = None + def _track_state_transition(self, tid, condition, active_dict, inactive_dict, active_name, inactive_name, category, current_time): """Track binary state transitions and emit markers. @@ -138,18 +141,35 @@ def _track_state_transition(self, tid, condition, active_dict, inactive_dict, self._add_marker(tid, active_name, active_dict.pop(tid), current_time, category) - def collect(self, stack_frames): - """Collect a sample from stack frames.""" - current_time = (time.time() * 1000) - self.start_time + def collect(self, stack_frames, timestamps_us=None): + """Collect samples from stack frames. + + Args: + stack_frames: List of interpreter/thread frame info + timestamps_us: List of timestamps in microseconds (None for live sampling) + """ + # Handle live sampling (no timestamps provided) + if timestamps_us is None: + current_time = (time.monotonic() * 1000) - self.start_time + times = [current_time] + else: + if not timestamps_us: + return + # Initialize base timestamp if needed + if self._replay_base_timestamp_us is None: + self._replay_base_timestamp_us = timestamps_us[0] + # Convert all timestamps to times (ms relative to first sample) + base = self._replay_base_timestamp_us + times = [(ts - base) / 1000 for ts in timestamps_us] + + first_time = times[0] # Update interval calculation if self.sample_count > 0 and self.last_sample_time > 0: - self.interval = ( - current_time - self.last_sample_time - ) / self.sample_count - self.last_sample_time = current_time + self.interval = (times[-1] - self.last_sample_time) / self.sample_count + self.last_sample_time = times[-1] - # Process threads and track GC per thread + # Process threads for interpreter_info in stack_frames: for thread_info in interpreter_info.threads: frames = thread_info.frame_info @@ -167,92 +187,86 @@ def collect(self, stack_frames): on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU) gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED) - # Track GIL possession (Has GIL / No GIL) + # Track state transitions using first timestamp self._track_state_transition( tid, has_gil, self.has_gil_start, self.no_gil_start, - "Has GIL", "No GIL", CATEGORY_GIL, current_time + "Has GIL", "No GIL", CATEGORY_GIL, first_time ) - - # Track CPU state (On CPU / Off CPU) self._track_state_transition( tid, on_cpu, self.on_cpu_start, self.off_cpu_start, - "On CPU", "Off CPU", CATEGORY_CPU, current_time + "On CPU", "Off CPU", CATEGORY_CPU, first_time ) - # Track code type (Python Code / Native Code) - # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither + # Track code type if has_gil: self._track_state_transition( tid, True, self.python_code_start, self.native_code_start, - "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time + "Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time ) elif on_cpu: self._track_state_transition( tid, True, self.native_code_start, self.python_code_start, - "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time + "Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time ) else: - # Thread is idle (neither has GIL nor on CPU) - close any open code markers - # This handles the third state that _track_state_transition doesn't cover if tid in self.initialized_threads: if tid in self.python_code_start: self._add_marker(tid, "Python Code", self.python_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) if tid in self.native_code_start: self._add_marker(tid, "Native Code", self.native_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) - # Track "Waiting for GIL" intervals (one-sided tracking) + # Track GIL wait if gil_requested: - self.gil_wait_start.setdefault(tid, current_time) + self.gil_wait_start.setdefault(tid, first_time) elif tid in self.gil_wait_start: self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid), - current_time, CATEGORY_GIL) + first_time, CATEGORY_GIL) - # Track exception state (Has Exception / No Exception) + # Track exception state has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION) self._track_state_transition( tid, has_exception, self.exception_start, self.no_exception_start, - "Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time + "Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time ) - # Track GC events by detecting frames in the stack trace - # This leverages the improved GC frame tracking from commit 336366fd7ca - # which precisely identifies the thread that initiated GC collection + # Track GC events has_gc_frame = any(frame[2] == "" for frame in frames) if has_gc_frame: - # This thread initiated GC collection if tid not in self.gc_start_per_thread: - self.gc_start_per_thread[tid] = current_time + self.gc_start_per_thread[tid] = first_time elif tid in self.gc_start_per_thread: - # End GC marker when no more GC frames are detected self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid), - current_time, CATEGORY_GC) + first_time, CATEGORY_GC) - # Mark thread as initialized after processing all state transitions + # Mark thread as initialized self.initialized_threads.add(tid) - # Categorize: idle if neither has GIL nor on CPU + # Skip idle threads if requested is_idle = not has_gil and not on_cpu - - # Skip idle threads if skip_idle is enabled if self.skip_idle and is_idle: continue if not frames: continue - # Process the stack + # Process stack once to get stack_index stack_index = self._process_stack(thread_data, frames) - # Add sample - cache references to avoid dictionary lookups + # Add samples with timestamps samples = thread_data["samples"] - samples["stack"].append(stack_index) - samples["time"].append(current_time) - samples["eventDelay"].append(None) + samples_stack = samples["stack"] + samples_time = samples["time"] + samples_delay = samples["eventDelay"] + + for t in times: + samples_stack.append(stack_index) + samples_time.append(t) + samples_delay.append(None) - # Track opcode state changes for interval markers (leaf frame only) - if self.opcodes_enabled: + # Handle opcodes + if self.opcodes_enabled and frames: leaf_frame = frames[0] filename, location, funcname, opcode = leaf_frame if isinstance(location, tuple): @@ -264,18 +278,15 @@ def collect(self, stack_frames): current_state = (opcode, lineno, col_offset, funcname, filename) if tid not in self.opcode_state: - # First observation - start tracking - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) elif self.opcode_state[tid][:5] != current_state: - # State changed - emit marker for previous state prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid] self._add_opcode_interval_marker( - tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time + tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time ) - # Start tracking new state - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) - self.sample_count += 1 + self.sample_count += len(times) def _create_thread(self, tid): """Create a new thread structure with processed profile format.""" diff --git a/Lib/profiling/sampling/heatmap_collector.py b/Lib/profiling/sampling/heatmap_collector.py index 5b4c89283be08c..4e7e359bf8903b 100644 --- a/Lib/profiling/sampling/heatmap_collector.py +++ b/Lib/profiling/sampling/heatmap_collector.py @@ -518,7 +518,7 @@ def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate= } self.stats.update(kwargs) - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): """Process stack frames and count samples per line. Args: @@ -526,8 +526,9 @@ def process_frames(self, frames, thread_id): leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset). opcode is None if not gathered. thread_id: Thread ID for this stack trace + weight: Number of samples this stack represents (for batched RLE) """ - self._total_samples += 1 + self._total_samples += weight self._seen_lines.clear() for i, (filename, location, funcname, opcode) in enumerate(frames): @@ -545,15 +546,16 @@ def process_frames(self, frames, thread_id): self._seen_lines.add(line_key) self._record_line_sample(filename, lineno, funcname, is_leaf=is_leaf, - count_cumulative=count_cumulative) + count_cumulative=count_cumulative, weight=weight) if opcode is not None: # Set opcodes_enabled flag when we first encounter opcode data self.opcodes_enabled = True self._record_bytecode_sample(filename, lineno, opcode, - end_lineno, col_offset, end_col_offset) + end_lineno, col_offset, end_col_offset, + weight=weight) - # Build call graph for adjacent frames + # Build call graph for adjacent frames (relationships are deduplicated anyway) if i + 1 < len(frames): next_frame = frames[i + 1] next_lineno = extract_lineno(next_frame[1]) @@ -575,24 +577,25 @@ def _is_valid_frame(self, filename, lineno): return True def _record_line_sample(self, filename, lineno, funcname, is_leaf=False, - count_cumulative=True): + count_cumulative=True, weight=1): """Record a sample for a specific line.""" # Track cumulative samples (all occurrences in stack) if count_cumulative: - self.line_samples[(filename, lineno)] += 1 - self.file_samples[filename][lineno] += 1 + self.line_samples[(filename, lineno)] += weight + self.file_samples[filename][lineno] += weight # Track self/leaf samples (only when at top of stack) if is_leaf: - self.line_self_samples[(filename, lineno)] += 1 - self.file_self_samples[filename][lineno] += 1 + self.line_self_samples[(filename, lineno)] += weight + self.file_self_samples[filename][lineno] += weight # Record function definition location if funcname and (filename, funcname) not in self.function_definitions: self.function_definitions[(filename, funcname)] = lineno def _record_bytecode_sample(self, filename, lineno, opcode, - end_lineno=None, col_offset=None, end_col_offset=None): + end_lineno=None, col_offset=None, end_col_offset=None, + weight=1): """Record a sample for a specific bytecode instruction. Args: @@ -602,6 +605,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode, end_lineno: End line number (may be -1 if not available) col_offset: Column offset in UTF-8 bytes (may be -1 if not available) end_col_offset: End column offset in UTF-8 bytes (may be -1 if not available) + weight: Number of samples this represents (for batched RLE) """ key = (filename, lineno) @@ -609,7 +613,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode, if opcode not in self.line_opcodes[key]: self.line_opcodes[key][opcode] = {'count': 0, 'locations': set()} - self.line_opcodes[key][opcode]['count'] += 1 + self.line_opcodes[key][opcode]['count'] += weight # Store unique location info if column offset is available (not -1) if col_offset is not None and col_offset >= 0: diff --git a/Lib/profiling/sampling/live_collector/collector.py b/Lib/profiling/sampling/live_collector/collector.py index 28af2e9744545a..dcb9fcabe32779 100644 --- a/Lib/profiling/sampling/live_collector/collector.py +++ b/Lib/profiling/sampling/live_collector/collector.py @@ -348,7 +348,7 @@ def collect_failed_sample(self): self.failed_samples += 1 self.total_samples += 1 - def collect(self, stack_frames): + def collect(self, stack_frames, timestamp_us=None): """Collect and display profiling data.""" if self.start_time is None: self.start_time = time.perf_counter() diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py index 7c154e25828a8f..1b2fe6a77278ee 100644 --- a/Lib/profiling/sampling/pstats_collector.py +++ b/Lib/profiling/sampling/pstats_collector.py @@ -18,7 +18,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.skip_idle = skip_idle self._seen_locations = set() - def _process_frames(self, frames): + def _process_frames(self, frames, weight=1): """Process a single thread's frame stack.""" if not frames: return @@ -32,12 +32,12 @@ def _process_frames(self, frames): location = (frame.filename, lineno, frame.funcname) if location not in self._seen_locations: self._seen_locations.add(location) - self.result[location]["cumulative_calls"] += 1 + self.result[location]["cumulative_calls"] += weight # The top frame gets counted as an inline call (directly executing) top_lineno = extract_lineno(frames[0].location) top_location = (frames[0].filename, top_lineno, frames[0].funcname) - self.result[top_location]["direct_calls"] += 1 + self.result[top_location]["direct_calls"] += weight # Track caller-callee relationships for call graph for i in range(1, len(frames)): @@ -49,17 +49,12 @@ def _process_frames(self, frames): callee = (callee_frame.filename, callee_lineno, callee_frame.funcname) caller = (caller_frame.filename, caller_lineno, caller_frame.funcname) - self.callers[callee][caller] += 1 + self.callers[callee][caller] += weight - def collect(self, stack_frames): - if stack_frames and hasattr(stack_frames[0], "awaited_by"): - # Async frame processing - for frames, thread_id, task_id in self._iter_async_frames(stack_frames): - self._process_frames(frames) - else: - # Regular frame processing - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle): - self._process_frames(frames) + def collect(self, stack_frames, timestamps_us=None): + weight = len(timestamps_us) if timestamps_us else 1 + for frames, _ in self._iter_stacks(stack_frames, skip_idle=self.skip_idle): + self._process_frames(frames, weight=weight) def export(self, filename): self.create_stats() diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 294ec3003fc6bc..9c0cdce93c403e 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -12,6 +12,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .binary_collector import BinaryCollector from .constants import ( PROFILING_MODE_WALL, PROFILING_MODE_CPU, @@ -137,6 +138,9 @@ def sample(self, collector, duration_sec=10, *, async_aware=False): if self.collect_stats: self._print_unwinder_stats() + if isinstance(collector, BinaryCollector): + self._print_binary_stats(collector) + # Pass stats to flamegraph collector if it's the right type if hasattr(collector, 'set_stats'): collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode) @@ -278,6 +282,53 @@ def _print_unwinder_stats(self): if stale_invalidations > 0: print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}") + def _print_binary_stats(self, collector): + """Print binary I/O encoding statistics.""" + try: + stats = collector.get_stats() + except (ValueError, RuntimeError): + return # Collector closed or stats unavailable + + print(f" {ANSIColors.CYAN}Binary Encoding:{ANSIColors.RESET}") + + repeat_records = stats.get('repeat_records', 0) + repeat_samples = stats.get('repeat_samples', 0) + full_records = stats.get('full_records', 0) + suffix_records = stats.get('suffix_records', 0) + pop_push_records = stats.get('pop_push_records', 0) + total_records = stats.get('total_records', 0) + + if total_records > 0: + repeat_pct = repeat_records / total_records * 100 + full_pct = full_records / total_records * 100 + suffix_pct = suffix_records / total_records * 100 + pop_push_pct = pop_push_records / total_records * 100 + else: + repeat_pct = full_pct = suffix_pct = pop_push_pct = 0 + + print(f" Records: {total_records:,}") + print(f" RLE repeat: {repeat_records:,} ({ANSIColors.GREEN}{repeat_pct:.1f}%{ANSIColors.RESET}) [{repeat_samples:,} samples]") + print(f" Full stack: {full_records:,} ({full_pct:.1f}%)") + print(f" Suffix match: {suffix_records:,} ({suffix_pct:.1f}%)") + print(f" Pop-push: {pop_push_records:,} ({pop_push_pct:.1f}%)") + + frames_written = stats.get('total_frames_written', 0) + frames_saved = stats.get('frames_saved', 0) + compression_pct = stats.get('frame_compression_pct', 0) + + print(f" {ANSIColors.CYAN}Frame Efficiency:{ANSIColors.RESET}") + print(f" Frames written: {frames_written:,}") + print(f" Frames saved: {frames_saved:,} ({ANSIColors.GREEN}{compression_pct:.1f}%{ANSIColors.RESET})") + + bytes_written = stats.get('bytes_written', 0) + if bytes_written >= 1024 * 1024: + bytes_str = f"{bytes_written / (1024 * 1024):.1f} MB" + elif bytes_written >= 1024: + bytes_str = f"{bytes_written / 1024:.1f} KB" + else: + bytes_str = f"{bytes_written} B" + print(f" Bytes (pre-zstd): {bytes_str}") + def sample( pid, diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py index e437facd8bb94b..55e643d0e9c8cb 100644 --- a/Lib/profiling/sampling/stack_collector.py +++ b/Lib/profiling/sampling/stack_collector.py @@ -18,21 +18,12 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle - def collect(self, stack_frames, skip_idle=False): - if stack_frames and hasattr(stack_frames[0], "awaited_by"): - # Async-aware mode: process async task frames - for frames, thread_id, task_id in self._iter_async_frames(stack_frames): - if not frames: - continue - self.process_frames(frames, thread_id) - else: - # Sync-only mode - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): - if not frames: - continue - self.process_frames(frames, thread_id) + def collect(self, stack_frames, timestamps_us=None, skip_idle=False): + weight = len(timestamps_us) if timestamps_us else 1 + for frames, thread_id in self._iter_stacks(stack_frames, skip_idle=skip_idle): + self.process_frames(frames, thread_id, weight=weight) - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): pass @@ -41,13 +32,13 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.stack_counter = collections.Counter() - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): # Extract only (filename, lineno, funcname) - opcode not needed for collapsed stacks # frame is (filename, location, funcname, opcode) call_tree = tuple( (f[0], extract_lineno(f[1]), f[2]) for f in reversed(frames) ) - self.stack_counter[(call_tree, thread_id)] += 1 + self.stack_counter[(call_tree, thread_id)] += weight def export(self, filename): lines = [] @@ -96,23 +87,26 @@ def __init__(self, *args, **kwargs): # Per-thread statistics self.per_thread_stats = {} # {thread_id: {has_gil, on_cpu, gil_requested, unknown, has_exception, total, gc_samples}} - def collect(self, stack_frames, skip_idle=False): + def collect(self, stack_frames, timestamps_us=None, skip_idle=False): """Override to track thread status statistics before processing frames.""" - # Increment sample count once per sample - self._sample_count += 1 + # Weight is number of timestamps (samples with identical stack) + weight = len(timestamps_us) if timestamps_us else 1 + + # Increment sample count by weight + self._sample_count += weight # Collect both aggregate and per-thread statistics using base method status_counts, has_gc_frame, per_thread_stats = self._collect_thread_status_stats(stack_frames) - # Merge aggregate status counts + # Merge aggregate status counts (multiply by weight) for key in status_counts: - self.thread_status_counts[key] += status_counts[key] + self.thread_status_counts[key] += status_counts[key] * weight # Update aggregate GC frame count if has_gc_frame: - self.samples_with_gc_frames += 1 + self.samples_with_gc_frames += weight - # Merge per-thread statistics + # Merge per-thread statistics (multiply by weight) for thread_id, stats in per_thread_stats.items(): if thread_id not in self.per_thread_stats: self.per_thread_stats[thread_id] = { @@ -125,10 +119,10 @@ def collect(self, stack_frames, skip_idle=False): "gc_samples": 0, } for key, value in stats.items(): - self.per_thread_stats[thread_id][key] += value + self.per_thread_stats[thread_id][key] += value * weight # Call parent collect to process frames - super().collect(stack_frames, skip_idle=skip_idle) + super().collect(stack_frames, timestamps_us, skip_idle=skip_idle) def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None, missed_samples=None, mode=None): @@ -311,7 +305,7 @@ def convert_children(children, min_samples): "opcode_mapping": opcode_mapping } - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): """Process stack frames into flamegraph tree structure. Args: @@ -319,10 +313,11 @@ def process_frames(self, frames, thread_id): leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset). opcode is None if not gathered. thread_id: Thread ID for this stack trace + weight: Number of samples this stack represents (for batched RLE) """ # Reverse to root->leaf order for tree building - self._root["samples"] += 1 - self._total_samples += 1 + self._root["samples"] += weight + self._total_samples += weight self._root["threads"].add(thread_id) self._all_threads.add(thread_id) @@ -336,11 +331,11 @@ def process_frames(self, frames, thread_id): if node is None: node = {"samples": 0, "children": {}, "threads": set(), "opcodes": collections.Counter()} current["children"][func] = node - node["samples"] += 1 + node["samples"] += weight node["threads"].add(thread_id) if opcode is not None: - node["opcodes"][opcode] += 1 + node["opcodes"][opcode] += weight current = node diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py new file mode 100644 index 00000000000000..64bef181da9ba2 --- /dev/null +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -0,0 +1,1016 @@ +"""Tests for binary format round-trip functionality.""" + +import os +import random +import tempfile +import unittest +from collections import defaultdict + +try: + import _remote_debugging + from _remote_debugging import ( + InterpreterInfo, + ThreadInfo, + FrameInfo, + LocationInfo, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_UNKNOWN, + THREAD_STATUS_GIL_REQUESTED, + THREAD_STATUS_HAS_EXCEPTION, + ) + from profiling.sampling.binary_collector import BinaryCollector + from profiling.sampling.binary_reader import BinaryReader + + ZSTD_AVAILABLE = _remote_debugging.zstd_available() +except ImportError: + raise unittest.SkipTest( + "Test only runs when _remote_debugging is available" + ) + + +def make_frame(filename, lineno, funcname): + """Create a FrameInfo struct sequence.""" + location = LocationInfo((lineno, lineno, -1, -1)) + return FrameInfo((filename, location, funcname, None)) + + +def make_thread(thread_id, frames, status=0): + """Create a ThreadInfo struct sequence.""" + return ThreadInfo((thread_id, status, frames)) + + +def make_interpreter(interp_id, threads): + """Create an InterpreterInfo struct sequence.""" + return InterpreterInfo((interp_id, threads)) + + +def extract_lineno(location): + """Extract line number from location (tuple or int or None).""" + if location is None: + return 0 # Treat None as 0 + if isinstance(location, tuple): + return location[0] if location[0] is not None else 0 + return location + + +class RawCollector: + """Collector that captures all raw data grouped by thread.""" + + def __init__(self): + # Key: (interpreter_id, thread_id) -> list of samples for that thread + self.by_thread = defaultdict(list) + self.total_count = 0 + + def collect(self, stack_frames, timestamps_us): + """Capture the raw sample data.""" + # timestamps_us is a list; add one sample per timestamp + count = len(timestamps_us) + for interp in stack_frames: + for thread in interp.threads: + frames = [] + for frame in thread.frame_info: + frames.append( + { + "filename": frame.filename, + "funcname": frame.funcname, + "lineno": extract_lineno(frame.location), + } + ) + key = (interp.interpreter_id, thread.thread_id) + sample = {"status": thread.status, "frames": frames} + for _ in range(count): + self.by_thread[key].append(sample) + self.total_count += count + + def export(self, filename): + pass + + +def samples_to_by_thread(samples): + """Convert input samples to by-thread format for comparison.""" + by_thread = defaultdict(list) + for sample in samples: + for interp in sample: + for thread in interp.threads: + frames = [] + for frame in thread.frame_info: + frames.append( + { + "filename": frame.filename, + "funcname": frame.funcname, + "lineno": extract_lineno(frame.location), + } + ) + key = (interp.interpreter_id, thread.thread_id) + by_thread[key].append( + { + "status": thread.status, + "frames": frames, + } + ) + return by_thread + + +class BinaryFormatTestBase(unittest.TestCase): + """Base class with common setup/teardown for binary format tests.""" + + def setUp(self): + self.temp_files = [] + + def tearDown(self): + for f in self.temp_files: + if os.path.exists(f): + os.unlink(f) + + def create_binary_file(self, samples, interval=1000, compression="none"): + """Create a test binary file and track it for cleanup.""" + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + filename = f.name + self.temp_files.append(filename) + + collector = BinaryCollector( + filename, interval, compression=compression + ) + for sample in samples: + collector.collect(sample) + collector.export(None) + return filename + + def roundtrip(self, samples, interval=1000, compression="none"): + """Write samples to binary and read back.""" + filename = self.create_binary_file(samples, interval, compression) + collector = RawCollector() + with BinaryReader(filename) as reader: + count = reader.replay_samples(collector) + return collector, count + + def assert_samples_equal(self, expected_samples, collector): + """Assert that roundtripped samples match input exactly, per-thread.""" + expected = samples_to_by_thread(expected_samples) + + # Same threads present + self.assertEqual( + set(expected.keys()), + set(collector.by_thread.keys()), + "Thread set mismatch", + ) + + # For each thread, samples match in order + for key in expected: + exp_samples = expected[key] + act_samples = collector.by_thread[key] + interp_id, thread_id = key + + self.assertEqual( + len(exp_samples), + len(act_samples), + f"Thread ({interp_id}, {thread_id}): sample count mismatch " + f"(expected {len(exp_samples)}, got {len(act_samples)})", + ) + + for i, (exp, act) in enumerate(zip(exp_samples, act_samples)): + self.assertEqual( + exp["status"], + act["status"], + f"Thread ({interp_id}, {thread_id}), sample {i}: " + f"status mismatch (expected {exp['status']}, got {act['status']})", + ) + + self.assertEqual( + len(exp["frames"]), + len(act["frames"]), + f"Thread ({interp_id}, {thread_id}), sample {i}: " + f"frame count mismatch", + ) + + for j, (exp_frame, act_frame) in enumerate( + zip(exp["frames"], act["frames"]) + ): + self.assertEqual( + exp_frame["filename"], + act_frame["filename"], + f"Thread ({interp_id}, {thread_id}), sample {i}, " + f"frame {j}: filename mismatch", + ) + self.assertEqual( + exp_frame["funcname"], + act_frame["funcname"], + f"Thread ({interp_id}, {thread_id}), sample {i}, " + f"frame {j}: funcname mismatch", + ) + self.assertEqual( + exp_frame["lineno"], + act_frame["lineno"], + f"Thread ({interp_id}, {thread_id}), sample {i}, " + f"frame {j}: lineno mismatch " + f"(expected {exp_frame['lineno']}, got {act_frame['lineno']})", + ) + + +class TestBinaryRoundTrip(BinaryFormatTestBase): + """Tests for exact binary format round-trip.""" + + def test_single_sample_single_frame(self): + """Single sample with one frame roundtrips exactly.""" + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 12345, [make_frame("test.py", 42, "myfunc")] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_single_sample_multi_frame(self): + """Single sample with call stack roundtrips exactly.""" + frames = [ + make_frame("inner.py", 10, "inner"), + make_frame("middle.py", 20, "middle"), + make_frame("outer.py", 30, "outer"), + ] + samples = [[make_interpreter(0, [make_thread(100, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_multiple_samples_same_stack(self): + """Multiple identical samples roundtrip exactly (tests RLE).""" + frame = make_frame("hot.py", 99, "hot_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(100) + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 100) + self.assert_samples_equal(samples, collector) + + def test_multiple_samples_varying_stacks(self): + """Multiple samples with varying stacks roundtrip exactly.""" + samples = [] + for i in range(20): + depth = i % 5 + 1 + frames = [ + make_frame(f"f{j}.py", j * 10 + i, f"func{j}") + for j in range(depth) + ] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 20) + self.assert_samples_equal(samples, collector) + + def test_thread_ids_preserved(self): + """Thread IDs are preserved exactly.""" + thread_ids = [1, 12345, 0x7FFF12345678, 999999] + samples = [] + for tid in thread_ids: + samples.append( + [ + make_interpreter( + 0, [make_thread(tid, [make_frame("t.py", 10, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(thread_ids)) + self.assert_samples_equal(samples, collector) + + def test_interpreter_ids_preserved(self): + """Interpreter IDs are preserved exactly.""" + interp_ids = [0, 1, 5, 100] + samples = [] + for iid in interp_ids: + samples.append( + [ + make_interpreter( + iid, [make_thread(1, [make_frame("i.py", 10, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(interp_ids)) + self.assert_samples_equal(samples, collector) + + def test_status_flags_preserved(self): + """All thread status flags are preserved exactly.""" + statuses = [ + 0, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_UNKNOWN, + THREAD_STATUS_GIL_REQUESTED, + THREAD_STATUS_HAS_EXCEPTION, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_HAS_EXCEPTION, + THREAD_STATUS_HAS_GIL + | THREAD_STATUS_ON_CPU + | THREAD_STATUS_GIL_REQUESTED, + ] + samples = [] + for i, status in enumerate(statuses): + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("s.py", 10 + i, "f")], status + ) + ], + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(statuses)) + self.assert_samples_equal(samples, collector) + + def test_multiple_threads_per_sample(self): + """Multiple threads in one sample roundtrip exactly.""" + threads = [ + make_thread( + 1, [make_frame("t1.py", 10, "t1")], THREAD_STATUS_HAS_GIL + ), + make_thread( + 2, [make_frame("t2.py", 20, "t2")], THREAD_STATUS_ON_CPU + ), + make_thread(3, [make_frame("t3.py", 30, "t3")], 0), + ] + samples = [[make_interpreter(0, threads)] for _ in range(10)] + collector, count = self.roundtrip(samples) + # 10 samples × 3 threads = 30 thread-samples + self.assertEqual(count, 30) + self.assert_samples_equal(samples, collector) + + def test_multiple_interpreters_per_sample(self): + """Multiple interpreters in one sample roundtrip exactly.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("i0.py", 10, "i0")])] + ), + make_interpreter( + 1, [make_thread(2, [make_frame("i1.py", 20, "i1")])] + ), + ] + for _ in range(5) + ] + collector, count = self.roundtrip(samples) + # 5 samples × 2 interpreters × 1 thread = 10 thread-samples + self.assertEqual(count, 10) + self.assert_samples_equal(samples, collector) + + def test_same_thread_id_different_interpreters(self): + """Same thread_id in different interpreters must be tracked separately.""" + # This test catches bugs where thread state is keyed only by thread_id + # without considering interpreter_id + samples = [] + # Interleave samples from interpreter 0 and 1, both using thread_id=1 + for i in range(20): + interp_id = i % 2 # Alternate between interpreter 0 and 1 + frame = make_frame( + f"interp{interp_id}.py", 10 + i, f"func{interp_id}" + ) + samples.append( + [make_interpreter(interp_id, [make_thread(1, [frame])])] + ) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 20) + self.assert_samples_equal(samples, collector) + + # Verify both interpreters are present + keys = set(collector.by_thread.keys()) + self.assertIn((0, 1), keys) # interpreter 0, thread 1 + self.assertIn((1, 1), keys) # interpreter 1, thread 1 + + # Verify each interpreter got 10 samples + self.assertEqual(len(collector.by_thread[(0, 1)]), 10) + self.assertEqual(len(collector.by_thread[(1, 1)]), 10) + + # Verify the samples are in the right order for each interpreter + for i, sample in enumerate(collector.by_thread[(0, 1)]): + expected_lineno = 10 + i * 2 # 10, 12, 14, ... + self.assertEqual(sample["frames"][0]["lineno"], expected_lineno) + self.assertEqual(sample["frames"][0]["filename"], "interp0.py") + + for i, sample in enumerate(collector.by_thread[(1, 1)]): + expected_lineno = 11 + i * 2 # 11, 13, 15, ... + self.assertEqual(sample["frames"][0]["lineno"], expected_lineno) + self.assertEqual(sample["frames"][0]["filename"], "interp1.py") + + def test_deep_call_stack(self): + """Deep call stack roundtrips exactly.""" + depth = 100 + frames = [ + make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(depth) + ] + samples = [[make_interpreter(0, [make_thread(1, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_line_numbers_preserved(self): + """Various line numbers are preserved exactly.""" + linenos = [1, 100, 1000, 65535, 100000] + samples = [] + for lineno in linenos: + samples.append( + [ + make_interpreter( + 0, [make_thread(1, [make_frame("l.py", lineno, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(linenos)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_zstd_compression_roundtrip(self): + """Zstd compressed data roundtrips exactly.""" + samples = [] + for i in range(200): + frames = [ + make_frame(f"z{j}.py", j * 10 + i + 1, f"zfunc{j}") + for j in range(3) + ] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples, compression="zstd") + self.assertEqual(count, 200) + self.assert_samples_equal(samples, collector) + + def test_sample_interval_preserved(self): + """Sample interval is preserved in file metadata.""" + intervals = [100, 500, 1000, 5000, 10000] + for interval in intervals: + with self.subTest(interval=interval): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("i.py", 1, "f")])] + ) + ] + ] + filename = self.create_binary_file(samples, interval=interval) + with BinaryReader(filename) as reader: + info = reader.get_info() + self.assertEqual(info["sample_interval_us"], interval) + + def test_threads_interleaved_samples(self): + """Multiple threads with interleaved varying samples.""" + samples = [] + for i in range(30): + threads = [ + make_thread( + 1, + [make_frame("t1.py", 10 + i, "t1")], + THREAD_STATUS_HAS_GIL if i % 2 == 0 else 0, + ), + make_thread( + 2, + [make_frame("t2.py", 20 + i, "t2")], + THREAD_STATUS_ON_CPU if i % 3 == 0 else 0, + ), + ] + samples.append([make_interpreter(0, threads)]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 60) + self.assert_samples_equal(samples, collector) + + +class TestBinaryEdgeCases(BinaryFormatTestBase): + """Tests for edge cases in binary format.""" + + def test_unicode_filenames(self): + """Unicode filenames roundtrip exactly.""" + filenames = [ + "/путь/файл.py", + "/路径/文件.py", + "/パス/ファイル.py", + "/chemin/café.py", + ] + for fname in filenames: + with self.subTest(filename=fname): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(fname, 1, "func")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_unicode_funcnames(self): + """Unicode function names roundtrip exactly.""" + funcnames = [ + "функция", + "函数", + "関数", + "función", + ] + for funcname in funcnames: + with self.subTest(funcname=funcname): + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", 1, funcname)] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_special_char_filenames(self): + """Filenames with special characters roundtrip exactly.""" + filenames = [ + "/path/with spaces/file.py", + "/path/with\ttab/file.py", + "/path/with'quote/file.py", + '/path/with"double/file.py', + "/path/with\\backslash/file.py", + ] + for fname in filenames: + with self.subTest(filename=fname): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(fname, 1, "func")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_special_funcnames(self): + """Function names with special characters roundtrip exactly.""" + funcnames = [ + "", + "", + "", + "", + "__init__", + "func.inner", + ] + for funcname in funcnames: + with self.subTest(funcname=funcname): + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", 1, funcname)] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_long_filename(self): + """Long filename roundtrips exactly.""" + long_file = "/very/long/path/" + "sub/" * 50 + "file.py" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(long_file, 1, "func")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_long_funcname(self): + """Long function name roundtrips exactly.""" + long_func = "very_long_function_name_" + "x" * 200 + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("test.py", 1, long_func)])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_empty_funcname(self): + """Empty function name roundtrips exactly.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("test.py", 1, "")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_large_sample_count(self): + """Large number of samples roundtrips exactly.""" + num = 5000 + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", (i % 100) + 1, "func")] + ) + ], + ) + ] + for i in range(num) + ] + collector, count = self.roundtrip(samples, compression="zstd") + self.assertEqual(count, num) + self.assert_samples_equal(samples, collector) + + def test_context_manager_cleanup(self): + """Reader cleans up on context exit.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("t.py", 1, "f")])] + ) + ] + ] + filename = self.create_binary_file(samples) + reader = BinaryReader(filename) + with reader: + collector = RawCollector() + count = reader.replay_samples(collector) + self.assertEqual(count, 1) + with self.assertRaises(RuntimeError): + reader.replay_samples(collector) + + def test_invalid_file_path(self): + """Invalid file path raises appropriate error.""" + with self.assertRaises((FileNotFoundError, OSError, ValueError)): + with BinaryReader("/nonexistent/path/file.bin") as reader: + reader.replay_samples(RawCollector()) + + +class TestBinaryEncodings(BinaryFormatTestBase): + """Tests specifically targeting different stack encodings.""" + + def test_stack_full_encoding(self): + """First sample uses STACK_FULL encoding and roundtrips.""" + frames = [make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(5)] + samples = [[make_interpreter(0, [make_thread(1, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_stack_repeat_encoding(self): + """Identical consecutive samples use RLE and roundtrip.""" + frame = make_frame("repeat.py", 42, "repeat_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(1000) + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1000) + self.assert_samples_equal(samples, collector) + + def test_stack_suffix_encoding(self): + """Samples sharing suffix use STACK_SUFFIX and roundtrip.""" + samples = [] + for i in range(10): + frames = [make_frame(f"new{i}.py", i + 1, f"new{i}")] + frames.extend( + [ + make_frame(f"shared{j}.py", j + 1, f"shared{j}") + for j in range(5) + ] + ) + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 10) + self.assert_samples_equal(samples, collector) + + def test_stack_pop_push_encoding(self): + """Samples with pop+push pattern roundtrip.""" + samples = [] + base_frames = [make_frame("base.py", 10, "base")] + + # Call deeper + samples.append([make_interpreter(0, [make_thread(1, base_frames)])]) + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [make_frame("call1.py", 20, "call1")] + + base_frames, + ) + ], + ) + ] + ) + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [ + make_frame("call2.py", 30, "call2"), + make_frame("call1.py", 20, "call1"), + ] + + base_frames, + ) + ], + ) + ] + ) + # Return + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [make_frame("call1.py", 25, "call1")] + + base_frames, + ) + ], + ) + ] + ) + samples.append([make_interpreter(0, [make_thread(1, base_frames)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 5) + self.assert_samples_equal(samples, collector) + + def test_mixed_encodings(self): + """Mix of different encoding patterns roundtrips.""" + samples = [] + # Some repeated samples (RLE) + frame1 = make_frame("hot.py", 1, "hot") + for _ in range(20): + samples.append([make_interpreter(0, [make_thread(1, [frame1])])]) + # Some varying samples + for i in range(20): + frames = [make_frame(f"vary{i}.py", i + 1, f"vary{i}")] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + # More repeated + for _ in range(20): + samples.append([make_interpreter(0, [make_thread(1, [frame1])])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 60) + self.assert_samples_equal(samples, collector) + + def test_alternating_threads_status_changes(self): + """Alternating thread status changes roundtrip correctly.""" + samples = [] + for i in range(50): + status1 = THREAD_STATUS_HAS_GIL if i % 2 == 0 else 0 + status2 = ( + THREAD_STATUS_ON_CPU if i % 3 == 0 else THREAD_STATUS_HAS_GIL + ) + threads = [ + make_thread(1, [make_frame("t1.py", 10, "t1")], status1), + make_thread(2, [make_frame("t2.py", 20, "t2")], status2), + ] + samples.append([make_interpreter(0, threads)]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 100) + self.assert_samples_equal(samples, collector) + + +class TestBinaryStress(BinaryFormatTestBase): + """Randomized stress tests for binary format.""" + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_random_samples_stress(self): + """Stress test with random samples - exercises hash table resizing.""" + random.seed(42) # Reproducible + + # Large pools to force hash table resizing (exceeds initial 8192/4096 sizes) + filenames = [f"file{i}.py" for i in range(200)] + funcnames = [f"func{i}" for i in range(300)] + thread_ids = list(range(1, 50)) + interp_ids = list(range(10)) + statuses = [ + 0, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_EXCEPTION, + ] + + samples = [] + for _ in range(1000): + num_interps = random.randint(1, 3) + interps = [] + for _ in range(num_interps): + iid = random.choice(interp_ids) + num_threads = random.randint(1, 5) + threads = [] + for _ in range(num_threads): + tid = random.choice(thread_ids) + status = random.choice(statuses) + depth = random.randint(1, 15) + frames = [] + for _ in range(depth): + fname = random.choice(filenames) + func = random.choice(funcnames) + # Wide line number range to create many unique frames + lineno = random.randint(1, 5000) + frames.append(make_frame(fname, lineno, func)) + threads.append(make_thread(tid, frames, status)) + interps.append(make_interpreter(iid, threads)) + samples.append(interps) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + def test_rle_stress(self): + """Stress test RLE encoding with identical samples.""" + random.seed(123) + + # Create a few distinct stacks + stacks = [] + for i in range(5): + depth = random.randint(1, 8) + frames = [ + make_frame(f"rle{j}.py", j * 10, f"rle{j}") + for j in range(depth) + ] + stacks.append(frames) + + # Generate samples with repeated stacks (should trigger RLE) + samples = [] + for _ in range(100): + stack = random.choice(stacks) + repeat = random.randint(1, 50) + for _ in range(repeat): + samples.append([make_interpreter(0, [make_thread(1, stack)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(samples)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_multi_thread_stress(self): + """Stress test with many threads and interleaved samples.""" + random.seed(456) + + thread_ids = list(range(1, 20)) + samples = [] + + for i in range(300): + # Randomly select 1-5 threads for this sample + num_threads = random.randint(1, 5) + selected = random.sample(thread_ids, num_threads) + threads = [] + for tid in selected: + status = random.choice( + [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU] + ) + depth = random.randint(1, 5) + frames = [ + make_frame(f"mt{tid}_{j}.py", i + j, f"f{j}") + for j in range(depth) + ] + threads.append(make_thread(tid, frames, status)) + samples.append([make_interpreter(0, threads)]) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + def test_encoding_transitions_stress(self): + """Stress test stack encoding transitions.""" + random.seed(789) + + base_frames = [ + make_frame(f"base{i}.py", i, f"base{i}") for i in range(5) + ] + samples = [] + + for i in range(200): + choice = random.randint(0, 4) + if choice == 0: + # Full new stack + depth = random.randint(1, 8) + frames = [ + make_frame(f"new{i}_{j}.py", j, f"new{j}") + for j in range(depth) + ] + elif choice == 1: + # Repeat previous (will use RLE if identical) + frames = base_frames[: random.randint(1, 5)] + elif choice == 2: + # Add frames on top (suffix encoding) + extra = random.randint(1, 3) + frames = [ + make_frame(f"top{i}_{j}.py", j, f"top{j}") + for j in range(extra) + ] + frames.extend(base_frames[: random.randint(2, 4)]) + else: + # Pop and push (pop-push encoding) + keep = random.randint(1, 3) + push = random.randint(0, 2) + frames = [ + make_frame(f"push{i}_{j}.py", j, f"push{j}") + for j in range(push) + ] + frames.extend(base_frames[:keep]) + + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(samples)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_same_thread_id_multiple_interpreters_stress(self): + """Stress test: same thread_id across multiple interpreters with interleaved samples. + + This test catches bugs where thread state is keyed only by thread_id + without considering interpreter_id (both in writer and reader). + """ + random.seed(999) + + # Multiple interpreters, each with overlapping thread_ids + interp_ids = [0, 1, 2, 3] + # Same thread_ids used across all interpreters + shared_thread_ids = [1, 2, 3] + + filenames = [f"file{i}.py" for i in range(10)] + funcnames = [f"func{i}" for i in range(15)] + statuses = [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU] + + samples = [] + for i in range(1000): + # Randomly pick an interpreter + iid = random.choice(interp_ids) + # Randomly pick 1-3 threads (from shared pool) + num_threads = random.randint(1, 3) + selected_tids = random.sample(shared_thread_ids, num_threads) + + threads = [] + for tid in selected_tids: + status = random.choice(statuses) + depth = random.randint(1, 6) + frames = [] + for d in range(depth): + # Include interpreter and thread info in frame data for verification + fname = f"i{iid}_t{tid}_{random.choice(filenames)}" + func = random.choice(funcnames) + lineno = i * 10 + d + 1 # Unique per sample + frames.append(make_frame(fname, lineno, func)) + threads.append(make_thread(tid, frames, status)) + + samples.append([make_interpreter(iid, threads)]) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + # Verify that we have samples from multiple (interpreter, thread) combinations + # with the same thread_id + keys = set(collector.by_thread.keys()) + # Should have samples for same thread_id in different interpreters + for tid in shared_thread_ids: + interps_with_tid = [iid for (iid, t) in keys if t == tid] + self.assertGreater( + len(interps_with_tid), + 1, + f"Thread {tid} should appear in multiple interpreters", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst new file mode 100644 index 00000000000000..f9c2cee51d1dcd --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,4 @@ +Add binary output format to :mod:`profiling.sampling` for compact storage of +profiling data. The new ``--binary`` option captures samples to a file that +can be converted to other formats using the ``replay`` command. Patch by +Pablo Galindo diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index acb08400e24e2e..2a4b937ce6bf80 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/subprocess.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c _remote_debugging/subprocess.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 2f3efedd1e0ed5..9557a200b237e9 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -16,7 +16,9 @@ extern "C" { #endif #ifndef Py_BUILD_CORE_BUILTIN +# ifndef Py_BUILD_CORE_MODULE # define Py_BUILD_CORE_MODULE 1 +# endif #endif #include "Python.h" @@ -205,6 +207,8 @@ typedef struct { PyTypeObject *ThreadInfo_Type; PyTypeObject *InterpreterInfo_Type; PyTypeObject *AwaitedInfo_Type; + PyTypeObject *BinaryWriter_Type; + PyTypeObject *BinaryReader_Type; } RemoteDebuggingState; enum _ThreadState { diff --git a/Modules/_remote_debugging/binary_io.h b/Modules/_remote_debugging/binary_io.h new file mode 100644 index 00000000000000..e730fa8d9ace5c --- /dev/null +++ b/Modules/_remote_debugging/binary_io.h @@ -0,0 +1,585 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary I/O Header + * + * This header provides declarations for high-performance binary file I/O + * for profiling data with optional zstd streaming compression. + ******************************************************************************/ + +#ifndef Py_BINARY_IO_H +#define Py_BINARY_IO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "Python.h" +#include "pycore_hashtable.h" +#include +#include + +/* ============================================================================ + * BINARY FORMAT CONSTANTS + * ============================================================================ */ + +#define BINARY_FORMAT_MAGIC 0x54414348 /* "TACH" (Tachyon) */ +#define BINARY_FORMAT_VERSION 2 + +/* Buffer sizes: 512KB balances syscall amortization against memory use, + * and aligns well with filesystem block sizes and zstd dictionary windows */ +#define WRITE_BUFFER_SIZE (512 * 1024) +#define COMPRESSED_BUFFER_SIZE (512 * 1024) + +/* Compression types */ +#define COMPRESSION_NONE 0 +#define COMPRESSION_ZSTD 1 + +/* Stack encoding types for delta compression */ +#define STACK_REPEAT 0x00 /* RLE: identical to previous, with count */ +#define STACK_FULL 0x01 /* Full stack (first sample or no match) */ +#define STACK_SUFFIX 0x02 /* Shares N frames from bottom */ +#define STACK_POP_PUSH 0x03 /* Remove M frames, add N frames */ + +/* Maximum stack depth we'll buffer for delta encoding */ +#define MAX_STACK_DEPTH 256 + +/* Initial capacity for RLE pending buffer */ +#define INITIAL_RLE_CAPACITY 64 + +/* Initial capacities for dynamic arrays - sized to reduce reallocations */ +#define INITIAL_STRING_CAPACITY 4096 +#define INITIAL_FRAME_CAPACITY 4096 +#define INITIAL_THREAD_CAPACITY 256 + +/* ============================================================================ + * STATISTICS STRUCTURES + * ============================================================================ */ + +/* Writer statistics - tracks encoding efficiency */ +typedef struct { + uint64_t repeat_records; /* Number of RLE repeat records written */ + uint64_t repeat_samples; /* Total samples encoded via RLE */ + uint64_t full_records; /* Number of full stack records */ + uint64_t suffix_records; /* Number of suffix match records */ + uint64_t pop_push_records; /* Number of pop-push records */ + uint64_t total_frames_written;/* Total frame indices written */ + uint64_t frames_saved; /* Frames avoided due to delta encoding */ + uint64_t bytes_written; /* Total bytes written (before compression) */ +} BinaryWriterStats; + +/* Reader statistics - tracks reconstruction performance */ +typedef struct { + uint64_t repeat_records; /* RLE records decoded */ + uint64_t repeat_samples; /* Samples decoded from RLE */ + uint64_t full_records; /* Full stack records decoded */ + uint64_t suffix_records; /* Suffix match records decoded */ + uint64_t pop_push_records; /* Pop-push records decoded */ + uint64_t total_samples; /* Total samples reconstructed */ + uint64_t stack_reconstructions; /* Number of stack array reconstructions */ +} BinaryReaderStats; + +/* ============================================================================ + * PLATFORM ABSTRACTION + * ============================================================================ */ + +#if defined(__linux__) || defined(__APPLE__) + #include + #include + #include + #include + #define USE_MMAP 1 +#else + #define USE_MMAP 0 +#endif + +/* 64-bit file position support for files larger than 2GB. + * On POSIX: use ftello/fseeko with off_t (already 64-bit on 64-bit systems) + * On Windows: use _ftelli64/_fseeki64 with __int64 */ +#if defined(_WIN32) || defined(_WIN64) + #include + typedef __int64 file_offset_t; + #define FTELL64(fp) _ftelli64(fp) + #define FSEEK64(fp, offset, whence) _fseeki64(fp, offset, whence) +#else + /* POSIX - off_t is 64-bit on 64-bit systems, ftello/fseeko handle large files */ + typedef off_t file_offset_t; + #define FTELL64(fp) ftello(fp) + #define FSEEK64(fp, offset, whence) fseeko(fp, offset, whence) +#endif + +/* Forward declare zstd types if available */ +#ifdef HAVE_ZSTD +#include +#endif + +/* Branch prediction hints - same as Objects/obmalloc.c */ +#if (defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))) && defined(__OPTIMIZE__) +# define UNLIKELY(value) __builtin_expect((value), 0) +# define LIKELY(value) __builtin_expect((value), 1) +#else +# define UNLIKELY(value) (value) +# define LIKELY(value) (value) +#endif + +/* ============================================================================ + * BINARY WRITER STRUCTURES + * ============================================================================ */ + +/* zstd compression state (only used if HAVE_ZSTD defined) */ +typedef struct { +#ifdef HAVE_ZSTD + ZSTD_CCtx *cctx; /* Modern API: CCtx and CStream are the same since v1.3.0 */ +#else + void *cctx; /* Placeholder */ +#endif + uint8_t *compressed_buffer; + size_t compressed_buffer_size; +} ZstdCompressor; + +/* Frame entry - combines all frame data for better cache locality */ +typedef struct { + uint32_t filename_idx; + uint32_t funcname_idx; + int32_t lineno; +} FrameEntry; + +/* Frame key for hash table lookup */ +typedef struct { + uint32_t filename_idx; + uint32_t funcname_idx; + int32_t lineno; +} FrameKey; + +/* Pending RLE sample - buffered for run-length encoding */ +typedef struct { + uint64_t timestamp_delta; + uint8_t status; +} PendingRLESample; + +/* Thread entry - tracks per-thread state for delta encoding */ +typedef struct { + uint64_t thread_id; + uint64_t prev_timestamp; + uint32_t interpreter_id; + + /* Previous stack for delta encoding (frame indices, innermost first) */ + uint32_t *prev_stack; + size_t prev_stack_depth; + size_t prev_stack_capacity; + + /* RLE pending buffer - samples waiting to be written as a repeat group */ + PendingRLESample *pending_rle; + size_t pending_rle_count; + size_t pending_rle_capacity; + int has_pending_rle; /* Flag: do we have buffered repeats? */ +} ThreadEntry; + +/* Main binary writer structure */ +typedef struct { + FILE *fp; + char *filename; + + /* Write buffer for batched I/O */ + uint8_t *write_buffer; + size_t buffer_pos; + size_t buffer_size; + + /* Compression */ + int compression_type; + ZstdCompressor zstd; + + /* Metadata */ + uint64_t start_time_us; + uint64_t sample_interval_us; + uint32_t total_samples; + + /* String hash table: PyObject* -> uint32_t index */ + _Py_hashtable_t *string_hash; + /* String storage: array of UTF-8 encoded strings */ + char **strings; + size_t *string_lengths; + size_t string_count; + size_t string_capacity; + + /* Frame hash table: FrameKey* -> uint32_t index */ + _Py_hashtable_t *frame_hash; + /* Frame storage: combined struct for better cache locality */ + FrameEntry *frame_entries; + size_t frame_count; + size_t frame_capacity; + + /* Thread timestamp tracking for delta encoding - combined for cache locality */ + ThreadEntry *thread_entries; + size_t thread_count; + size_t thread_capacity; + + /* Statistics */ + BinaryWriterStats stats; +} BinaryWriter; + +/* ============================================================================ + * BINARY READER STRUCTURES + * ============================================================================ */ + +/* Per-thread state for stack reconstruction during replay */ +typedef struct { + uint64_t thread_id; + uint32_t interpreter_id; + uint64_t prev_timestamp; + + /* Reconstructed stack buffer (frame indices, innermost first) */ + uint32_t *current_stack; + size_t current_stack_depth; + size_t current_stack_capacity; +} ReaderThreadState; + +/* Main binary reader structure */ +typedef struct { + char *filename; + +#if USE_MMAP + int fd; + uint8_t *mapped_data; + size_t mapped_size; +#else + FILE *fp; + uint8_t *file_data; + size_t file_size; +#endif + + /* Decompression state */ + int compression_type; + /* Note: ZSTD_DCtx is not stored - created/freed during decompression */ + uint8_t *decompressed_data; + size_t decompressed_size; + + /* Header metadata */ + uint64_t start_time_us; + uint64_t sample_interval_us; + uint32_t sample_count; + uint32_t thread_count; + uint64_t string_table_offset; + uint64_t frame_table_offset; + + /* Parsed string table: array of Python string objects */ + PyObject **strings; + uint32_t strings_count; + + /* Parsed frame table: packed as [filename_idx, funcname_idx, lineno] */ + uint32_t *frame_data; + uint32_t frames_count; + + /* Sample data region */ + uint8_t *sample_data; + size_t sample_data_size; + + /* Per-thread state for stack reconstruction (used during replay) */ + ReaderThreadState *thread_states; + size_t thread_state_count; + size_t thread_state_capacity; + + /* Statistics */ + BinaryReaderStats stats; +} BinaryReader; + +/* ============================================================================ + * VARINT ENCODING/DECODING (INLINE FOR PERFORMANCE) + * ============================================================================ */ + +/* Encode unsigned 64-bit varint (LEB128). Returns bytes written. */ +static inline size_t +encode_varint_u64(uint8_t *buf, uint64_t value) +{ + /* Fast path for single-byte values (0-127) - very common case */ + if (value < 0x80) { + buf[0] = (uint8_t)value; + return 1; + } + + size_t i = 0; + while (value >= 0x80) { + buf[i++] = (uint8_t)((value & 0x7F) | 0x80); + value >>= 7; + } + buf[i++] = (uint8_t)(value & 0x7F); + return i; +} + +/* Encode unsigned 32-bit varint. Returns bytes written. */ +static inline size_t +encode_varint_u32(uint8_t *buf, uint32_t value) +{ + return encode_varint_u64(buf, value); +} + +/* Encode signed 32-bit varint (zigzag encoding). Returns bytes written. */ +static inline size_t +encode_varint_i32(uint8_t *buf, int32_t value) +{ + /* Zigzag encode: map signed to unsigned */ + uint32_t zigzag = ((uint32_t)value << 1) ^ (uint32_t)(value >> 31); + return encode_varint_u32(buf, zigzag); +} + +/* Decode unsigned 64-bit varint (LEB128). Updates offset only on success. + * On error (overflow or incomplete), offset is NOT updated, allowing callers + * to detect errors via (offset == prev_offset) check. Sets PyErr on error. */ +static inline uint64_t +decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size) +{ + size_t pos = *offset; + uint64_t result = 0; + int shift = 0; + + /* Fast path for single-byte varints (0-127) - most common case */ + if (LIKELY(pos < max_size && (data[pos] & 0x80) == 0)) { + *offset = pos + 1; + return data[pos]; + } + + while (pos < max_size) { + uint8_t byte = data[pos++]; + result |= (uint64_t)(byte & 0x7F) << shift; + if ((byte & 0x80) == 0) { + *offset = pos; + return result; + } + shift += 7; + if (UNLIKELY(shift >= 64)) { + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + return 0; + } + } + + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + return 0; +} + +/* Decode unsigned 32-bit varint. If value exceeds UINT32_MAX, treats as error. */ +static inline uint32_t +decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size) +{ + size_t saved_offset = *offset; + uint64_t value = decode_varint_u64(data, offset, max_size); + if (PyErr_Occurred()) { + return 0; + } + if (UNLIKELY(value > UINT32_MAX)) { + *offset = saved_offset; + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + return 0; + } + return (uint32_t)value; +} + +/* Decode signed 32-bit varint (zigzag encoding). */ +static inline int32_t +decode_varint_i32(const uint8_t *data, size_t *offset, size_t max_size) +{ + uint32_t zigzag = decode_varint_u32(data, offset, max_size); + if (PyErr_Occurred()) { + return 0; + } + return (int32_t)((zigzag >> 1) ^ -(int32_t)(zigzag & 1)); +} + +/* ============================================================================ + * SHARED UTILITY FUNCTIONS + * ============================================================================ */ + +/* Generic array growth - returns new pointer or NULL (sets PyErr_NoMemory) + * Includes overflow checking for capacity doubling and allocation size. */ +static inline void * +grow_array(void *ptr, size_t *capacity, size_t elem_size) +{ + size_t old_cap = *capacity; + + /* Check for overflow when doubling capacity */ + if (old_cap > SIZE_MAX / 2) { + PyErr_SetString(PyExc_OverflowError, "Array capacity overflow"); + return NULL; + } + size_t new_cap = old_cap * 2; + + /* Check for overflow when calculating allocation size */ + if (new_cap > SIZE_MAX / elem_size) { + PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow"); + return NULL; + } + + void *new_ptr = PyMem_Realloc(ptr, new_cap * elem_size); + if (new_ptr) { + *capacity = new_cap; + } else { + PyErr_NoMemory(); + } + return new_ptr; +} + +/* Macro wrapper for type safety with grow_array */ +#define GROW_ARRAY(ptr, count, capacity, type) \ + ((count) < (capacity) ? 0 : \ + ((ptr) = grow_array((ptr), &(capacity), sizeof(type))) ? 0 : -1) + +/* ============================================================================ + * BINARY WRITER API + * ============================================================================ */ + +/* + * Create a new binary writer. + * + * Arguments: + * filename: Path to output file + * sample_interval_us: Sampling interval in microseconds + * compression_type: COMPRESSION_NONE or COMPRESSION_ZSTD + * start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6) + * + * Returns: + * New BinaryWriter* on success, NULL on failure (PyErr set) + */ +BinaryWriter *binary_writer_create( + const char *filename, + uint64_t sample_interval_us, + int compression_type, + uint64_t start_time_us +); + +/* + * Write a sample to the binary file. + * + * Arguments: + * writer: Writer from binary_writer_create + * stack_frames: List of InterpreterInfo struct sequences + * timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6) + * + * Returns: + * 0 on success, -1 on failure (PyErr set) + */ +int binary_writer_write_sample( + BinaryWriter *writer, + PyObject *stack_frames, + uint64_t timestamp_us +); + +/* + * Finalize and close the binary file. + * Writes string/frame tables, footer, and updates header. + * + * Arguments: + * writer: Writer to finalize + * + * Returns: + * 0 on success, -1 on failure (PyErr set) + */ +int binary_writer_finalize(BinaryWriter *writer); + +/* + * Destroy a binary writer and free all resources. + * Safe to call even if writer is partially initialized. + * + * Arguments: + * writer: Writer to destroy (may be NULL) + */ +void binary_writer_destroy(BinaryWriter *writer); + +/* ============================================================================ + * BINARY READER API + * ============================================================================ */ + +/* + * Open a binary file for reading. + * + * Arguments: + * filename: Path to input file + * + * Returns: + * New BinaryReader* on success, NULL on failure (PyErr set) + */ +BinaryReader *binary_reader_open(const char *filename); + +/* + * Replay samples from binary file through a collector. + * + * Arguments: + * reader: Reader from binary_reader_open + * collector: Python collector with collect() method + * progress_callback: Optional callable(current, total) or NULL + * + * Returns: + * Number of samples replayed on success, -1 on failure (PyErr set) + */ +Py_ssize_t binary_reader_replay( + BinaryReader *reader, + PyObject *collector, + PyObject *progress_callback +); + +/* + * Get metadata about the binary file. + * + * Arguments: + * reader: Reader from binary_reader_open + * + * Returns: + * Dict with file metadata on success, NULL on failure (PyErr set) + */ +PyObject *binary_reader_get_info(BinaryReader *reader); + +/* + * Close a binary reader and free all resources. + * + * Arguments: + * reader: Reader to close (may be NULL) + */ +void binary_reader_close(BinaryReader *reader); + +/* ============================================================================ + * STATISTICS FUNCTIONS + * ============================================================================ */ + +/* + * Get writer statistics as a Python dict. + * + * Arguments: + * writer: Writer to get stats from + * + * Returns: + * Dict with statistics on success, NULL on failure (PyErr set) + */ +PyObject *binary_writer_get_stats(BinaryWriter *writer); + +/* + * Get reader statistics as a Python dict. + * + * Arguments: + * reader: Reader to get stats from + * + * Returns: + * Dict with statistics on success, NULL on failure (PyErr set) + */ +PyObject *binary_reader_get_stats(BinaryReader *reader); + +/* ============================================================================ + * UTILITY FUNCTIONS + * ============================================================================ */ + +/* + * Check if zstd compression is available. + * + * Returns: + * 1 if zstd available, 0 otherwise + */ +int binary_io_zstd_available(void); + +/* + * Get the best available compression type. + * + * Returns: + * COMPRESSION_ZSTD if available, COMPRESSION_NONE otherwise + */ +int binary_io_get_best_compression(void); + +#ifdef __cplusplus +} +#endif + +#endif /* Py_BINARY_IO_H */ diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c new file mode 100644 index 00000000000000..10381e8b62bc00 --- /dev/null +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -0,0 +1,1139 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary Reader Implementation + * + * High-performance binary file reader for profiling data with optional zstd + * decompression. + ******************************************************************************/ + +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE +#endif + +#include "binary_io.h" +#include "_remote_debugging.h" +#include + +#ifdef HAVE_ZSTD +#include +#endif + +/* ============================================================================ + * CONSTANTS FOR BINARY FORMAT SIZES + * ============================================================================ */ + +/* File structure sizes */ +#define FILE_HEADER_PLACEHOLDER_SIZE 64 /* Placeholder written at file start */ +#define FILE_HEADER_SIZE 52 /* Actual header content size */ +#define FILE_FOOTER_SIZE 32 /* Footer size */ +#define MIN_DECOMPRESS_BUFFER_SIZE (64 * 1024) /* Minimum decompression buffer */ + +/* Progress callback frequency */ +#define PROGRESS_CALLBACK_INTERVAL 1000 + +/* Maximum decompression size limit (1GB) */ +#define MAX_DECOMPRESS_SIZE (1ULL << 30) + +/* ============================================================================ + * BINARY READER IMPLEMENTATION + * ============================================================================ */ + +static inline int +reader_parse_header(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + if (file_size < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_SetString(PyExc_ValueError, "File too small for header"); + return -1; + } + + /* Use memcpy to avoid strict aliasing violations and unaligned access */ + uint32_t magic; + uint32_t version; + memcpy(&magic, &data[0], sizeof(magic)); + memcpy(&version, &data[4], sizeof(version)); + + if (magic != BINARY_FORMAT_MAGIC) { + PyErr_Format(PyExc_ValueError, "Invalid magic number: 0x%08x", magic); + return -1; + } + + if (version != BINARY_FORMAT_VERSION) { + PyErr_Format(PyExc_ValueError, "Unsupported version: %u", version); + return -1; + } + + memcpy(&reader->start_time_us, &data[8], sizeof(reader->start_time_us)); + memcpy(&reader->sample_interval_us, &data[16], sizeof(reader->sample_interval_us)); + memcpy(&reader->sample_count, &data[24], sizeof(reader->sample_count)); + memcpy(&reader->thread_count, &data[28], sizeof(reader->thread_count)); + memcpy(&reader->string_table_offset, &data[32], sizeof(reader->string_table_offset)); + memcpy(&reader->frame_table_offset, &data[40], sizeof(reader->frame_table_offset)); + memcpy(&reader->compression_type, &data[48], sizeof(reader->compression_type)); + + return 0; +} + +static inline int +reader_parse_footer(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + if (file_size < FILE_FOOTER_SIZE) { + PyErr_SetString(PyExc_ValueError, "File too small for footer"); + return -1; + } + + const uint8_t *footer = data + file_size - FILE_FOOTER_SIZE; + /* Use memcpy to avoid strict aliasing violations */ + memcpy(&reader->strings_count, &footer[0], sizeof(reader->strings_count)); + memcpy(&reader->frames_count, &footer[4], sizeof(reader->frames_count)); + + return 0; +} + +#ifdef HAVE_ZSTD +/* Maximum decompression buffer size to prevent memory exhaustion (1GB) */ +#define MAX_DECOMPRESS_SIZE (1ULL << 30) + +static inline int +reader_decompress_samples(BinaryReader *reader, const uint8_t *data) +{ + size_t compressed_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE; + const uint8_t *compressed_data = data + FILE_HEADER_PLACEHOLDER_SIZE; + + /* Validate compressed data region */ + if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_SetString(PyExc_ValueError, "Invalid string table offset"); + return -1; + } + + ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) { + PyErr_SetString(PyExc_MemoryError, "Failed to create zstd decompression context"); + return -1; + } + + /* Try to get exact decompressed size from frame header for optimal allocation */ + unsigned long long frame_content_size = ZSTD_getFrameContentSize(compressed_data, compressed_size); + size_t alloc_size; + + if (frame_content_size == ZSTD_CONTENTSIZE_ERROR) { + /* Corrupted frame header - fail early */ + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_ValueError, "Corrupted zstd frame header"); + return -1; + } else if (frame_content_size != ZSTD_CONTENTSIZE_UNKNOWN && + frame_content_size <= SIZE_MAX && + frame_content_size <= MAX_DECOMPRESS_SIZE) { + alloc_size = (size_t)frame_content_size; + } else { + alloc_size = ZSTD_DStreamOutSize() * 4; + if (alloc_size < MIN_DECOMPRESS_BUFFER_SIZE) { + alloc_size = MIN_DECOMPRESS_BUFFER_SIZE; + } + } + + reader->decompressed_data = PyMem_Malloc(alloc_size); + if (!reader->decompressed_data) { + ZSTD_freeDCtx(dctx); + PyErr_NoMemory(); + return -1; + } + + ZSTD_inBuffer input = { compressed_data, compressed_size, 0 }; + size_t total_output = 0; + size_t last_result = 0; + + while (input.pos < input.size) { + if (total_output >= alloc_size) { + /* Check for overflow before doubling */ + if (alloc_size > SIZE_MAX / 2 || alloc_size * 2 > MAX_DECOMPRESS_SIZE) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_MemoryError, "Decompressed data exceeds maximum size"); + return -1; + } + size_t new_size = alloc_size * 2; + uint8_t *new_buf = PyMem_Realloc(reader->decompressed_data, new_size); + if (!new_buf) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_NoMemory(); + return -1; + } + reader->decompressed_data = new_buf; + alloc_size = new_size; + } + + ZSTD_outBuffer output = { + reader->decompressed_data + total_output, + alloc_size - total_output, + 0 + }; + + last_result = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(last_result)) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_Format(PyExc_ValueError, "zstd decompression error: %s", + ZSTD_getErrorName(last_result)); + return -1; + } + + total_output += output.pos; + } + + /* Verify decompression is complete (last_result == 0 means frame is complete) */ + if (last_result != 0) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_ValueError, "Incomplete zstd frame: data may be truncated"); + return -1; + } + + ZSTD_freeDCtx(dctx); + reader->decompressed_size = total_output; + reader->sample_data = reader->decompressed_data; + reader->sample_data_size = reader->decompressed_size; + + return 0; +} +#endif + +static inline int +reader_parse_string_table(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + reader->strings = PyMem_Calloc(reader->strings_count, sizeof(PyObject *)); + if (!reader->strings && reader->strings_count > 0) { + PyErr_NoMemory(); + return -1; + } + + size_t offset = reader->string_table_offset; + for (uint32_t i = 0; i < reader->strings_count; i++) { + size_t prev_offset = offset; + uint32_t str_len = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in string table"); + return -1; + } + if (offset + str_len > file_size) { + PyErr_SetString(PyExc_ValueError, "String table overflow"); + return -1; + } + + reader->strings[i] = PyUnicode_DecodeUTF8((char *)&data[offset], str_len, "replace"); + if (!reader->strings[i]) { + return -1; + } + offset += str_len; + } + + return 0; +} + +static inline int +reader_parse_frame_table(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + /* Check for integer overflow in allocation size calculation. + Only needed on 32-bit where SIZE_MAX can be exceeded by uint32_t * 12. */ +#if SIZEOF_SIZE_T < 8 + if (reader->frames_count > SIZE_MAX / (3 * sizeof(uint32_t))) { + PyErr_SetString(PyExc_OverflowError, "Frame count too large for allocation"); + return -1; + } +#endif + + size_t alloc_size = (size_t)reader->frames_count * 3 * sizeof(uint32_t); + reader->frame_data = PyMem_Malloc(alloc_size); + if (!reader->frame_data && reader->frames_count > 0) { + PyErr_NoMemory(); + return -1; + } + + size_t offset = reader->frame_table_offset; + for (uint32_t i = 0; i < reader->frames_count; i++) { + size_t base = (size_t)i * 3; + size_t prev_offset; + + prev_offset = offset; + reader->frame_data[base] = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (filename)"); + return -1; + } + + prev_offset = offset; + reader->frame_data[base + 1] = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (funcname)"); + return -1; + } + + prev_offset = offset; + reader->frame_data[base + 2] = (uint32_t)decode_varint_i32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (lineno)"); + return -1; + } + } + + return 0; +} + +BinaryReader * +binary_reader_open(const char *filename) +{ + BinaryReader *reader = PyMem_Calloc(1, sizeof(BinaryReader)); + if (!reader) { + PyErr_NoMemory(); + return NULL; + } + +#if USE_MMAP + reader->fd = -1; /* Explicit initialization for cleanup safety */ +#endif + + reader->filename = PyMem_Malloc(strlen(filename) + 1); + if (!reader->filename) { + PyMem_Free(reader); + PyErr_NoMemory(); + return NULL; + } + strcpy(reader->filename, filename); + +#if USE_MMAP + /* Open with mmap on Unix */ + reader->fd = open(filename, O_RDONLY); + if (reader->fd < 0) { + PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); + goto error; + } + + struct stat st; + if (fstat(reader->fd, &st) < 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + reader->mapped_size = st.st_size; + + /* Map the file into memory. + * MAP_POPULATE (Linux-only) pre-faults all pages at mmap time, which: + * - Catches issues (e.g., file truncation) immediately rather than as SIGBUS during reads + * - Eliminates page faults during subsequent reads for better performance + */ +#ifdef __linux__ + reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ, + MAP_PRIVATE | MAP_POPULATE, reader->fd, 0); +#else + reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ, + MAP_PRIVATE, reader->fd, 0); +#endif + if (reader->mapped_data == MAP_FAILED) { + reader->mapped_data = NULL; + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + /* Hint sequential access pattern - failures are non-fatal */ + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_SEQUENTIAL); + + /* Pre-fetch pages into memory - failures are non-fatal. + * Complements MAP_POPULATE on Linux, provides benefit on macOS. */ + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_WILLNEED); + + /* Use transparent huge pages for large files to reduce TLB misses. + * Only beneficial for files >= 32MB where TLB pressure matters. */ +#ifdef MADV_HUGEPAGE + if (reader->mapped_size >= (32 * 1024 * 1024)) { + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_HUGEPAGE); + } +#endif + + /* Add file descriptor-level hints for better kernel I/O scheduling */ +#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL) + (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_SEQUENTIAL); + if (reader->mapped_size > (64 * 1024 * 1024)) { + (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_WILLNEED); + } +#endif + + uint8_t *data = reader->mapped_data; + size_t file_size = reader->mapped_size; +#else + /* Use stdio on Windows */ + reader->fp = fopen(filename, "rb"); + if (!reader->fp) { + PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); + goto error; + } + + if (FSEEK64(reader->fp, 0, SEEK_END) != 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + file_offset_t file_size_off = FTELL64(reader->fp); + if (file_size_off < 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + reader->file_size = (size_t)file_size_off; + if (FSEEK64(reader->fp, 0, SEEK_SET) != 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + reader->file_data = PyMem_Malloc(reader->file_size); + if (!reader->file_data) { + PyErr_NoMemory(); + goto error; + } + + if (fread(reader->file_data, 1, reader->file_size, reader->fp) != reader->file_size) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + uint8_t *data = reader->file_data; + size_t file_size = reader->file_size; +#endif + + /* Parse header and footer */ + if (reader_parse_header(reader, data, file_size) < 0) { + goto error; + } + if (reader_parse_footer(reader, data, file_size) < 0) { + goto error; + } + + /* Validate table offsets are within file bounds */ + if (reader->string_table_offset > file_size) { + PyErr_Format(PyExc_ValueError, + "Invalid string table offset: %llu exceeds file size %zu", + (unsigned long long)reader->string_table_offset, file_size); + goto error; + } + if (reader->frame_table_offset > file_size) { + PyErr_Format(PyExc_ValueError, + "Invalid frame table offset: %llu exceeds file size %zu", + (unsigned long long)reader->frame_table_offset, file_size); + goto error; + } + if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_Format(PyExc_ValueError, + "Invalid string table offset: %llu is before data section", + (unsigned long long)reader->string_table_offset); + goto error; + } + if (reader->frame_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_Format(PyExc_ValueError, + "Invalid frame table offset: %llu is before data section", + (unsigned long long)reader->frame_table_offset); + goto error; + } + if (reader->string_table_offset > reader->frame_table_offset) { + PyErr_Format(PyExc_ValueError, + "Invalid table offsets: string table (%llu) is after frame table (%llu)", + (unsigned long long)reader->string_table_offset, + (unsigned long long)reader->frame_table_offset); + goto error; + } + + /* Handle compressed data */ + if (reader->compression_type == COMPRESSION_ZSTD) { +#ifdef HAVE_ZSTD + if (reader_decompress_samples(reader, data) < 0) { + goto error; + } +#else + PyErr_SetString(PyExc_RuntimeError, + "File uses zstd compression but zstd support not compiled in"); + goto error; +#endif + } else { + reader->sample_data = data + FILE_HEADER_PLACEHOLDER_SIZE; + reader->sample_data_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE; + } + + /* Parse string and frame tables */ + if (reader_parse_string_table(reader, data, file_size) < 0) { + goto error; + } + if (reader_parse_frame_table(reader, data, file_size) < 0) { + goto error; + } + + return reader; + +error: + binary_reader_close(reader); + return NULL; +} + +/* Get or create reader thread state for stack reconstruction */ +static ReaderThreadState * +reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id, + uint32_t interpreter_id) +{ + /* Search existing threads (key is thread_id + interpreter_id) */ + for (size_t i = 0; i < reader->thread_state_count; i++) { + if (reader->thread_states[i].thread_id == thread_id && + reader->thread_states[i].interpreter_id == interpreter_id) { + return &reader->thread_states[i]; + } + } + + if (!reader->thread_states) { + reader->thread_state_capacity = 16; + reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState)); + if (!reader->thread_states) { + PyErr_NoMemory(); + return NULL; + } + } else if (reader->thread_state_count >= reader->thread_state_capacity) { + reader->thread_states = grow_array(reader->thread_states, + &reader->thread_state_capacity, + sizeof(ReaderThreadState)); + if (!reader->thread_states) { + return NULL; + } + } + + ReaderThreadState *ts = &reader->thread_states[reader->thread_state_count++]; + memset(ts, 0, sizeof(ReaderThreadState)); + ts->thread_id = thread_id; + ts->interpreter_id = interpreter_id; + ts->prev_timestamp = reader->start_time_us; + ts->current_stack_capacity = MAX_STACK_DEPTH; + ts->current_stack = PyMem_Malloc(ts->current_stack_capacity * sizeof(uint32_t)); + if (!ts->current_stack) { + PyErr_NoMemory(); + return NULL; + } + return ts; +} + +/* ============================================================================ + * STACK DECODING HELPERS + * ============================================================================ */ + +/* Decode a full stack from sample data. + * Updates ts->current_stack and ts->current_stack_depth. + * Returns 0 on success, -1 on error (bounds violation). */ +static inline int +decode_stack_full(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t depth = decode_varint_u32(data, offset, max_size); + + /* Validate depth against capacity to prevent buffer overflow */ + if (depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Stack depth %u exceeds capacity %zu", depth, ts->current_stack_capacity); + return -1; + } + + ts->current_stack_depth = depth; + for (uint32_t i = 0; i < depth; i++) { + ts->current_stack[i] = decode_varint_u32(data, offset, max_size); + } + return 0; +} + +/* Decode a suffix-encoded stack from sample data. + * The suffix encoding shares frames from the bottom of the previous stack. + * Returns 0 on success, -1 on error (bounds violation). */ +static inline int +decode_stack_suffix(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t shared = decode_varint_u32(data, offset, max_size); + uint32_t new_count = decode_varint_u32(data, offset, max_size); + + /* Validate shared doesn't exceed current stack depth */ + if (shared > ts->current_stack_depth) { + PyErr_Format(PyExc_ValueError, + "Shared count %u exceeds current stack depth %zu", + shared, ts->current_stack_depth); + return -1; + } + + /* Validate final depth doesn't exceed capacity */ + size_t final_depth = (size_t)shared + new_count; + if (final_depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Final stack depth %zu exceeds capacity %zu", + final_depth, ts->current_stack_capacity); + return -1; + } + + /* Move shared frames (from bottom of stack) to make room for new frames at the top */ + if (new_count > 0 && shared > 0) { + size_t prev_shared_start = ts->current_stack_depth - shared; + memmove(&ts->current_stack[new_count], + &ts->current_stack[prev_shared_start], + shared * sizeof(uint32_t)); + } + + for (uint32_t i = 0; i < new_count; i++) { + ts->current_stack[i] = decode_varint_u32(data, offset, max_size); + } + ts->current_stack_depth = final_depth; + return 0; +} + +/* Decode a pop-push encoded stack from sample data. + * Pops frames from the top and pushes new frames. + * Returns 0 on success, -1 on error (bounds violation). */ +static inline int +decode_stack_pop_push(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t pop = decode_varint_u32(data, offset, max_size); + uint32_t push = decode_varint_u32(data, offset, max_size); + size_t keep = (ts->current_stack_depth > pop) ? ts->current_stack_depth - pop : 0; + + /* Validate final depth doesn't exceed capacity */ + size_t final_depth = keep + push; + if (final_depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Final stack depth %zu exceeds capacity %zu", + final_depth, ts->current_stack_capacity); + return -1; + } + + /* Move kept frames (from bottom of stack) to make room for new frames at the top. + * Even when push == 0, we need to move kept frames to index 0 if pop > 0. */ + if (keep > 0) { + memmove(&ts->current_stack[push], + &ts->current_stack[pop], + keep * sizeof(uint32_t)); + } + + for (uint32_t i = 0; i < push; i++) { + ts->current_stack[i] = decode_varint_u32(data, offset, max_size); + } + ts->current_stack_depth = final_depth; + return 0; +} + +/* Build a Python list of FrameInfo objects from frame indices */ +static PyObject * +build_frame_list(RemoteDebuggingState *state, BinaryReader *reader, + const uint32_t *frame_indices, size_t stack_depth) +{ + PyObject *frame_list = PyList_New(stack_depth); + if (!frame_list) { + return NULL; + } + + for (size_t k = 0; k < stack_depth; k++) { + uint32_t frame_idx = frame_indices[k]; + if (frame_idx >= reader->frames_count) { + PyErr_Format(PyExc_ValueError, "Invalid frame index: %u", frame_idx); + goto error; + } + + size_t base = frame_idx * 3; + uint32_t filename_idx = reader->frame_data[base]; + uint32_t funcname_idx = reader->frame_data[base + 1]; + int32_t lineno = (int32_t)reader->frame_data[base + 2]; + + if (filename_idx >= reader->strings_count || + funcname_idx >= reader->strings_count) { + PyErr_SetString(PyExc_ValueError, "Invalid string index in frame"); + goto error; + } + + PyObject *frame_info = PyStructSequence_New(state->FrameInfo_Type); + if (!frame_info) { + goto error; + } + + PyObject *location; + if (lineno > 0) { + location = Py_BuildValue("(iiii)", lineno, lineno, 0, 0); + if (!location) { + Py_DECREF(frame_info); + goto error; + } + } + else { + location = Py_NewRef(Py_None); + } + + PyStructSequence_SetItem(frame_info, 0, Py_NewRef(reader->strings[filename_idx])); + PyStructSequence_SetItem(frame_info, 1, location); + PyStructSequence_SetItem(frame_info, 2, Py_NewRef(reader->strings[funcname_idx])); + PyStructSequence_SetItem(frame_info, 3, Py_NewRef(Py_None)); + PyList_SET_ITEM(frame_list, k, frame_info); + } + + return frame_list; + +error: + Py_DECREF(frame_list); + return NULL; +} + +/* Helper to build sample_list from frame indices (shared by emit functions) */ +static PyObject * +build_sample_list(RemoteDebuggingState *state, BinaryReader *reader, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth) +{ + PyObject *frame_list = NULL, *thread_info = NULL, *thread_list = NULL; + PyObject *interp_info = NULL, *sample_list = NULL; + + frame_list = build_frame_list(state, reader, frame_indices, stack_depth); + if (!frame_list) { + goto error; + } + + thread_info = PyStructSequence_New(state->ThreadInfo_Type); + if (!thread_info) { + goto error; + } + PyObject *tid = PyLong_FromUnsignedLongLong(thread_id); + if (!tid) { + goto error; + } + PyObject *st = PyLong_FromLong(status); + if (!st) { + Py_DECREF(tid); + goto error; + } + PyStructSequence_SetItem(thread_info, 0, tid); + PyStructSequence_SetItem(thread_info, 1, st); + PyStructSequence_SetItem(thread_info, 2, frame_list); + frame_list = NULL; /* ownership transferred */ + + thread_list = PyList_New(1); + if (!thread_list) { + goto error; + } + PyList_SET_ITEM(thread_list, 0, thread_info); + thread_info = NULL; + + interp_info = PyStructSequence_New(state->InterpreterInfo_Type); + if (!interp_info) { + goto error; + } + PyObject *iid = PyLong_FromUnsignedLong(interpreter_id); + if (!iid) { + goto error; + } + PyStructSequence_SetItem(interp_info, 0, iid); + PyStructSequence_SetItem(interp_info, 1, thread_list); + thread_list = NULL; + + sample_list = PyList_New(1); + if (!sample_list) { + goto error; + } + PyList_SET_ITEM(sample_list, 0, interp_info); + return sample_list; + +error: + Py_XDECREF(sample_list); + Py_XDECREF(interp_info); + Py_XDECREF(thread_list); + Py_XDECREF(thread_info); + Py_XDECREF(frame_list); + return NULL; +} + +/* Helper to emit a sample to the collector. timestamps_list is borrowed. */ +static int +emit_sample(RemoteDebuggingState *state, PyObject *collector, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth, + BinaryReader *reader, PyObject *timestamps_list) +{ + PyObject *sample_list = build_sample_list(state, reader, thread_id, + interpreter_id, status, + frame_indices, stack_depth); + if (!sample_list) { + return -1; + } + + PyObject *result = PyObject_CallMethod(collector, "collect", "OO", sample_list, timestamps_list); + Py_DECREF(sample_list); + + if (!result) { + return -1; + } + Py_DECREF(result); + return 0; +} + +/* Helper to trim timestamp list and emit batch. Returns 0 on success, -1 on error. */ +static int +emit_batch(RemoteDebuggingState *state, PyObject *collector, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth, + BinaryReader *reader, PyObject *timestamps_list, Py_ssize_t actual_size) +{ + /* Trim list to actual size */ + if (PyList_SetSlice(timestamps_list, actual_size, PyList_GET_SIZE(timestamps_list), NULL) < 0) { + return -1; + } + return emit_sample(state, collector, thread_id, interpreter_id, status, + frame_indices, stack_depth, reader, timestamps_list); +} + +/* Helper to invoke progress callback, clearing any errors */ +static inline void +invoke_progress_callback(PyObject *callback, Py_ssize_t current, uint32_t total) +{ + if (callback && callback != Py_None) { + PyObject *result = PyObject_CallFunction(callback, "nI", current, total); + if (result) { + Py_DECREF(result); + } else { + PyErr_Clear(); + } + } +} + +Py_ssize_t +binary_reader_replay(BinaryReader *reader, PyObject *collector, PyObject *progress_callback) +{ + if (!PyObject_HasAttrString(collector, "collect")) { + PyErr_SetString(PyExc_TypeError, "Collector must have a collect() method"); + return -1; + } + + /* Get module state for struct sequence types */ + PyObject *module = PyImport_ImportModule("_remote_debugging"); + if (!module) { + return -1; + } + RemoteDebuggingState *state = RemoteDebugging_GetState(module); + Py_DECREF(module); + + if (!state) { + PyErr_SetString(PyExc_RuntimeError, "Failed to get module state"); + return -1; + } + + size_t offset = 0; + Py_ssize_t replayed = 0; + + /* Initial progress callback at 0% */ + invoke_progress_callback(progress_callback, 0, reader->sample_count); + + while (offset < reader->sample_data_size) { + /* Read thread_id (8 bytes) + interpreter_id (4 bytes) */ + if (offset + 13 > reader->sample_data_size) { + break; /* End of data */ + } + + /* Use memcpy to avoid strict aliasing violations */ + uint64_t thread_id; + uint32_t interpreter_id; + memcpy(&thread_id, &reader->sample_data[offset], sizeof(thread_id)); + offset += 8; + + memcpy(&interpreter_id, &reader->sample_data[offset], sizeof(interpreter_id)); + offset += 4; + + /* Get or create thread state for reconstruction */ + ReaderThreadState *ts = reader_get_or_create_thread_state(reader, thread_id, interpreter_id); + if (!ts) { + return -1; + } + + /* Read encoding byte */ + uint8_t encoding = reader->sample_data[offset++]; + + switch (encoding) { + case STACK_REPEAT: { + /* RLE repeat: [count: varint] [delta: varint, status: 1]... */ + size_t prev_offset = offset; + uint32_t count = decode_varint_u32(reader->sample_data, &offset, reader->sample_data_size); + /* Detect varint decode failure */ + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint for RLE count"); + return -1; + } + + /* Validate RLE count to prevent DoS from malicious files. + * Each RLE sample needs at least 2 bytes (1 byte min varint + 1 status byte). + * Also reject absurdly large counts that would exhaust memory. */ + size_t remaining_data = reader->sample_data_size - offset; + size_t max_possible_samples = remaining_data / 2; + if (count > max_possible_samples) { + PyErr_Format(PyExc_ValueError, + "Invalid RLE count %u exceeds maximum possible %zu for remaining data", + count, max_possible_samples); + return -1; + } + + reader->stats.repeat_records++; + reader->stats.repeat_samples += count; + + /* Process RLE samples, batching by status */ + PyObject *timestamps_list = NULL; + uint8_t batch_status = 0; + Py_ssize_t batch_idx = 0; + + for (uint32_t i = 0; i < count; i++) { + size_t delta_prev_offset = offset; + uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size); + if (offset == delta_prev_offset) { + Py_XDECREF(timestamps_list); + PyErr_SetString(PyExc_ValueError, "Malformed varint in RLE sample data"); + return -1; + } + if (offset >= reader->sample_data_size) { + Py_XDECREF(timestamps_list); + PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data in RLE"); + return -1; + } + uint8_t status = reader->sample_data[offset++]; + ts->prev_timestamp += delta; + + /* Start new batch on first sample or status change */ + if (i == 0 || status != batch_status) { + if (timestamps_list) { + int rc = emit_batch(state, collector, thread_id, interpreter_id, + batch_status, ts->current_stack, ts->current_stack_depth, + reader, timestamps_list, batch_idx); + Py_DECREF(timestamps_list); + if (rc < 0) { + return -1; + } + } + timestamps_list = PyList_New(count - i); + if (!timestamps_list) { + return -1; + } + batch_status = status; + batch_idx = 0; + } + + PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp); + if (!ts_obj) { + Py_DECREF(timestamps_list); + return -1; + } + PyList_SET_ITEM(timestamps_list, batch_idx++, ts_obj); + } + + /* Emit final batch */ + if (timestamps_list) { + int rc = emit_batch(state, collector, thread_id, interpreter_id, + batch_status, ts->current_stack, ts->current_stack_depth, + reader, timestamps_list, batch_idx); + Py_DECREF(timestamps_list); + if (rc < 0) { + return -1; + } + } + + replayed += count; + reader->stats.total_samples += count; + + /* Progress callback after batch */ + if (replayed % PROGRESS_CALLBACK_INTERVAL < count) { + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + } + break; + } + + case STACK_FULL: + case STACK_SUFFIX: + case STACK_POP_PUSH: { + /* All three encodings share: [delta: varint] [status: 1] ... */ + size_t prev_offset = offset; + uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size); + /* Detect varint decode failure: offset unchanged means error */ + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in sample data"); + return -1; + } + if (offset >= reader->sample_data_size) { + PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data"); + return -1; + } + uint8_t status = reader->sample_data[offset++]; + ts->prev_timestamp += delta; + + if (encoding == STACK_FULL) { + if (decode_stack_full(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.full_records++; + } else if (encoding == STACK_SUFFIX) { + if (decode_stack_suffix(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.suffix_records++; + } else { /* STACK_POP_PUSH */ + if (decode_stack_pop_push(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.pop_push_records++; + } + reader->stats.stack_reconstructions++; + + /* Build single-element timestamp list */ + PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp); + if (!ts_obj) { + return -1; + } + PyObject *timestamps_list = PyList_New(1); + if (!timestamps_list) { + Py_DECREF(ts_obj); + return -1; + } + PyList_SET_ITEM(timestamps_list, 0, ts_obj); + + if (emit_sample(state, collector, thread_id, interpreter_id, status, + ts->current_stack, ts->current_stack_depth, reader, + timestamps_list) < 0) { + Py_DECREF(timestamps_list); + return -1; + } + Py_DECREF(timestamps_list); + replayed++; + reader->stats.total_samples++; + break; + } + + default: + PyErr_Format(PyExc_ValueError, "Unknown stack encoding: %u", encoding); + return -1; + } + + /* Progress callback */ + if (replayed % PROGRESS_CALLBACK_INTERVAL == 0) { + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + } + } + + /* Final progress callback at 100% */ + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + + return replayed; +} + +PyObject * +binary_reader_get_info(BinaryReader *reader) +{ + return Py_BuildValue( + "{s:I, s:K, s:K, s:I, s:I, s:I, s:I, s:i}", + "version", BINARY_FORMAT_VERSION, + "start_time_us", reader->start_time_us, + "sample_interval_us", reader->sample_interval_us, + "sample_count", reader->sample_count, + "thread_count", reader->thread_count, + "string_count", reader->strings_count, + "frame_count", reader->frames_count, + "compression_type", reader->compression_type + ); +} + +PyObject * +binary_writer_get_stats(BinaryWriter *writer) +{ + BinaryWriterStats *s = &writer->stats; + + /* Calculate derived stats */ + uint64_t total_records = s->repeat_records + s->full_records + + s->suffix_records + s->pop_push_records; + uint64_t total_samples = writer->total_samples; + uint64_t potential_frames = s->total_frames_written + s->frames_saved; + double compression_ratio = (potential_frames > 0) ? + (double)s->frames_saved / potential_frames * 100.0 : 0.0; + + return Py_BuildValue( + "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:d}", + "repeat_records", s->repeat_records, + "repeat_samples", s->repeat_samples, + "full_records", s->full_records, + "suffix_records", s->suffix_records, + "pop_push_records", s->pop_push_records, + "total_records", total_records, + "total_samples", total_samples, + "total_frames_written", s->total_frames_written, + "frames_saved", s->frames_saved, + "bytes_written", s->bytes_written, + "frame_compression_pct", compression_ratio + ); +} + +PyObject * +binary_reader_get_stats(BinaryReader *reader) +{ + BinaryReaderStats *s = &reader->stats; + + uint64_t total_records = s->repeat_records + s->full_records + + s->suffix_records + s->pop_push_records; + + return Py_BuildValue( + "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K}", + "repeat_records", s->repeat_records, + "repeat_samples", s->repeat_samples, + "full_records", s->full_records, + "suffix_records", s->suffix_records, + "pop_push_records", s->pop_push_records, + "total_records", total_records, + "total_samples", s->total_samples, + "stack_reconstructions", s->stack_reconstructions + ); +} + +void +binary_reader_close(BinaryReader *reader) +{ + if (!reader) { + return; + } + + PyMem_Free(reader->filename); + +#if USE_MMAP + if (reader->mapped_data) { + munmap(reader->mapped_data, reader->mapped_size); + reader->mapped_data = NULL; /* Prevent use-after-free */ + reader->mapped_size = 0; + } + if (reader->fd >= 0) { + close(reader->fd); + reader->fd = -1; /* Mark as closed */ + } +#else + if (reader->fp) { + fclose(reader->fp); + reader->fp = NULL; + } + if (reader->file_data) { + PyMem_Free(reader->file_data); + reader->file_data = NULL; + reader->file_size = 0; + } +#endif + + PyMem_Free(reader->decompressed_data); + + if (reader->strings) { + for (uint32_t i = 0; i < reader->strings_count; i++) { + Py_XDECREF(reader->strings[i]); + } + PyMem_Free(reader->strings); + } + + PyMem_Free(reader->frame_data); + + if (reader->thread_states) { + for (size_t i = 0; i < reader->thread_state_count; i++) { + PyMem_Free(reader->thread_states[i].current_stack); + } + PyMem_Free(reader->thread_states); + } + + PyMem_Free(reader); +} diff --git a/Modules/_remote_debugging/binary_io_writer.c b/Modules/_remote_debugging/binary_io_writer.c new file mode 100644 index 00000000000000..fbcdea5cbe526b --- /dev/null +++ b/Modules/_remote_debugging/binary_io_writer.c @@ -0,0 +1,1149 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary Writer Implementation + * + * High-performance binary file writer for profiling data with optional zstd + * streaming compression. + ******************************************************************************/ + +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE +#endif + +#include "binary_io.h" +#include "_remote_debugging.h" +#include + +#ifdef HAVE_ZSTD +#include +#endif + +/* ============================================================================ + * CONSTANTS FOR BINARY FORMAT SIZES + * ============================================================================ */ + +/* Sample header sizes */ +#define SAMPLE_HEADER_FIXED_SIZE 13 /* thread_id(8) + interpreter_id(4) + encoding(1) */ +#define SAMPLE_HEADER_MAX_SIZE 26 /* fixed + max_varint(10) + status(1) + margin */ +#define MAX_VARINT_SIZE 10 /* Maximum bytes for a varint64 */ +#define MAX_VARINT_SIZE_U32 5 /* Maximum bytes for a varint32 */ +/* Frame buffer: depth varint (max 2 bytes for 256) + 256 frames * 5 bytes/varint + margin */ +#define MAX_FRAME_BUFFER_SIZE ((MAX_STACK_DEPTH * MAX_VARINT_SIZE_U32) + MAX_VARINT_SIZE_U32 + 16) + +/* File structure sizes */ +#define FILE_HEADER_PLACEHOLDER_SIZE 64 /* Placeholder written at file start */ +#define FILE_HEADER_SIZE 52 /* Actual header content size */ +#define FILE_FOOTER_SIZE 32 /* Footer size */ + +/* ============================================================================ + * WRITER-SPECIFIC UTILITY HELPERS + * ============================================================================ */ + +/* Grow two parallel arrays together (e.g., strings and string_lengths). + * Returns 0 on success, -1 on error (sets PyErr). + * On error, original arrays are preserved (truly atomic update). */ +static inline int +grow_parallel_arrays(void **array1, void **array2, size_t *capacity, + size_t elem_size1, size_t elem_size2) +{ + size_t old_cap = *capacity; + + if (old_cap > SIZE_MAX / 2) { + PyErr_SetString(PyExc_OverflowError, "Array capacity overflow"); + return -1; + } + size_t new_cap = old_cap * 2; + + if (new_cap > SIZE_MAX / elem_size1 || new_cap > SIZE_MAX / elem_size2) { + PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow"); + return -1; + } + + size_t new_size1 = new_cap * elem_size1; + size_t new_size2 = new_cap * elem_size2; + size_t old_size1 = old_cap * elem_size1; + size_t old_size2 = old_cap * elem_size2; + + /* Allocate fresh memory blocks (not realloc) to ensure atomicity. + * If either allocation fails, original arrays are completely unchanged. */ + void *new_array1 = PyMem_Malloc(new_size1); + if (!new_array1) { + PyErr_NoMemory(); + return -1; + } + + void *new_array2 = PyMem_Malloc(new_size2); + if (!new_array2) { + /* Second allocation failed - free first and return with no state change */ + PyMem_Free(new_array1); + PyErr_NoMemory(); + return -1; + } + + /* Both allocations succeeded - copy data and update pointers atomically */ + memcpy(new_array1, *array1, old_size1); + memcpy(new_array2, *array2, old_size2); + + PyMem_Free(*array1); + PyMem_Free(*array2); + + *array1 = new_array1; + *array2 = new_array2; + *capacity = new_cap; + return 0; +} + +/* Checked fwrite with GIL release - returns 0 on success, -1 on error (sets PyErr). + * This version releases the GIL during the write operation to allow other Python + * threads to run during potentially blocking I/O. */ +static inline int +fwrite_checked_allow_threads(const void *data, size_t size, FILE *fp) +{ + size_t written; + Py_BEGIN_ALLOW_THREADS + written = fwrite(data, 1, size, fp); + Py_END_ALLOW_THREADS + if (written != size) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + return 0; +} + +/* Forward declaration for writer_write_bytes */ +static inline int writer_write_bytes(BinaryWriter *writer, const void *data, size_t size); + +/* Encode and write a varint u32 - returns 0 on success, -1 on error */ +static inline int +writer_write_varint_u32(BinaryWriter *writer, uint32_t value) +{ + uint8_t buf[MAX_VARINT_SIZE]; + size_t len = encode_varint_u32(buf, value); + return writer_write_bytes(writer, buf, len); +} + +/* Encode and write a varint u64 - returns 0 on success, -1 on error */ +static inline int +writer_write_varint_u64(BinaryWriter *writer, uint64_t value) +{ + uint8_t buf[MAX_VARINT_SIZE]; + size_t len = encode_varint_u64(buf, value); + return writer_write_bytes(writer, buf, len); +} + + +/* ============================================================================ + * UTILITY FUNCTIONS + * ============================================================================ */ + +int +binary_io_zstd_available(void) +{ +#ifdef HAVE_ZSTD + return 1; +#else + return 0; +#endif +} + +int +binary_io_get_best_compression(void) +{ +#ifdef HAVE_ZSTD + return COMPRESSION_ZSTD; +#else + return COMPRESSION_NONE; +#endif +} + +/* ============================================================================ + * BINARY WRITER IMPLEMENTATION + * ============================================================================ */ + +static int +writer_init_zstd(BinaryWriter *writer) +{ +#ifdef HAVE_ZSTD + writer->zstd.cctx = ZSTD_createCCtx(); + if (!writer->zstd.cctx) { + PyErr_SetString(PyExc_MemoryError, "Failed to create zstd compression context"); + return -1; + } + + /* Compression level 5: better ratio for repetitive profiling data */ + size_t result = ZSTD_CCtx_setParameter(writer->zstd.cctx, + ZSTD_c_compressionLevel, 5); + if (ZSTD_isError(result)) { + PyErr_Format(PyExc_RuntimeError, "Failed to set zstd compression level: %s", + ZSTD_getErrorName(result)); + ZSTD_freeCCtx(writer->zstd.cctx); + writer->zstd.cctx = NULL; + return -1; + } + + /* Use large buffer (512KB) for fewer I/O syscalls */ + writer->zstd.compressed_buffer = PyMem_Malloc(COMPRESSED_BUFFER_SIZE); + if (!writer->zstd.compressed_buffer) { + ZSTD_freeCCtx(writer->zstd.cctx); + writer->zstd.cctx = NULL; + PyErr_NoMemory(); + return -1; + } + writer->zstd.compressed_buffer_size = COMPRESSED_BUFFER_SIZE; + + return 0; +#else + PyErr_SetString(PyExc_RuntimeError, + "zstd compression requested but not available (HAVE_ZSTD not defined)"); + return -1; +#endif +} + +static int +writer_flush_buffer(BinaryWriter *writer) +{ + if (writer->buffer_pos == 0) { + return 0; + } + +#ifdef HAVE_ZSTD + if (writer->compression_type == COMPRESSION_ZSTD) { + ZSTD_inBuffer input = { writer->write_buffer, writer->buffer_pos, 0 }; + + while (input.pos < input.size) { + ZSTD_outBuffer output = { + writer->zstd.compressed_buffer, + writer->zstd.compressed_buffer_size, + 0 + }; + + size_t result = ZSTD_compressStream2( + writer->zstd.cctx, &output, &input, ZSTD_e_continue + ); + + if (ZSTD_isError(result)) { + PyErr_Format(PyExc_IOError, "zstd compression error: %s", + ZSTD_getErrorName(result)); + return -1; + } + + if (output.pos > 0) { + if (fwrite_checked_allow_threads(writer->zstd.compressed_buffer, output.pos, writer->fp) < 0) { + return -1; + } + } + } + } else +#endif + { + if (fwrite_checked_allow_threads(writer->write_buffer, writer->buffer_pos, writer->fp) < 0) { + return -1; + } + } + + writer->buffer_pos = 0; + return 0; +} + +static inline int +writer_write_bytes(BinaryWriter *writer, const void *data, size_t size) +{ + const uint8_t *src = (const uint8_t *)data; + size_t original_size = size; + + while (size > 0) { + size_t space = writer->buffer_size - writer->buffer_pos; + size_t to_copy = (size < space) ? size : space; + + memcpy(writer->write_buffer + writer->buffer_pos, src, to_copy); + writer->buffer_pos += to_copy; + src += to_copy; + size -= to_copy; + + if (writer->buffer_pos == writer->buffer_size) { + if (writer_flush_buffer(writer) < 0) { + return -1; + } + } + } + + writer->stats.bytes_written += original_size; + return 0; +} + +/* ============================================================================ + * HASH TABLE SUPPORT FUNCTIONS (using _Py_hashtable) + * ============================================================================ */ + +/* Hash function for Python strings - uses Python's cached hash */ +static Py_uhash_t +string_hash_func(const void *key) +{ + PyObject *str = (PyObject *)key; + Py_hash_t hash = PyObject_Hash(str); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + return (Py_uhash_t)hash; +} + +static int +string_compare_func(const void *key1, const void *key2) +{ + PyObject *str1 = (PyObject *)key1; + PyObject *str2 = (PyObject *)key2; + if (str1 == str2) { + return 1; + } + int result = PyObject_RichCompareBool(str1, str2, Py_EQ); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +static void +string_key_destroy(void *key) +{ + Py_XDECREF((PyObject *)key); +} + +static Py_uhash_t +frame_key_hash_func(const void *key) +{ + const FrameKey *fk = (const FrameKey *)key; + /* FNV-1a style hash combining all three values */ + Py_uhash_t hash = 2166136261u; + hash ^= fk->filename_idx; + hash *= 16777619u; + hash ^= fk->funcname_idx; + hash *= 16777619u; + hash ^= (uint32_t)fk->lineno; + hash *= 16777619u; + return hash; +} + +static int +frame_key_compare_func(const void *key1, const void *key2) +{ + const FrameKey *fk1 = (const FrameKey *)key1; + const FrameKey *fk2 = (const FrameKey *)key2; + return (fk1->filename_idx == fk2->filename_idx && + fk1->funcname_idx == fk2->funcname_idx && + fk1->lineno == fk2->lineno); +} + +static void +frame_key_destroy(void *key) +{ + PyMem_Free(key); +} + +static inline int +writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) +{ + void *existing = _Py_hashtable_get(writer->string_hash, string); + if (existing != NULL) { + *index = (uint32_t)(uintptr_t)existing - 1; /* index+1 stored to distinguish from NULL */ + return 0; + } + + if (writer->string_count >= writer->string_capacity) { + if (grow_parallel_arrays((void **)&writer->strings, + (void **)&writer->string_lengths, + &writer->string_capacity, + sizeof(char *), sizeof(size_t)) < 0) { + return -1; + } + } + + Py_ssize_t str_len; + const char *str_data = PyUnicode_AsUTF8AndSize(string, &str_len); + if (!str_data) { + return -1; + } + + char *str_copy = PyMem_Malloc(str_len + 1); + if (!str_copy) { + PyErr_NoMemory(); + return -1; + } + memcpy(str_copy, str_data, str_len + 1); + + *index = (uint32_t)writer->string_count; + + /* Add to hash table FIRST to ensure atomic rollback on failure */ + Py_INCREF(string); + if (_Py_hashtable_set(writer->string_hash, string, (void *)(uintptr_t)(*index + 1)) < 0) { + Py_DECREF(string); + PyMem_Free(str_copy); + PyErr_NoMemory(); + return -1; + } + + writer->strings[writer->string_count] = str_copy; + writer->string_lengths[writer->string_count] = str_len; + writer->string_count++; + + return 0; +} + +static inline int +writer_intern_frame(BinaryWriter *writer, uint32_t filename_idx, uint32_t funcname_idx, + int32_t lineno, uint32_t *index) +{ + FrameKey lookup_key = {filename_idx, funcname_idx, lineno}; + + void *existing = _Py_hashtable_get(writer->frame_hash, &lookup_key); + if (existing != NULL) { + *index = (uint32_t)(uintptr_t)existing - 1; /* index+1 stored to distinguish from NULL */ + return 0; + } + + if (GROW_ARRAY(writer->frame_entries, writer->frame_count, + writer->frame_capacity, FrameEntry) < 0) { + return -1; + } + + FrameKey *key = PyMem_Malloc(sizeof(FrameKey)); + if (!key) { + PyErr_NoMemory(); + return -1; + } + *key = lookup_key; + + *index = (uint32_t)writer->frame_count; + FrameEntry *fe = &writer->frame_entries[writer->frame_count]; + fe->filename_idx = filename_idx; + fe->funcname_idx = funcname_idx; + fe->lineno = lineno; + + if (_Py_hashtable_set(writer->frame_hash, key, (void *)(uintptr_t)(*index + 1)) < 0) { + PyMem_Free(key); + PyErr_NoMemory(); + return -1; + } + + writer->frame_count++; + return 0; +} + +/* Get or create a thread entry for the given thread_id. + * Returns pointer to ThreadEntry, or NULL on allocation failure. + * If is_new is non-NULL, sets it to 1 if this is a new thread, 0 otherwise. */ +static ThreadEntry * +writer_get_or_create_thread_entry(BinaryWriter *writer, uint64_t thread_id, + uint32_t interpreter_id, int *is_new) +{ + /* Linear search is OK for small number of threads. + * Key is (thread_id, interpreter_id) since same thread_id can exist in different interpreters. */ + for (size_t i = 0; i < writer->thread_count; i++) { + if (writer->thread_entries[i].thread_id == thread_id && + writer->thread_entries[i].interpreter_id == interpreter_id) { + if (is_new) { + *is_new = 0; + } + return &writer->thread_entries[i]; + } + } + + if (writer->thread_count >= writer->thread_capacity) { + writer->thread_entries = grow_array(writer->thread_entries, + &writer->thread_capacity, + sizeof(ThreadEntry)); + if (!writer->thread_entries) { + return NULL; + } + } + + ThreadEntry *entry = &writer->thread_entries[writer->thread_count]; + memset(entry, 0, sizeof(ThreadEntry)); + entry->thread_id = thread_id; + entry->interpreter_id = interpreter_id; + entry->prev_timestamp = writer->start_time_us; + entry->prev_stack_capacity = MAX_STACK_DEPTH; + entry->pending_rle_capacity = INITIAL_RLE_CAPACITY; + + entry->prev_stack = PyMem_Malloc(entry->prev_stack_capacity * sizeof(uint32_t)); + if (!entry->prev_stack) { + PyErr_NoMemory(); + return NULL; + } + + entry->pending_rle = PyMem_Malloc(entry->pending_rle_capacity * sizeof(PendingRLESample)); + if (!entry->pending_rle) { + PyMem_Free(entry->prev_stack); + PyErr_NoMemory(); + return NULL; + } + + writer->thread_count++; + if (is_new) { + *is_new = 1; + } + return entry; +} + +/* Compare two stacks and return the encoding type and parameters. + * Sets: + * - shared_count: number of frames matching from bottom of stack + * - pop_count: frames to remove from prev stack + * - push_count: new frames to add + * + * Returns the best encoding type to use. */ +static int +compare_stacks(const uint32_t *prev_stack, size_t prev_depth, + const uint32_t *curr_stack, size_t curr_depth, + size_t *shared_count, size_t *pop_count, size_t *push_count) +{ + if (prev_depth == curr_depth) { + int identical = 1; + for (size_t i = 0; i < prev_depth; i++) { + if (prev_stack[i] != curr_stack[i]) { + identical = 0; + break; + } + } + if (identical) { + *shared_count = prev_depth; + *pop_count = 0; + *push_count = 0; + return STACK_REPEAT; + } + } + + /* Find longest common suffix (frames at the bottom/outer part of stack). + * Stacks are stored innermost-first, so suffix is at the end. */ + size_t suffix_len = 0; + size_t min_depth = (prev_depth < curr_depth) ? prev_depth : curr_depth; + + for (size_t i = 0; i < min_depth; i++) { + size_t prev_idx = prev_depth - 1 - i; + size_t curr_idx = curr_depth - 1 - i; + if (prev_stack[prev_idx] == curr_stack[curr_idx]) { + suffix_len++; + } else { + break; + } + } + + *shared_count = suffix_len; + *pop_count = prev_depth - suffix_len; + *push_count = curr_depth - suffix_len; + + /* Choose best encoding based on byte cost */ + /* STACK_FULL: 1 (type) + 1-2 (depth) + sum(frame varints) */ + /* STACK_SUFFIX: 1 (type) + 1-2 (shared) + 1-2 (new_count) + sum(new frame varints) */ + /* STACK_POP_PUSH: 1 (type) + 1-2 (pop) + 1-2 (push) + sum(new frame varints) */ + + /* If no common suffix, use full stack */ + if (suffix_len == 0) { + return STACK_FULL; + } + + /* If only adding frames (suffix == prev_depth), use SUFFIX */ + if (*pop_count == 0 && *push_count > 0) { + return STACK_SUFFIX; + } + + /* If popping and/or pushing, use POP_PUSH if it saves bytes */ + /* Heuristic: POP_PUSH is better when we're modifying top frames */ + if (*pop_count > 0 || *push_count > 0) { + /* Use full stack if sharing less than half the frames */ + if (suffix_len < curr_depth / 2) { + return STACK_FULL; + } + return STACK_POP_PUSH; + } + + return STACK_FULL; +} + +/* Write common sample header: thread_id(8) + interpreter_id(4) + encoding(1). + * Returns 0 on success, -1 on failure. */ +static inline int +write_sample_header(BinaryWriter *writer, ThreadEntry *entry, uint8_t encoding) +{ + uint8_t header[SAMPLE_HEADER_FIXED_SIZE]; + memcpy(header, &entry->thread_id, 8); + memcpy(header + 8, &entry->interpreter_id, 4); + header[12] = encoding; + return writer_write_bytes(writer, header, SAMPLE_HEADER_FIXED_SIZE); +} + +/* Flush pending RLE samples for a thread. + * Writes the RLE record to the output buffer. + * Returns 0 on success, -1 on failure. */ +static int +flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry) +{ + if (!entry->has_pending_rle || entry->pending_rle_count == 0) { + return 0; + } + + /* Write RLE record: + * [thread_id: 8] [interpreter_id: 4] [STACK_REPEAT: 1] [count: varint] + * [timestamp_delta_1: varint] [status_1: 1] ... [timestamp_delta_N: varint] [status_N: 1] + */ + + if (write_sample_header(writer, entry, STACK_REPEAT) < 0) { + return -1; + } + + if (writer_write_varint_u32(writer, (uint32_t)entry->pending_rle_count) < 0) { + return -1; + } + + for (size_t i = 0; i < entry->pending_rle_count; i++) { + if (writer_write_varint_u64(writer, entry->pending_rle[i].timestamp_delta) < 0) { + return -1; + } + if (writer_write_bytes(writer, &entry->pending_rle[i].status, 1) < 0) { + return -1; + } + writer->total_samples++; + } + + writer->stats.repeat_records++; + writer->stats.repeat_samples += entry->pending_rle_count; + /* Each RLE sample saves writing the entire stack */ + writer->stats.frames_saved += entry->pending_rle_count * entry->prev_stack_depth; + + entry->pending_rle_count = 0; + entry->has_pending_rle = 0; + + return 0; +} + +/* Write a single sample with the specified encoding. + * Returns 0 on success, -1 on failure. */ +static int +write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry, + uint64_t timestamp_delta, uint8_t status, + int encoding_type, + const uint32_t *frame_indices, size_t stack_depth, + size_t shared_count, size_t pop_count, size_t push_count) +{ + /* Header: thread_id(8) + interpreter_id(4) + encoding(1) + delta(varint) + status(1) */ + uint8_t header_buf[SAMPLE_HEADER_MAX_SIZE]; + memcpy(header_buf, &entry->thread_id, 8); + memcpy(header_buf + 8, &entry->interpreter_id, 4); + header_buf[12] = (uint8_t)encoding_type; + size_t varint_len = encode_varint_u64(header_buf + 13, timestamp_delta); + header_buf[13 + varint_len] = status; + + if (writer_write_bytes(writer, header_buf, 14 + varint_len) < 0) { + return -1; + } + + uint8_t frame_buf[MAX_FRAME_BUFFER_SIZE]; + size_t frame_buf_pos = 0; + size_t frames_written = 0; + + switch (encoding_type) { + case STACK_FULL: + /* [depth: varint] [frame_idx: varint]... */ + frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)stack_depth); + for (size_t i = 0; i < stack_depth; i++) { + frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]); + } + frames_written = stack_depth; + writer->stats.full_records++; + break; + + case STACK_SUFFIX: + /* [shared_count: varint] [new_count: varint] [new_frame_idx: varint]... */ + frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)shared_count); + frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, (uint32_t)push_count); + /* New frames are at the top (beginning) of current stack */ + for (size_t i = 0; i < push_count; i++) { + frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]); + } + frames_written = push_count; + writer->stats.suffix_records++; + /* Saved writing shared_count frames */ + writer->stats.frames_saved += shared_count; + break; + + case STACK_POP_PUSH: + /* [pop_count: varint] [push_count: varint] [new_frame_idx: varint]... */ + frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)pop_count); + frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, (uint32_t)push_count); + /* New frames are at the top (beginning) of current stack */ + for (size_t i = 0; i < push_count; i++) { + frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]); + } + frames_written = push_count; + writer->stats.pop_push_records++; + /* Saved writing shared_count frames (stack_depth - push_count if we had written full) */ + writer->stats.frames_saved += shared_count; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, "Invalid stack encoding type"); + return -1; + } + + if (writer_write_bytes(writer, frame_buf, frame_buf_pos) < 0) { + return -1; + } + + writer->stats.total_frames_written += frames_written; + writer->total_samples++; + return 0; +} + +BinaryWriter * +binary_writer_create(const char *filename, uint64_t sample_interval_us, int compression_type, + uint64_t start_time_us) +{ + BinaryWriter *writer = PyMem_Calloc(1, sizeof(BinaryWriter)); + if (!writer) { + PyErr_NoMemory(); + return NULL; + } + + writer->filename = PyMem_Malloc(strlen(filename) + 1); + if (!writer->filename) { + PyMem_Free(writer); + PyErr_NoMemory(); + return NULL; + } + strcpy(writer->filename, filename); + + writer->start_time_us = start_time_us; + writer->sample_interval_us = sample_interval_us; + writer->compression_type = compression_type; + + writer->write_buffer = PyMem_Malloc(WRITE_BUFFER_SIZE); + if (!writer->write_buffer) { + goto error; + } + writer->buffer_size = WRITE_BUFFER_SIZE; + + writer->string_hash = _Py_hashtable_new_full( + string_hash_func, + string_compare_func, + string_key_destroy, /* Key destroy: decref the Python string */ + NULL, /* Value destroy: values are just indices, not pointers */ + NULL /* Use default allocator */ + ); + if (!writer->string_hash) { + goto error; + } + writer->strings = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(char *)); + if (!writer->strings) { + goto error; + } + writer->string_lengths = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(size_t)); + if (!writer->string_lengths) { + goto error; + } + writer->string_capacity = INITIAL_STRING_CAPACITY; + + writer->frame_hash = _Py_hashtable_new_full( + frame_key_hash_func, + frame_key_compare_func, + frame_key_destroy, /* Key destroy: free the FrameKey */ + NULL, /* Value destroy: values are just indices, not pointers */ + NULL /* Use default allocator */ + ); + if (!writer->frame_hash) { + goto error; + } + writer->frame_entries = PyMem_Malloc(INITIAL_FRAME_CAPACITY * sizeof(FrameEntry)); + if (!writer->frame_entries) { + goto error; + } + writer->frame_capacity = INITIAL_FRAME_CAPACITY; + + writer->thread_entries = PyMem_Malloc(INITIAL_THREAD_CAPACITY * sizeof(ThreadEntry)); + if (!writer->thread_entries) { + goto error; + } + writer->thread_capacity = INITIAL_THREAD_CAPACITY; + + if (compression_type == COMPRESSION_ZSTD) { + if (writer_init_zstd(writer) < 0) { + goto error; + } + } + + writer->fp = fopen(filename, "wb"); + if (!writer->fp) { + PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); + goto error; + } + + /* Hint sequential write pattern to kernel for better I/O scheduling */ +#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL) + { + int fd = fileno(writer->fp); + if (fd >= 0) { + (void)posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); + } + } +#endif + + uint8_t header[FILE_HEADER_PLACEHOLDER_SIZE] = {0}; + if (fwrite_checked_allow_threads(header, FILE_HEADER_PLACEHOLDER_SIZE, writer->fp) < 0) { + goto error; + } + + return writer; + +error: + binary_writer_destroy(writer); + return NULL; +} + +/* Build a frame stack from Python frame list by interning all strings and frames. + * Returns 0 on success, -1 on error. */ +static int +build_frame_stack(BinaryWriter *writer, PyObject *frame_list, + uint32_t *curr_stack, size_t *curr_depth) +{ + Py_ssize_t stack_depth = PyList_Size(frame_list); + *curr_depth = (stack_depth < MAX_STACK_DEPTH) ? stack_depth : MAX_STACK_DEPTH; + + for (Py_ssize_t k = 0; k < (Py_ssize_t)*curr_depth; k++) { + /* Use unchecked accessors since we control the data structures */ + PyObject *frame_info = PyList_GET_ITEM(frame_list, k); + + /* Get filename, location, funcname from FrameInfo using unchecked access */ + PyObject *filename = PyStructSequence_GET_ITEM(frame_info, 0); + PyObject *location = PyStructSequence_GET_ITEM(frame_info, 1); + PyObject *funcname = PyStructSequence_GET_ITEM(frame_info, 2); + + /* Extract lineno from location (can be None for synthetic frames) */ + int32_t lineno = 0; + if (location != Py_None) { + /* Use unchecked access - first element is lineno */ + PyObject *lineno_obj = PyTuple_Check(location) ? + PyTuple_GET_ITEM(location, 0) : + PyStructSequence_GET_ITEM(location, 0); + lineno = (int32_t)PyLong_AsLong(lineno_obj); + if (UNLIKELY(PyErr_Occurred() != NULL)) { + PyErr_Clear(); + lineno = 0; + } + } + + /* Intern filename */ + uint32_t filename_idx; + if (writer_intern_string(writer, filename, &filename_idx) < 0) { + return -1; + } + + /* Intern funcname */ + uint32_t funcname_idx; + if (writer_intern_string(writer, funcname, &funcname_idx) < 0) { + return -1; + } + + /* Intern frame */ + uint32_t frame_idx; + if (writer_intern_frame(writer, filename_idx, funcname_idx, lineno, &frame_idx) < 0) { + return -1; + } + + curr_stack[k] = frame_idx; + } + return 0; +} + +/* Process a single thread's sample. + * Returns 0 on success, -1 on error. */ +static int +process_thread_sample(BinaryWriter *writer, PyObject *thread_info, + uint32_t interpreter_id, uint64_t timestamp_us) +{ + PyObject *thread_id_obj = PyStructSequence_GET_ITEM(thread_info, 0); + PyObject *status_obj = PyStructSequence_GET_ITEM(thread_info, 1); + PyObject *frame_list = PyStructSequence_GET_ITEM(thread_info, 2); + + uint64_t thread_id = PyLong_AsUnsignedLongLong(thread_id_obj); + if (thread_id == (uint64_t)-1 && PyErr_Occurred()) { + return -1; + } + long status_long = PyLong_AsLong(status_obj); + if (status_long == -1 && PyErr_Occurred()) { + return -1; + } + uint8_t status = (uint8_t)status_long; + + int is_new_thread = 0; + ThreadEntry *entry = writer_get_or_create_thread_entry( + writer, thread_id, interpreter_id, &is_new_thread); + if (!entry) { + return -1; + } + + /* Calculate timestamp delta */ + uint64_t delta = timestamp_us - entry->prev_timestamp; + entry->prev_timestamp = timestamp_us; + + /* Process frames and build current stack */ + uint32_t curr_stack[MAX_STACK_DEPTH]; + size_t curr_depth; + if (build_frame_stack(writer, frame_list, curr_stack, &curr_depth) < 0) { + return -1; + } + + /* Compare with previous stack to determine encoding */ + size_t shared_count, pop_count, push_count; + int encoding = compare_stacks( + entry->prev_stack, entry->prev_stack_depth, + curr_stack, curr_depth, + &shared_count, &pop_count, &push_count); + + if (encoding == STACK_REPEAT && !is_new_thread) { + /* Buffer this sample for RLE */ + if (GROW_ARRAY(entry->pending_rle, entry->pending_rle_count, + entry->pending_rle_capacity, PendingRLESample) < 0) { + return -1; + } + entry->pending_rle[entry->pending_rle_count].timestamp_delta = delta; + entry->pending_rle[entry->pending_rle_count].status = status; + entry->pending_rle_count++; + entry->has_pending_rle = 1; + } else { + /* Stack changed - flush any pending RLE first */ + if (entry->has_pending_rle) { + if (flush_pending_rle(writer, entry) < 0) { + return -1; + } + } + + if (write_sample_with_encoding(writer, entry, delta, status, encoding, + curr_stack, curr_depth, + shared_count, pop_count, push_count) < 0) { + return -1; + } + + memcpy(entry->prev_stack, curr_stack, curr_depth * sizeof(uint32_t)); + entry->prev_stack_depth = curr_depth; + } + + return 0; +} + +int +binary_writer_write_sample(BinaryWriter *writer, PyObject *stack_frames, uint64_t timestamp_us) +{ + if (!PyList_Check(stack_frames)) { + PyErr_SetString(PyExc_TypeError, "stack_frames must be a list"); + return -1; + } + + Py_ssize_t num_interpreters = PyList_GET_SIZE(stack_frames); + for (Py_ssize_t i = 0; i < num_interpreters; i++) { + PyObject *interp_info = PyList_GET_ITEM(stack_frames, i); + + PyObject *interp_id_obj = PyStructSequence_GET_ITEM(interp_info, 0); + PyObject *threads = PyStructSequence_GET_ITEM(interp_info, 1); + + unsigned long interp_id_long = PyLong_AsUnsignedLong(interp_id_obj); + if (interp_id_long == (unsigned long)-1 && PyErr_Occurred()) { + return -1; + } + /* Bounds check: interpreter_id is stored as uint32_t in binary format */ + if (interp_id_long > UINT32_MAX) { + PyErr_Format(PyExc_OverflowError, + "interpreter_id %lu exceeds maximum value %lu", + interp_id_long, (unsigned long)UINT32_MAX); + return -1; + } + uint32_t interpreter_id = (uint32_t)interp_id_long; + + Py_ssize_t num_threads = PyList_GET_SIZE(threads); + for (Py_ssize_t j = 0; j < num_threads; j++) { + PyObject *thread_info = PyList_GET_ITEM(threads, j); + if (process_thread_sample(writer, thread_info, interpreter_id, timestamp_us) < 0) { + return -1; + } + } + } + + return 0; +} + +int +binary_writer_finalize(BinaryWriter *writer) +{ + for (size_t i = 0; i < writer->thread_count; i++) { + if (writer->thread_entries[i].has_pending_rle) { + if (flush_pending_rle(writer, &writer->thread_entries[i]) < 0) { + return -1; + } + } + } + + if (writer_flush_buffer(writer) < 0) { + return -1; + } + +#ifdef HAVE_ZSTD + /* Finalize compression stream */ + if (writer->compression_type == COMPRESSION_ZSTD && writer->zstd.cctx) { + ZSTD_inBuffer input = { NULL, 0, 0 }; + size_t remaining; + + do { + ZSTD_outBuffer output = { + writer->zstd.compressed_buffer, + writer->zstd.compressed_buffer_size, + 0 + }; + + remaining = ZSTD_compressStream2(writer->zstd.cctx, &output, &input, ZSTD_e_end); + + if (ZSTD_isError(remaining)) { + PyErr_Format(PyExc_IOError, "zstd finalization error: %s", + ZSTD_getErrorName(remaining)); + return -1; + } + + if (output.pos > 0) { + if (fwrite_checked_allow_threads(writer->zstd.compressed_buffer, output.pos, writer->fp) < 0) { + return -1; + } + } + } while (remaining > 0); + } +#endif + + /* Use 64-bit file position for >2GB files */ + file_offset_t string_table_offset = FTELL64(writer->fp); + if (string_table_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + /* Release GIL during potentially large writes */ + for (size_t i = 0; i < writer->string_count; i++) { + uint8_t len_buf[10]; + size_t len_size = encode_varint_u32(len_buf, (uint32_t)writer->string_lengths[i]); + if (fwrite_checked_allow_threads(len_buf, len_size, writer->fp) < 0 || + fwrite_checked_allow_threads(writer->strings[i], writer->string_lengths[i], writer->fp) < 0) { + return -1; + } + } + + file_offset_t frame_table_offset = FTELL64(writer->fp); + if (frame_table_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + for (size_t i = 0; i < writer->frame_count; i++) { + FrameEntry *entry = &writer->frame_entries[i]; + uint8_t buf[30]; + size_t pos = encode_varint_u32(buf, entry->filename_idx); + pos += encode_varint_u32(buf + pos, entry->funcname_idx); + pos += encode_varint_i32(buf + pos, entry->lineno); + if (fwrite_checked_allow_threads(buf, pos, writer->fp) < 0) { + return -1; + } + } + + /* Footer: string_count(4) + frame_count(4) + file_size(8) + checksum(16) */ + file_offset_t footer_offset = FTELL64(writer->fp); + if (footer_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + uint64_t file_size = (uint64_t)footer_offset + 32; + uint8_t footer[32] = {0}; + memcpy(footer + 0, &writer->string_count, 4); + memcpy(footer + 4, &writer->frame_count, 4); + memcpy(footer + 8, &file_size, 8); + /* bytes 16-31: checksum placeholder (zeros) */ + if (fwrite_checked_allow_threads(footer, 32, writer->fp) < 0) { + return -1; + } + + if (FSEEK64(writer->fp, 0, SEEK_SET) < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + /* Convert file offsets to uint64_t for portable header format */ + uint64_t string_table_offset_u64 = (uint64_t)string_table_offset; + uint64_t frame_table_offset_u64 = (uint64_t)frame_table_offset; + + uint8_t header[52] = {0}; + uint32_t magic = BINARY_FORMAT_MAGIC; + uint32_t version = BINARY_FORMAT_VERSION; + memcpy(header + 0, &magic, 4); + memcpy(header + 4, &version, 4); + memcpy(header + 8, &writer->start_time_us, 8); + memcpy(header + 16, &writer->sample_interval_us, 8); + memcpy(header + 24, &writer->total_samples, 4); + memcpy(header + 28, &writer->thread_count, 4); + memcpy(header + 32, &string_table_offset_u64, 8); + memcpy(header + 40, &frame_table_offset_u64, 8); + memcpy(header + 48, &writer->compression_type, 4); + if (fwrite_checked_allow_threads(header, 52, writer->fp) < 0) { + return -1; + } + + if (fclose(writer->fp) != 0) { + writer->fp = NULL; + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + writer->fp = NULL; + + return 0; +} + +void +binary_writer_destroy(BinaryWriter *writer) +{ + if (!writer) { + return; + } + + if (writer->fp) { + fclose(writer->fp); + } + + PyMem_Free(writer->filename); + PyMem_Free(writer->write_buffer); + +#ifdef HAVE_ZSTD + if (writer->zstd.cctx) { + ZSTD_freeCCtx(writer->zstd.cctx); + } + PyMem_Free(writer->zstd.compressed_buffer); +#endif + + if (writer->string_hash) { + _Py_hashtable_destroy(writer->string_hash); + } + if (writer->strings) { + for (size_t i = 0; i < writer->string_count; i++) { + PyMem_Free(writer->strings[i]); + } + PyMem_Free(writer->strings); + } + PyMem_Free(writer->string_lengths); + + if (writer->frame_hash) { + _Py_hashtable_destroy(writer->frame_hash); + } + PyMem_Free(writer->frame_entries); + + if (writer->thread_entries) { + for (size_t i = 0; i < writer->thread_count; i++) { + PyMem_Free(writer->thread_entries[i].prev_stack); + PyMem_Free(writer->thread_entries[i].pending_rle); + } + PyMem_Free(writer->thread_entries); + } + + PyMem_Free(writer); +} + diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 5cbf64517af608..263dfd685657da 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -7,6 +7,7 @@ preserve # include "pycore_runtime.h" // _Py_ID() #endif #include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION() +#include "pycore_long.h" // _PyLong_UnsignedLongLong_Converter() #include "pycore_modsupport.h" // _PyArg_UnpackKeywords() PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, @@ -434,6 +435,659 @@ _remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(i return return_value; } +PyDoc_STRVAR(_remote_debugging_BinaryWriter___init____doc__, +"BinaryWriter(filename, sample_interval_us, start_time_us, *,\n" +" compression=0)\n" +"--\n" +"\n" +"High-performance binary writer for profiling data.\n" +"\n" +"Arguments:\n" +" filename: Path to output file\n" +" sample_interval_us: Sampling interval in microseconds\n" +" start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)\n" +" compression: 0=none, 1=zstd (default: 0)\n" +"\n" +"Use as a context manager or call finalize() when done."); + +static int +_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self, + const char *filename, + unsigned long long sample_interval_us, + unsigned long long start_time_us, + int compression); + +static int +_remote_debugging_BinaryWriter___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 4 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(filename), &_Py_ID(sample_interval_us), &_Py_ID(start_time_us), &_Py_ID(compression), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"filename", "sample_interval_us", "start_time_us", "compression", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "BinaryWriter", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[4]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 3; + const char *filename; + unsigned long long sample_interval_us; + unsigned long long start_time_us; + int compression = 0; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 3, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + if (!PyUnicode_Check(fastargs[0])) { + _PyArg_BadArgument("BinaryWriter", "argument 'filename'", "str", fastargs[0]); + goto exit; + } + Py_ssize_t filename_length; + filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length); + if (filename == NULL) { + goto exit; + } + if (strlen(filename) != (size_t)filename_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + if (!_PyLong_UnsignedLongLong_Converter(fastargs[1], &sample_interval_us)) { + goto exit; + } + if (!_PyLong_UnsignedLongLong_Converter(fastargs[2], &start_time_us)) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + compression = PyLong_AsInt(fastargs[3]); + if (compression == -1 && PyErr_Occurred()) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_BinaryWriter___init___impl((BinaryWriterObject *)self, filename, sample_interval_us, start_time_us, compression); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_write_sample__doc__, +"write_sample($self, /, stack_frames, timestamp_us)\n" +"--\n" +"\n" +"Write a sample to the binary file.\n" +"\n" +"Arguments:\n" +" stack_frames: List of InterpreterInfo objects\n" +" timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)"); + +#define _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF \ + {"write_sample", _PyCFunction_CAST(_remote_debugging_BinaryWriter_write_sample), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter_write_sample__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self, + PyObject *stack_frames, + unsigned long long timestamp_us); + +static PyObject * +_remote_debugging_BinaryWriter_write_sample(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(stack_frames), &_Py_ID(timestamp_us), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"stack_frames", "timestamp_us", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "write_sample", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *stack_frames; + unsigned long long timestamp_us; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 2, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + stack_frames = args[0]; + if (!_PyLong_UnsignedLongLong_Converter(args[1], ×tamp_us)) { + goto exit; + } + return_value = _remote_debugging_BinaryWriter_write_sample_impl((BinaryWriterObject *)self, stack_frames, timestamp_us); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_finalize__doc__, +"finalize($self, /)\n" +"--\n" +"\n" +"Finalize and close the binary file.\n" +"\n" +"Writes string/frame tables, footer, and updates header."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF \ + {"finalize", (PyCFunction)_remote_debugging_BinaryWriter_finalize, METH_NOARGS, _remote_debugging_BinaryWriter_finalize__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter_finalize(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_finalize_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_close__doc__, +"close($self, /)\n" +"--\n" +"\n" +"Close the writer without finalizing (discards data)."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF \ + {"close", (PyCFunction)_remote_debugging_BinaryWriter_close, METH_NOARGS, _remote_debugging_BinaryWriter_close__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter_close(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_close_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter___enter____doc__, +"__enter__($self, /)\n" +"--\n" +"\n" +"Enter context manager."); + +#define _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF \ + {"__enter__", (PyCFunction)_remote_debugging_BinaryWriter___enter__, METH_NOARGS, _remote_debugging_BinaryWriter___enter____doc__}, + +static PyObject * +_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter___enter__(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter___enter___impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter___exit____doc__, +"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n" +"--\n" +"\n" +"Exit context manager, finalizing the file."); + +#define _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF \ + {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryWriter___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter___exit____doc__}, + +static PyObject * +_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb); + +static PyObject * +_remote_debugging_BinaryWriter___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 3 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "__exit__", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[3]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; + PyObject *exc_type = Py_None; + PyObject *exc_val = Py_None; + PyObject *exc_tb = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + if (args[0]) { + exc_type = args[0]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + if (args[1]) { + exc_val = args[1]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + exc_tb = args[2]; +skip_optional_pos: + return_value = _remote_debugging_BinaryWriter___exit___impl((BinaryWriterObject *)self, exc_type, exc_val, exc_tb); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get encoding statistics for the writer.\n" +"\n" +"Returns a dict with encoding statistics including repeat/full/suffix/pop-push\n" +"record counts, frames written/saved, and compression ratio."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_BinaryWriter_get_stats, METH_NOARGS, _remote_debugging_BinaryWriter_get_stats__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_get_stats_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___init____doc__, +"BinaryReader(filename)\n" +"--\n" +"\n" +"High-performance binary reader for profiling data.\n" +"\n" +"Arguments:\n" +" filename: Path to input file\n" +"\n" +"Use as a context manager or call close() when done."); + +static int +_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self, + const char *filename); + +static int +_remote_debugging_BinaryReader___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(filename), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"filename", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "BinaryReader", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + const char *filename; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + if (!PyUnicode_Check(fastargs[0])) { + _PyArg_BadArgument("BinaryReader", "argument 'filename'", "str", fastargs[0]); + goto exit; + } + Py_ssize_t filename_length; + filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length); + if (filename == NULL) { + goto exit; + } + if (strlen(filename) != (size_t)filename_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + return_value = _remote_debugging_BinaryReader___init___impl((BinaryReaderObject *)self, filename); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_replay__doc__, +"replay($self, /, collector, progress_callback=None)\n" +"--\n" +"\n" +"Replay samples through a collector.\n" +"\n" +"Arguments:\n" +" collector: Collector object with collect() method\n" +" progress_callback: Optional callable(current, total)\n" +"\n" +"Returns:\n" +" Number of samples replayed"); + +#define _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF \ + {"replay", _PyCFunction_CAST(_remote_debugging_BinaryReader_replay), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader_replay__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self, + PyObject *collector, + PyObject *progress_callback); + +static PyObject * +_remote_debugging_BinaryReader_replay(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(collector), &_Py_ID(progress_callback), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"collector", "progress_callback", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "replay", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; + PyObject *collector; + PyObject *progress_callback = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + collector = args[0]; + if (!noptargs) { + goto skip_optional_pos; + } + progress_callback = args[1]; +skip_optional_pos: + return_value = _remote_debugging_BinaryReader_replay_impl((BinaryReaderObject *)self, collector, progress_callback); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_get_info__doc__, +"get_info($self, /)\n" +"--\n" +"\n" +"Get metadata about the binary file.\n" +"\n" +"Returns:\n" +" Dict with file metadata"); + +#define _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF \ + {"get_info", (PyCFunction)_remote_debugging_BinaryReader_get_info, METH_NOARGS, _remote_debugging_BinaryReader_get_info__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_get_info(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_get_info_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get reconstruction statistics from replay.\n" +"\n" +"Returns a dict with statistics about record types decoded and samples\n" +"reconstructed during replay."); + +#define _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_BinaryReader_get_stats, METH_NOARGS, _remote_debugging_BinaryReader_get_stats__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_get_stats_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_close__doc__, +"close($self, /)\n" +"--\n" +"\n" +"Close the reader and free resources."); + +#define _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF \ + {"close", (PyCFunction)_remote_debugging_BinaryReader_close, METH_NOARGS, _remote_debugging_BinaryReader_close__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_close(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_close_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___enter____doc__, +"__enter__($self, /)\n" +"--\n" +"\n" +"Enter context manager."); + +#define _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF \ + {"__enter__", (PyCFunction)_remote_debugging_BinaryReader___enter__, METH_NOARGS, _remote_debugging_BinaryReader___enter____doc__}, + +static PyObject * +_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader___enter__(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader___enter___impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___exit____doc__, +"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n" +"--\n" +"\n" +"Exit context manager, closing the file."); + +#define _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF \ + {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryReader___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader___exit____doc__}, + +static PyObject * +_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb); + +static PyObject * +_remote_debugging_BinaryReader___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 3 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "__exit__", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[3]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; + PyObject *exc_type = Py_None; + PyObject *exc_val = Py_None; + PyObject *exc_tb = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + if (args[0]) { + exc_type = args[0]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + if (args[1]) { + exc_val = args[1]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + exc_tb = args[2]; +skip_optional_pos: + return_value = _remote_debugging_BinaryReader___exit___impl((BinaryReaderObject *)self, exc_type, exc_val, exc_tb); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_zstd_available__doc__, +"zstd_available($module, /)\n" +"--\n" +"\n" +"Check if zstd compression is available.\n" +"\n" +"Returns:\n" +" True if zstd available, False otherwise"); + +#define _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF \ + {"zstd_available", (PyCFunction)_remote_debugging_zstd_available, METH_NOARGS, _remote_debugging_zstd_available__doc__}, + +static PyObject * +_remote_debugging_zstd_available_impl(PyObject *module); + +static PyObject * +_remote_debugging_zstd_available(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_zstd_available_impl(module); +} + PyDoc_STRVAR(_remote_debugging_get_child_pids__doc__, "get_child_pids($module, /, pid, *, recursive=True)\n" "--\n" @@ -582,4 +1236,4 @@ _remote_debugging_is_python_process(PyObject *module, PyObject *const *args, Py_ exit: return return_value; } -/*[clinic end generated code: output=dc0550ad3d6a409c input=a9049054013a1b77]*/ +/*[clinic end generated code: output=036de0b06d0e34cc input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index fc58e2428b2009..c27b0471c0d20f 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -6,6 +6,20 @@ ******************************************************************************/ #include "_remote_debugging.h" +#include "binary_io.h" + +/* Forward declarations for clinic-generated code */ +typedef struct { + PyObject_HEAD + BinaryWriter *writer; + uint32_t cached_total_samples; /* Preserved after finalize */ +} BinaryWriterObject; + +typedef struct { + PyObject_HEAD + BinaryReader *reader; +} BinaryReaderObject; + #include "clinic/module.c.h" /* ============================================================================ @@ -970,6 +984,10 @@ static PyType_Spec RemoteUnwinder_spec = { .slots = RemoteUnwinder_slots, }; +/* Forward declarations for type specs defined later */ +static PyType_Spec BinaryWriter_spec; +static PyType_Spec BinaryReader_spec; + /* ============================================================================ * MODULE INITIALIZATION * ============================================================================ */ @@ -1048,6 +1066,18 @@ _remote_debugging_exec(PyObject *m) if (PyModule_AddType(m, st->AwaitedInfo_Type) < 0) { return -1; } + + // Create BinaryWriter and BinaryReader types + CREATE_TYPE(m, st->BinaryWriter_Type, &BinaryWriter_spec); + if (PyModule_AddType(m, st->BinaryWriter_Type) < 0) { + return -1; + } + + CREATE_TYPE(m, st->BinaryReader_Type, &BinaryReader_spec); + if (PyModule_AddType(m, st->BinaryReader_Type) < 0) { + return -1; + } + #ifdef Py_GIL_DISABLED PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif @@ -1091,6 +1121,8 @@ remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg) Py_VISIT(state->ThreadInfo_Type); Py_VISIT(state->InterpreterInfo_Type); Py_VISIT(state->AwaitedInfo_Type); + Py_VISIT(state->BinaryWriter_Type); + Py_VISIT(state->BinaryReader_Type); return 0; } @@ -1106,6 +1138,8 @@ remote_debugging_clear(PyObject *mod) Py_CLEAR(state->ThreadInfo_Type); Py_CLEAR(state->InterpreterInfo_Type); Py_CLEAR(state->AwaitedInfo_Type); + Py_CLEAR(state->BinaryWriter_Type); + Py_CLEAR(state->BinaryReader_Type); return 0; } @@ -1115,13 +1149,504 @@ remote_debugging_free(void *mod) (void)remote_debugging_clear((PyObject *)mod); } -static PyModuleDef_Slot remote_debugging_slots[] = { - {Py_mod_exec, _remote_debugging_exec}, - {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, - {Py_mod_gil, Py_MOD_GIL_NOT_USED}, - {0, NULL}, +/* ============================================================================ + * BINARY WRITER CLASS + * ============================================================================ */ + +#define BinaryWriter_CAST(op) ((BinaryWriterObject *)(op)) + +/*[clinic input] +class _remote_debugging.BinaryWriter "BinaryWriterObject *" "&PyBinaryWriter_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e948838b90a2003c]*/ + +/*[clinic input] +_remote_debugging.BinaryWriter.__init__ + filename: str + sample_interval_us: unsigned_long_long + start_time_us: unsigned_long_long + * + compression: int = 0 + +High-performance binary writer for profiling data. + +Arguments: + filename: Path to output file + sample_interval_us: Sampling interval in microseconds + start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6) + compression: 0=none, 1=zstd (default: 0) + +Use as a context manager or call finalize() when done. +[clinic start generated code]*/ + +static int +_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self, + const char *filename, + unsigned long long sample_interval_us, + unsigned long long start_time_us, + int compression) +/*[clinic end generated code: output=014c0306f1bacf4b input=57497fe3cb9214a6]*/ +{ + if (self->writer) { + binary_writer_destroy(self->writer); + } + + self->writer = binary_writer_create(filename, sample_interval_us, compression, start_time_us); + if (!self->writer) { + return -1; + } + + return 0; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.write_sample + stack_frames: object + timestamp_us: unsigned_long_long + +Write a sample to the binary file. + +Arguments: + stack_frames: List of InterpreterInfo objects + timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6) +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self, + PyObject *stack_frames, + unsigned long long timestamp_us) +/*[clinic end generated code: output=24d5b86679b4128f input=dce3148417482624]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is closed"); + return NULL; + } + + if (binary_writer_write_sample(self->writer, stack_frames, timestamp_us) < 0) { + return NULL; + } + + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.finalize + +Finalize and close the binary file. + +Writes string/frame tables, footer, and updates header. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=3534b88c6628de88 input=c02191750682f6a2]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is already closed"); + return NULL; + } + + /* Save total_samples before finalizing */ + self->cached_total_samples = self->writer->total_samples; + + if (binary_writer_finalize(self->writer) < 0) { + return NULL; + } + + binary_writer_destroy(self->writer); + self->writer = NULL; + + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.close + +Close the writer without finalizing (discards data). +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=9571bb2256fd1fd2 input=6e0da206e60daf16]*/ +{ + if (self->writer) { + binary_writer_destroy(self->writer); + self->writer = NULL; + } + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.__enter__ + +Enter context manager. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self) +/*[clinic end generated code: output=8eb95f61daf2d120 input=8ef14ee18da561d2]*/ +{ + Py_INCREF(self); + return (PyObject *)self; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.__exit__ + exc_type: object = None + exc_val: object = None + exc_tb: object = None + +Exit context manager, finalizing the file. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb) +/*[clinic end generated code: output=61831f47c72a53c6 input=12334ce1009af37f]*/ +{ + if (self->writer) { + /* Finalize on normal exit */ + if (binary_writer_finalize(self->writer) < 0) { + binary_writer_destroy(self->writer); + self->writer = NULL; + return NULL; + } + binary_writer_destroy(self->writer); + self->writer = NULL; + } + Py_RETURN_FALSE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.get_stats + +Get encoding statistics for the writer. + +Returns a dict with encoding statistics including repeat/full/suffix/pop-push +record counts, frames written/saved, and compression ratio. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=06522cd52544df89 input=82968491b53ad277]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is closed"); + return NULL; + } + return binary_writer_get_stats(self->writer); +} + +static PyObject * +BinaryWriter_get_total_samples(BinaryWriterObject *self, void *closure) +{ + if (!self->writer) { + /* Use cached value after finalize/close */ + return PyLong_FromUnsignedLong(self->cached_total_samples); + } + return PyLong_FromUnsignedLong(self->writer->total_samples); +} + +static PyGetSetDef BinaryWriter_getset[] = { + {"total_samples", (getter)BinaryWriter_get_total_samples, NULL, "Total samples written", NULL}, + {NULL} +}; + +static PyMethodDef BinaryWriter_methods[] = { + _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF + {NULL, NULL, 0, NULL} +}; + +static void +BinaryWriter_dealloc(PyObject *op) +{ + BinaryWriterObject *self = BinaryWriter_CAST(op); + PyTypeObject *tp = Py_TYPE(self); + if (self->writer) { + binary_writer_destroy(self->writer); + } + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyType_Slot BinaryWriter_slots[] = { + {Py_tp_getset, BinaryWriter_getset}, + {Py_tp_methods, BinaryWriter_methods}, + {Py_tp_init, _remote_debugging_BinaryWriter___init__}, + {Py_tp_dealloc, BinaryWriter_dealloc}, + {0, NULL} +}; + +static PyType_Spec BinaryWriter_spec = { + .name = "_remote_debugging.BinaryWriter", + .basicsize = sizeof(BinaryWriterObject), + .flags = ( + Py_TPFLAGS_DEFAULT + | Py_TPFLAGS_IMMUTABLETYPE + ), + .slots = BinaryWriter_slots, }; +/* ============================================================================ + * BINARY READER CLASS + * ============================================================================ */ + +#define BinaryReader_CAST(op) ((BinaryReaderObject *)(op)) + +/*[clinic input] +class _remote_debugging.BinaryReader "BinaryReaderObject *" "&PyBinaryReader_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=36400aaf6f53216d]*/ + +/*[clinic input] +_remote_debugging.BinaryReader.__init__ + filename: str + +High-performance binary reader for profiling data. + +Arguments: + filename: Path to input file + +Use as a context manager or call close() when done. +[clinic start generated code]*/ + +static int +_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self, + const char *filename) +/*[clinic end generated code: output=9699226f7ae052bb input=4201f9cc500ef2f6]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + } + + self->reader = binary_reader_open(filename); + if (!self->reader) { + return -1; + } + + return 0; +} + +/*[clinic input] +_remote_debugging.BinaryReader.replay + collector: object + progress_callback: object = None + +Replay samples through a collector. + +Arguments: + collector: Collector object with collect() method + progress_callback: Optional callable(current, total) + +Returns: + Number of samples replayed +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self, + PyObject *collector, + PyObject *progress_callback) +/*[clinic end generated code: output=442345562574b61c input=ebb687aed3e0f4f1]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + + Py_ssize_t replayed = binary_reader_replay(self->reader, collector, progress_callback); + if (replayed < 0) { + return NULL; + } + + return PyLong_FromSsize_t(replayed); +} + +/*[clinic input] +_remote_debugging.BinaryReader.get_info + +Get metadata about the binary file. + +Returns: + Dict with file metadata +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=7f641fbd39147391 input=02e75e39c8a6cd1f]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + + return binary_reader_get_info(self->reader); +} + +/*[clinic input] +_remote_debugging.BinaryReader.get_stats + +Get reconstruction statistics from replay. + +Returns a dict with statistics about record types decoded and samples +reconstructed during replay. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=628b9ab5e4c4fd36 input=d8dd6654abd6c3c0]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + return binary_reader_get_stats(self->reader); +} + +/*[clinic input] +_remote_debugging.BinaryReader.close + +Close the reader and free resources. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=ad0238cf5240b4f8 input=b919a66c737712d5]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + self->reader = NULL; + } + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryReader.__enter__ + +Enter context manager. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self) +/*[clinic end generated code: output=fade133538e93817 input=4794844c9efdc4f6]*/ +{ + Py_INCREF(self); + return (PyObject *)self; +} + +/*[clinic input] +_remote_debugging.BinaryReader.__exit__ + exc_type: object = None + exc_val: object = None + exc_tb: object = None + +Exit context manager, closing the file. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb) +/*[clinic end generated code: output=2acdd36cfdc14e4a input=87284243d7935835]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + self->reader = NULL; + } + Py_RETURN_FALSE; +} + +static PyObject * +BinaryReader_get_sample_count(BinaryReaderObject *self, void *closure) +{ + if (!self->reader) { + return PyLong_FromLong(0); + } + return PyLong_FromUnsignedLong(self->reader->sample_count); +} + +static PyObject * +BinaryReader_get_sample_interval_us(BinaryReaderObject *self, void *closure) +{ + if (!self->reader) { + return PyLong_FromLong(0); + } + return PyLong_FromUnsignedLongLong(self->reader->sample_interval_us); +} + +static PyGetSetDef BinaryReader_getset[] = { + {"sample_count", (getter)BinaryReader_get_sample_count, NULL, "Number of samples in file", NULL}, + {"sample_interval_us", (getter)BinaryReader_get_sample_interval_us, NULL, "Sample interval in microseconds", NULL}, + {NULL} +}; + +static PyMethodDef BinaryReader_methods[] = { + _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF + _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF + {NULL, NULL, 0, NULL} +}; + +static void +BinaryReader_dealloc(PyObject *op) +{ + BinaryReaderObject *self = BinaryReader_CAST(op); + PyTypeObject *tp = Py_TYPE(self); + if (self->reader) { + binary_reader_close(self->reader); + } + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyType_Slot BinaryReader_slots[] = { + {Py_tp_getset, BinaryReader_getset}, + {Py_tp_methods, BinaryReader_methods}, + {Py_tp_init, _remote_debugging_BinaryReader___init__}, + {Py_tp_dealloc, BinaryReader_dealloc}, + {0, NULL} +}; + +static PyType_Spec BinaryReader_spec = { + .name = "_remote_debugging.BinaryReader", + .basicsize = sizeof(BinaryReaderObject), + .flags = ( + Py_TPFLAGS_DEFAULT + | Py_TPFLAGS_IMMUTABLETYPE + ), + .slots = BinaryReader_slots, +}; + +/* ============================================================================ + * MODULE METHODS + * ============================================================================ */ + +/*[clinic input] +_remote_debugging.zstd_available + +Check if zstd compression is available. + +Returns: + True if zstd available, False otherwise +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_zstd_available_impl(PyObject *module) +/*[clinic end generated code: output=55e35a70ef280cdd input=a1b4d41bc09c7cf9]*/ +{ + return PyBool_FromLong(binary_io_zstd_available()); +} + /* ============================================================================ * MODULE-LEVEL FUNCTIONS * ============================================================================ */ @@ -1188,11 +1713,19 @@ _remote_debugging_is_python_process_impl(PyObject *module, int pid) } static PyMethodDef remote_debugging_methods[] = { + _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF _REMOTE_DEBUGGING_GET_CHILD_PIDS_METHODDEF _REMOTE_DEBUGGING_IS_PYTHON_PROCESS_METHODDEF {NULL, NULL, 0, NULL}, }; +static PyModuleDef_Slot remote_debugging_slots[] = { + {Py_mod_exec, _remote_debugging_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, + {0, NULL}, +}; + static struct PyModuleDef remote_debugging_module = { PyModuleDef_HEAD_INIT, .m_name = "_remote_debugging", diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index 830b7b8744862c..0e86ce9f4c918c 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -105,10 +105,13 @@ + + + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index 793a3256c52d58..59d4d5c5c335fb 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -33,6 +33,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -41,6 +47,9 @@ Header Files + + Header Files + diff --git a/configure b/configure index a1bc7991aa8dc2..b1faeaf806a9c6 100755 --- a/configure +++ b/configure @@ -858,6 +858,8 @@ HAVE_GETHOSTBYNAME_R_3_ARG HAVE_GETHOSTBYNAME_R_5_ARG HAVE_GETHOSTBYNAME_R_6_ARG LIBOBJS +REMOTE_DEBUGGING_LIBS +REMOTE_DEBUGGING_CFLAGS LIBZSTD_LIBS LIBZSTD_CFLAGS LIBLZMA_LIBS @@ -23023,6 +23025,22 @@ printf "%s\n" "yes" >&6; } have_libzstd=yes fi +if test "x$have_libzstd" = xyes +then : + + REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS" + REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS" + +else case e in #( + e) + REMOTE_DEBUGGING_CFLAGS="" + REMOTE_DEBUGGING_LIBS="" + ;; +esac +fi + + + @@ -31644,8 +31662,8 @@ fi if test "x$py_cv_module__remote_debugging" = xyes then : - - + as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_CFLAGS=$REMOTE_DEBUGGING_CFLAGS$as_nl" + as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_LDFLAGS=$REMOTE_DEBUGGING_LIBS$as_nl" fi diff --git a/configure.ac b/configure.ac index a284a118f0296f..043ec957f40894 100644 --- a/configure.ac +++ b/configure.ac @@ -5529,6 +5529,18 @@ PKG_CHECK_MODULES([LIBZSTD], [libzstd >= 1.4.5], [have_libzstd=yes], [ ]) ]) +dnl _remote_debugging module: optional zstd compression support +dnl The module always builds, but zstd compression is only available when libzstd is found +AS_VAR_IF([have_libzstd], [yes], [ + REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS" + REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS" +], [ + REMOTE_DEBUGGING_CFLAGS="" + REMOTE_DEBUGGING_LIBS="" +]) +AC_SUBST([REMOTE_DEBUGGING_CFLAGS]) +AC_SUBST([REMOTE_DEBUGGING_LIBS]) + dnl PY_CHECK_NETDB_FUNC(FUNCTION) AC_DEFUN([PY_CHECK_NETDB_FUNC], [PY_CHECK_FUNC([$1], [@%:@include ])]) @@ -7911,7 +7923,7 @@ PY_STDLIB_MOD_SIMPLE([_pickle]) PY_STDLIB_MOD_SIMPLE([_posixsubprocess]) PY_STDLIB_MOD_SIMPLE([_queue]) PY_STDLIB_MOD_SIMPLE([_random]) -PY_STDLIB_MOD_SIMPLE([_remote_debugging]) +PY_STDLIB_MOD_SIMPLE([_remote_debugging], [$REMOTE_DEBUGGING_CFLAGS], [$REMOTE_DEBUGGING_LIBS]) PY_STDLIB_MOD_SIMPLE([select]) PY_STDLIB_MOD_SIMPLE([_struct]) PY_STDLIB_MOD_SIMPLE([_types])