diff --git a/Doc/library/profiling.sampling.rst b/Doc/library/profiling.sampling.rst
index 1f60e2cb578c4d..9026b443a0e3a0 100644
--- a/Doc/library/profiling.sampling.rst
+++ b/Doc/library/profiling.sampling.rst
@@ -200,6 +200,36 @@ On most systems, attaching to another process requires appropriate permissions.
 See :ref:`profiling-permissions` for platform-specific requirements.
 
 
+.. _replay-command:
+
+The ``replay`` command
+----------------------
+
+The ``replay`` command converts binary profile files to other output formats::
+
+   python -m profiling.sampling replay profile.bin
+   python -m profiling.sampling replay --flamegraph -o profile.html profile.bin
+
+This command is useful when you have captured profiling data in binary format
+and want to analyze it later or convert it to a visualization format. Binary
+profiles can be replayed multiple times to different formats without
+re-profiling.
+
+::
+
+   # Convert binary to pstats (default, prints to stdout)
+   python -m profiling.sampling replay profile.bin
+
+   # Convert binary to flame graph
+   python -m profiling.sampling replay --flamegraph -o output.html profile.bin
+
+   # Convert binary to gecko format for Firefox Profiler
+   python -m profiling.sampling replay --gecko -o profile.json profile.bin
+
+   # Convert binary to heatmap
+   python -m profiling.sampling replay --heatmap -o my_heatmap profile.bin
+
+
 Profiling in production
 -----------------------
 
@@ -1041,6 +1071,57 @@ intuitive view that shows exactly where time is spent without requiring
 interpretation of hierarchical visualizations.
 
 
+Binary format
+-------------
+
+Binary format (:option:`--binary`) produces a compact binary file for efficient
+storage of profiling data::
+
+   python -m profiling.sampling run --binary -o profile.bin script.py
+   python -m profiling.sampling attach --binary -o profile.bin 12345
+
+The :option:`--compression` option controls data compression:
+
+- ``auto`` (default): Use zstd compression if available, otherwise no
+  compression
+- ``zstd``: Force zstd compression (requires zstd support)
+- ``none``: Disable compression
+
+::
+
+   python -m profiling.sampling run --binary --compression=zstd -o profile.bin script.py
+
+To analyze binary profiles, use the :ref:`replay command <replay-command>` to
+convert them to other formats like flame graphs or pstats output.
+
+
+Record and replay workflow
+==========================
+
+The binary format combined with the replay command enables a record-and-replay
+workflow that separates data capture from analysis. Rather than generating
+visualizations during profiling, you capture raw data to a compact binary file
+and convert it to different formats later.
+
+This approach has three main benefits. First, sampling runs faster because the
+work of building data structures for visualization is deferred until replay.
+Second, a single binary capture can be converted to multiple output formats
+without re-profiling---pstats for a quick overview, flame graph for visual
+exploration, heatmap for line-level detail. Third, binary files are compact
+and easy to share with colleagues who can convert them to their preferred
+format.
+
+A typical workflow::
+
+   # Capture profile in production or during tests
+   python -m profiling.sampling attach --binary -o profile.bin 12345
+
+   # Later, analyze with different formats
+   python -m profiling.sampling replay profile.bin
+   python -m profiling.sampling replay --flamegraph -o profile.html profile.bin
+   python -m profiling.sampling replay --heatmap -o heatmap profile.bin
+
+
 Live mode
 =========
 
@@ -1252,6 +1333,10 @@ Global options
 
    Attach to and profile a running process by PID.
 
+.. option:: replay
+
+   Convert a binary profile file to another output format.
+
 
 Sampling options
 ----------------
@@ -1335,12 +1420,22 @@ Output options
 
    Generate HTML heatmap with line-level sample counts.
 
+.. option:: --binary
+
+   Generate high-performance binary format for later conversion with the
+   ``replay`` command.
+
+.. option:: --compression <type>
+
+   Compression for binary format: ``auto`` (use zstd if available, default),
+   ``zstd``, or ``none``.
+
 .. option:: -o <path>, --output <path>
 
    Output file or directory path. Default behavior varies by format:
-   ``--pstats`` writes to stdout, ``--flamegraph`` and ``--gecko`` generate
-   files like ``flamegraph.PID.html``, and ``--heatmap`` creates a directory
-   named ``heatmap_PID``.
+   ``--pstats`` writes to stdout; ``--flamegraph``, ``--gecko``, and
+   ``--binary`` generate files like ``flamegraph.PID.html``; and ``--heatmap``
+   creates a directory named ``heatmap_PID``.
 
 
 pstats display options
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
index 56bc003ac3e246..e625bf2fef1912 100644
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -1653,9 +1653,11 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(collector));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compression));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(config));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context));
@@ -1718,7 +1720,9 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_tb));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_val));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
@@ -1974,6 +1978,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_callback));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol));
@@ -2014,6 +2019,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rounding));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sample_interval_us));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(script));
@@ -2053,8 +2059,10 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stack_frames));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start_time_us));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status));
@@ -2095,6 +2103,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timespec));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp_us));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeunit));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
index 8be948b92ec8f9..771f0f8cb4ad87 100644
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -376,9 +376,11 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(co_varnames)
         STRUCT_FOR_ID(code)
         STRUCT_FOR_ID(col_offset)
+        STRUCT_FOR_ID(collector)
         STRUCT_FOR_ID(command)
         STRUCT_FOR_ID(comment_factory)
         STRUCT_FOR_ID(compile_mode)
+        STRUCT_FOR_ID(compression)
         STRUCT_FOR_ID(config)
         STRUCT_FOR_ID(consts)
         STRUCT_FOR_ID(context)
@@ -441,7 +443,9 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(event)
         STRUCT_FOR_ID(eventmask)
         STRUCT_FOR_ID(exc)
+        STRUCT_FOR_ID(exc_tb)
         STRUCT_FOR_ID(exc_type)
+        STRUCT_FOR_ID(exc_val)
         STRUCT_FOR_ID(exc_value)
         STRUCT_FOR_ID(excepthook)
         STRUCT_FOR_ID(exception)
@@ -697,6 +701,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(print_file_and_line)
         STRUCT_FOR_ID(priority)
         STRUCT_FOR_ID(progress)
+        STRUCT_FOR_ID(progress_callback)
         STRUCT_FOR_ID(progress_routine)
         STRUCT_FOR_ID(proto)
         STRUCT_FOR_ID(protocol)
@@ -737,6 +742,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(reversed)
         STRUCT_FOR_ID(rounding)
         STRUCT_FOR_ID(salt)
+        STRUCT_FOR_ID(sample_interval_us)
         STRUCT_FOR_ID(sched_priority)
         STRUCT_FOR_ID(scheduler)
         STRUCT_FOR_ID(script)
@@ -776,8 +782,10 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(spam)
         STRUCT_FOR_ID(src)
         STRUCT_FOR_ID(src_dir_fd)
+        STRUCT_FOR_ID(stack_frames)
         STRUCT_FOR_ID(stacklevel)
         STRUCT_FOR_ID(start)
+        STRUCT_FOR_ID(start_time_us)
         STRUCT_FOR_ID(statement)
         STRUCT_FOR_ID(stats)
         STRUCT_FOR_ID(status)
@@ -818,6 +826,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(times)
         STRUCT_FOR_ID(timespec)
         STRUCT_FOR_ID(timestamp)
+        STRUCT_FOR_ID(timestamp_us)
         STRUCT_FOR_ID(timetuple)
         STRUCT_FOR_ID(timeunit)
         STRUCT_FOR_ID(top)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
index d381fb9d2d42a3..499a2569b9a06c 100644
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -1651,9 +1651,11 @@ extern "C" {
     INIT_ID(co_varnames), \
     INIT_ID(code), \
     INIT_ID(col_offset), \
+    INIT_ID(collector), \
     INIT_ID(command), \
     INIT_ID(comment_factory), \
     INIT_ID(compile_mode), \
+    INIT_ID(compression), \
     INIT_ID(config), \
     INIT_ID(consts), \
     INIT_ID(context), \
@@ -1716,7 +1718,9 @@ extern "C" {
     INIT_ID(event), \
     INIT_ID(eventmask), \
     INIT_ID(exc), \
+    INIT_ID(exc_tb), \
     INIT_ID(exc_type), \
+    INIT_ID(exc_val), \
     INIT_ID(exc_value), \
     INIT_ID(excepthook), \
     INIT_ID(exception), \
@@ -1972,6 +1976,7 @@ extern "C" {
     INIT_ID(print_file_and_line), \
     INIT_ID(priority), \
     INIT_ID(progress), \
+    INIT_ID(progress_callback), \
     INIT_ID(progress_routine), \
     INIT_ID(proto), \
     INIT_ID(protocol), \
@@ -2012,6 +2017,7 @@ extern "C" {
     INIT_ID(reversed), \
     INIT_ID(rounding), \
     INIT_ID(salt), \
+    INIT_ID(sample_interval_us), \
     INIT_ID(sched_priority), \
     INIT_ID(scheduler), \
     INIT_ID(script), \
@@ -2051,8 +2057,10 @@ extern "C" {
     INIT_ID(spam), \
     INIT_ID(src), \
     INIT_ID(src_dir_fd), \
+    INIT_ID(stack_frames), \
     INIT_ID(stacklevel), \
     INIT_ID(start), \
+    INIT_ID(start_time_us), \
     INIT_ID(statement), \
     INIT_ID(stats), \
     INIT_ID(status), \
@@ -2093,6 +2101,7 @@ extern "C" {
     INIT_ID(times), \
     INIT_ID(timespec), \
     INIT_ID(timestamp), \
+    INIT_ID(timestamp_us), \
     INIT_ID(timetuple), \
     INIT_ID(timeunit), \
     INIT_ID(top), \
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
index 24e50828935106..1375f46018f943 100644
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -1284,6 +1284,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(collector);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(command);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1296,6 +1300,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(compression);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(config);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1544,10 +1552,18 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(exc_tb);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(exc_type);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(exc_val);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(exc_value);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -2568,6 +2584,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(progress_callback);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(progress_routine);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -2728,6 +2748,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(sample_interval_us);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(sched_priority);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -2884,6 +2908,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(stack_frames);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(stacklevel);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -2892,6 +2920,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(start_time_us);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(statement);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
@@ -3052,6 +3084,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(timestamp_us);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(timetuple);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
diff --git a/InternalDocs/profiling_binary_format.md b/InternalDocs/profiling_binary_format.md
new file mode 100644
index 00000000000000..b4ec2b39323d32
--- /dev/null
+++ b/InternalDocs/profiling_binary_format.md
@@ -0,0 +1,442 @@
+# Profiling Binary Format
+
+The profiling module includes a binary file format for storing sampling
+profiler data. This document describes the format's structure and the
+design decisions behind it.
+
+The implementation is in
+[`Modules/_remote_debugging/binary_io.c`](../Modules/_remote_debugging/binary_io.c)
+with declarations in
+[`Modules/_remote_debugging/binary_io.h`](../Modules/_remote_debugging/binary_io.h).
+
+## Overview
+
+The sampling profiler can generate enormous amounts of data. A typical
+profiling session sampling at 1000 Hz for 60 seconds produces 60,000 samples.
+Each sample contains a full call stack, often 20-50 frames deep, and each
+frame includes a filename, function name, and line number. In a text-based
+format like collapsed stacks, this would mean repeating the same long file
+paths and function names thousands of times.
+
+The binary format addresses this through two key strategies:
+
+1. **Deduplication**: Strings and frames are stored once in lookup tables,
+   then referenced by small integer indices. A 100-character file path that
+   appears in 50,000 samples is stored once, not 50,000 times.
+
+2. **Compact encoding**: Variable-length integers (varints) encode small
+   values in fewer bytes. Since most indices are small (under 128), they
+   typically need only one byte instead of four.
+
+Together with optional zstd compression, these techniques reduce file sizes
+by 10-50x compared to text formats while also enabling faster I/O.
+
+## File Layout
+
+The file consists of five sections:
+
+```
++------------------+  Offset 0
+|     Header       |  64 bytes (fixed)
++------------------+  Offset 64
+|                  |
+|   Sample Data    |  Variable size (optionally compressed)
+|                  |
++------------------+  string_table_offset
+|   String Table   |  Variable size
++------------------+  frame_table_offset
+|   Frame Table    |  Variable size
++------------------+  file_size - 32
+|     Footer       |  32 bytes (fixed)
++------------------+  file_size
+```
+
+The layout is designed for streaming writes during profiling. The profiler
+cannot know in advance how many unique strings or frames will be encountered,
+so these tables must be built incrementally and written at the end.
+
+The header comes first so readers can quickly validate the file and locate
+the metadata tables. The sample data follows immediately, allowing the writer
+to stream samples directly to disk (or through a compression stream) without
+buffering the entire dataset in memory.
+
+The string and frame tables are placed after sample data because they grow
+as new unique entries are discovered during profiling. By deferring their
+output until finalization, the writer avoids the complexity of reserving
+space or rewriting portions of the file.
+
+The footer at the end contains counts needed to allocate arrays before
+parsing the tables. Placing it at a fixed offset from the end (rather than
+at a variable offset recorded in the header) means readers can locate it
+with a single seek to `file_size - 32`, without first reading the header.
+
+## Header
+
+```
+ Offset   Size   Type      Description
++--------+------+---------+----------------------------------------+
+|    0   |  4   | uint32  | Magic number (0x54414348 = "TACH")     |
+|    4   |  4   | uint32  | Format version (currently 2)           |
+|    8   |  8   | uint64  | Start timestamp (microseconds)         |
+|   16   |  8   | uint64  | Sample interval (microseconds)         |
+|   24   |  4   | uint32  | Total sample count                     |
+|   28   |  4   | uint32  | Thread count                           |
+|   32   |  8   | uint64  | String table offset                    |
+|   40   |  8   | uint64  | Frame table offset                     |
+|   48   |  4   | uint32  | Compression type (0=none, 1=zstd)      |
+|   52   | 12   | bytes   | Reserved (zero-filled)                 |
++--------+------+---------+----------------------------------------+
+```
+
+The header is written as zeros initially, then overwritten with actual values
+during finalization. This requires the output stream to be seekable, which
+is acceptable since the format targets regular files rather than pipes or
+network streams.
+
+## Sample Data
+
+Sample data begins at offset 64 and extends to `string_table_offset`. Samples
+use delta compression to minimize redundancy when consecutive samples from the
+same thread have identical or similar call stacks.
+
+### Stack Encoding Types
+
+Each sample record begins with thread identification, then an encoding byte:
+
+| Code | Name | Description |
+|------|------|-------------|
+| 0x00 | REPEAT | RLE: identical stack repeated N times |
+| 0x01 | FULL | Complete stack (first sample or no match) |
+| 0x02 | SUFFIX | Shares N frames from bottom of previous stack |
+| 0x03 | POP_PUSH | Remove M frames from top, add N new frames |
+
+### Record Formats
+
+**REPEAT (0x00) - Run-Length Encoded Identical Stacks:**
+```
++-----------------+-----------+----------------------------------------+
+| thread_id       | 8 bytes   | Thread identifier (uint64, fixed)      |
+| interpreter_id  | 4 bytes   | Interpreter ID (uint32, fixed)         |
+| encoding        | 1 byte    | 0x00 (REPEAT)                          |
+| count           | varint    | Number of samples in this RLE group    |
+| samples         | varies    | Interleaved: [delta: varint, status: 1]|
+|                 |           | repeated count times                   |
++-----------------+-----------+----------------------------------------+
+```
+The stack is inherited from this thread's previous sample. Each sample in the
+group gets its own timestamp delta and status byte, stored as interleaved pairs
+(delta1, status1, delta2, status2, ...) rather than separate arrays.
+
+**FULL (0x01) - Complete Stack:**
+```
++-----------------+-----------+----------------------------------------+
+| thread_id       | 8 bytes   | Thread identifier (uint64, fixed)      |
+| interpreter_id  | 4 bytes   | Interpreter ID (uint32, fixed)         |
+| encoding        | 1 byte    | 0x01 (FULL)                            |
+| timestamp_delta | varint    | Microseconds since thread's last sample|
+| status          | 1 byte    | Thread state flags                     |
+| stack_depth     | varint    | Number of frames in call stack         |
+| frame_indices   | varint[]  | Array of frame table indices           |
++-----------------+-----------+----------------------------------------+
+```
+Used for the first sample from a thread, or when delta encoding would not
+provide savings.
+
+**SUFFIX (0x02) - Shared Suffix Match:**
+```
++-----------------+-----------+----------------------------------------+
+| thread_id       | 8 bytes   | Thread identifier (uint64, fixed)      |
+| interpreter_id  | 4 bytes   | Interpreter ID (uint32, fixed)         |
+| encoding        | 1 byte    | 0x02 (SUFFIX)                          |
+| timestamp_delta | varint    | Microseconds since thread's last sample|
+| status          | 1 byte    | Thread state flags                     |
+| shared_count    | varint    | Frames shared from bottom of prev stack|
+| new_count       | varint    | New frames at top of stack             |
+| new_frames      | varint[]  | Array of new_count frame indices       |
++-----------------+-----------+----------------------------------------+
+```
+Used when a function call added frames to the top of the stack. The shared
+frames from the previous stack are kept, and new frames are prepended.
+
+**POP_PUSH (0x03) - Pop and Push:**
+```
++-----------------+-----------+----------------------------------------+
+| thread_id       | 8 bytes   | Thread identifier (uint64, fixed)      |
+| interpreter_id  | 4 bytes   | Interpreter ID (uint32, fixed)         |
+| encoding        | 1 byte    | 0x03 (POP_PUSH)                        |
+| timestamp_delta | varint    | Microseconds since thread's last sample|
+| status          | 1 byte    | Thread state flags                     |
+| pop_count       | varint    | Frames to remove from top of prev stack|
+| push_count      | varint    | New frames to add at top               |
+| new_frames      | varint[]  | Array of push_count frame indices      |
++-----------------+-----------+----------------------------------------+
+```
+Used when the code path changed: some frames were popped (function returns)
+and new frames were pushed (different function calls).
+
+### Thread and Interpreter Identification
+
+Thread IDs are 64-bit values that can be large (memory addresses on some
+platforms) and vary unpredictably. Using a fixed 8-byte encoding avoids
+the overhead of varint encoding for large values and simplifies parsing
+since the reader knows exactly where each field begins.
+
+The interpreter ID identifies which Python sub-interpreter the thread
+belongs to, allowing analysis tools to separate activity across interpreters
+in processes using multiple sub-interpreters.
+
+### Status Byte
+
+The status byte is a bitfield encoding thread state at sample time:
+
+| Bit | Flag                  | Meaning                                    |
+|-----|-----------------------|--------------------------------------------|
+|  0  | THREAD_STATUS_HAS_GIL | Thread holds the GIL (Global Interpreter Lock) |
+|  1  | THREAD_STATUS_ON_CPU  | Thread is actively running on a CPU core   |
+|  2  | THREAD_STATUS_UNKNOWN | Thread state could not be determined       |
+|  3  | THREAD_STATUS_GIL_REQUESTED | Thread is waiting to acquire the GIL  |
+|  4  | THREAD_STATUS_HAS_EXCEPTION | Thread has a pending exception         |
+
+Multiple flags can be set simultaneously (e.g., a thread can hold the GIL
+while also running on CPU). Analysis tools use these to filter samples or
+visualize thread states over time.
+
+### Timestamp Delta Encoding
+
+Timestamps use delta encoding rather than absolute values. Absolute
+timestamps in microseconds require 8 bytes each, but consecutive samples
+from the same thread are typically separated by the sampling interval
+(e.g., 1000 microseconds), so the delta between them is small and fits
+in 1-2 varint bytes. The writer tracks the previous timestamp for each
+thread separately. The first sample from a thread encodes its delta from
+the profiling start time; subsequent samples encode the delta from that
+thread's previous sample. This per-thread tracking is necessary because
+samples are interleaved across threads in arrival order, not grouped by
+thread.
+
+For REPEAT (RLE) records, timestamp deltas and status bytes are stored as
+interleaved pairs (delta, status, delta, status, ...) - one pair per
+repeated sample - allowing efficient batching while preserving the exact
+timing and state of each sample.
+
+### Frame Indexing
+
+Each frame in a call stack is represented by an index into the frame table
+rather than inline data. This provides massive space savings because call
+stacks are highly repetitive: the same function appears in many samples
+(hot functions), call stacks often share common prefixes (main -> app ->
+handler -> ...), and recursive functions create repeated frame sequences.
+A frame index is typically 1-2 varint bytes. Inline frame data would be
+20-200+ bytes (two strings plus a line number). For a profile with 100,000
+samples averaging 30 frames each, this reduces frame data from potentially
+hundreds of megabytes to a few megabytes.
+
+Frame indices are written innermost-first (the currently executing frame
+has index 0 in the array). This ordering works well with delta compression:
+function calls typically add frames at the top (index 0), while shared
+frames remain at the bottom.
+
+## String Table
+
+The string table stores deduplicated UTF-8 strings (filenames and function
+names). It begins at `string_table_offset` and contains entries in order of
+their assignment during writing:
+
+```
++----------------+
+| length: varint |
+| data: bytes    |
++----------------+  (repeated for each string)
+```
+
+Strings are stored in the order they were first encountered during writing.
+The first unique string gets index 0, the second gets index 1, and so on.
+Length-prefixing (rather than null-termination) allows strings containing
+null bytes and enables readers to allocate exact-sized buffers. The varint
+length encoding means short strings (under 128 bytes) need only one length
+byte.
+
+## Frame Table
+
+The frame table stores deduplicated frame entries:
+
+```
++----------------------+
+| filename_idx: varint |
+| funcname_idx: varint |
+| lineno: svarint      |
++----------------------+  (repeated for each frame)
+```
+
+Each unique (filename, funcname, lineno) combination gets one entry. Two
+calls to the same function at different line numbers produce different
+frame entries; two calls at the same line number share one entry.
+
+Strings and frames are deduplicated separately because they have different
+cardinalities and reference patterns. A codebase might have hundreds of
+unique source files but thousands of unique functions. Many functions share
+the same filename, so storing the filename index in each frame entry (rather
+than the full string) provides an additional layer of deduplication. A frame
+entry is just three varints (typically 3-6 bytes) rather than two full
+strings plus a line number.
+
+Line numbers use signed varint (zigzag encoding) rather than unsigned to
+handle edge cases. Synthetic frames—generated frames that don't correspond
+directly to Python source code, such as C extension boundaries or internal
+interpreter frames—use line number 0 or -1 to indicate the absence of a
+source location. Zigzag encoding ensures these small negative values encode
+efficiently (−1 becomes 1, which is one byte) rather than requiring the
+maximum varint length.
+
+## Footer
+
+```
+ Offset   Size   Type      Description
++--------+------+---------+----------------------------------------+
+|    0   |  4   | uint32  | String count                           |
+|    4   |  4   | uint32  | Frame count                            |
+|    8   |  8   | uint64  | Total file size                        |
+|   16   | 16   | bytes   | Checksum (reserved, currently zeros)   |
++--------+------+---------+----------------------------------------+
+```
+
+The string and frame counts allow readers to pre-allocate arrays of the
+correct size before parsing the tables. Without these counts, readers would
+need to either scan the tables twice (once to count, once to parse) or use
+dynamically-growing arrays.
+
+The file size field provides a consistency check: if the actual file size
+does not match, the file may be truncated or corrupted.
+
+The checksum field is reserved for future use. A checksum would allow
+detection of corruption but adds complexity and computation cost. The
+current implementation leaves this as zeros.
+
+## Variable-Length Integer Encoding
+
+The format uses LEB128 (Little Endian Base 128) for unsigned integers and
+zigzag + LEB128 for signed integers. These encodings are widely used
+(Protocol Buffers, DWARF debug info, WebAssembly) and well-understood.
+
+### Unsigned Varint (LEB128)
+
+Each byte stores 7 bits of data. The high bit indicates whether more bytes
+follow:
+
+```
+Value        Encoded bytes
+0-127        [0xxxxxxx]                    (1 byte)
+128-16383    [1xxxxxxx] [0xxxxxxx]         (2 bytes)
+16384+       [1xxxxxxx] [1xxxxxxx] ...     (3+ bytes)
+```
+
+Most indices in profiling data are small. A profile with 1000 unique frames
+needs at most 2 bytes per frame index. The common case (indices under 128)
+needs only 1 byte.
+
+### Signed Varint (Zigzag)
+
+Unsigned LEB128 would encode −1 as a very large unsigned value, requiring
+many bytes. Zigzag encoding interleaves positive and negative values:
+
+```
+ 0 -> 0    -1 -> 1     1 -> 2    -2 -> 3     2 -> 4
+```
+
+This ensures small-magnitude values (whether positive or negative) encode
+in few bytes.
+
+## Compression
+
+When compression is enabled, the sample data region contains a zstd stream.
+The string table, frame table, and footer remain uncompressed so readers can
+access metadata without decompressing the entire file. A tool that only needs
+to report "this file contains 50,000 samples of 3 threads" can read the header
+and footer without touching the compressed sample data. This also simplifies
+the format: the header's offset fields point directly to the tables rather
+than to positions within a decompressed stream.
+
+Zstd provides an excellent balance of compression ratio and speed. Profiling
+data compresses very well (often 5-10x) due to repetitive patterns: the same
+small set of frame indices appears repeatedly, and delta-encoded timestamps
+cluster around the sampling interval. Zstd's streaming API allows compression
+without buffering the entire dataset. The writer feeds sample data through
+the compressor incrementally, flushing compressed chunks to disk as they
+become available.
+
+Level 5 compression is used as a default. Lower levels (1-3) are faster but
+compress less; higher levels (6+) compress more but slow down writing. Level
+5 provides good compression with minimal impact on profiling overhead.
+
+## Reading and Writing
+
+### Writing
+
+1. Open the output file and write 64 zero bytes as a placeholder header
+2. Initialize empty string and frame dictionaries for deduplication
+3. For each sample:
+   - Intern any new strings, assigning sequential indices
+   - Intern any new frames, assigning sequential indices
+   - Encode the sample record and write to the buffer
+   - Flush the buffer through compression (if enabled) when full
+4. Flush remaining buffered data and finalize compression
+5. Write the string table (length-prefixed strings in index order)
+6. Write the frame table (varint-encoded entries in index order)
+7. Write the footer with final counts
+8. Seek to offset 0 and write the header with actual values
+
+The writer maintains two dictionaries: one mapping strings to indices, one
+mapping (filename_idx, funcname_idx, lineno) tuples to frame indices. These
+enable O(1) lookup during interning.
+
+### Reading
+
+1. Read the header and validate magic/version
+2. Seek to end − 32 and read the footer
+3. Allocate string array of `string_count` elements
+4. Parse the string table, populating the array
+5. Allocate frame array of `frame_count * 3` uint32 elements
+6. Parse the frame table, populating the array
+7. If compressed, decompress the sample data region
+8. Iterate through samples, resolving indices to strings/frames
+
+The reader builds lookup arrays rather than dictionaries since it only needs
+index-to-value mapping, not value-to-index.
+
+## Platform Considerations
+
+On Unix systems (Linux, macOS), the reader uses `mmap()` to map the file
+into the process address space. The kernel pages data in and out as
+needed, so no explicit `read()` calls or buffer management are required.
+Multiple readers can share the same physical pages, and sequential access
+patterns benefit from kernel read-ahead.
+
+The implementation uses `madvise()` to hint the access pattern to the kernel:
+`MADV_SEQUENTIAL` indicates the file will be read linearly, enabling
+aggressive read-ahead. `MADV_WILLNEED` requests pre-faulting of pages.
+On Linux, `MAP_POPULATE` pre-faults all pages at mmap time rather than on
+first access, moving page fault overhead from the parsing loop to the
+initial mapping for more predictable performance. For large files (over
+32 MB), `MADV_HUGEPAGE` requests transparent huge pages (2 MB instead of
+4 KB) to reduce TLB pressure when accessing large amounts of data.
+
+On Windows, the implementation falls back to standard file I/O with full
+file buffering. Profiling data files are typically small enough (tens to
+hundreds of megabytes) that this is acceptable.
+
+The writer uses a 512 KB buffer to batch small writes. Each sample record
+is typically tens of bytes; writing these individually would incur excessive
+syscall overhead. The buffer accumulates data until full, then flushes in
+one write() call (or feeds through the compression stream).
+
+## Future Considerations
+
+The format reserves space for future extensions. The 12 reserved bytes in
+the header could hold additional metadata. The 16-byte checksum field in
+the footer is currently unused. The version field allows incompatible
+changes with graceful rejection. New compression types could be added
+(compression_type > 1).
+
+Any changes that alter the meaning of existing fields or the parsing logic
+should increment the version number to prevent older readers from
+misinterpreting new files.
diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py
new file mode 100644
index 00000000000000..d8d38f4c078927
--- /dev/null
+++ b/Lib/profiling/sampling/binary_collector.py
@@ -0,0 +1,121 @@
+"""Thin Python wrapper around C binary writer for profiling data."""
+
+import time
+
+from .collector import Collector
+
+# Compression type constants (must match binary_io.h)
+COMPRESSION_NONE = 0
+COMPRESSION_ZSTD = 1
+
+
+def _resolve_compression(compression):
+    """Resolve compression type from string or int.
+
+    Args:
+        compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd)
+
+    Returns:
+        int: Compression type constant
+
+    Raises:
+        ValueError: If the name or int is not a known compression type
+    """
+    known = {'none': COMPRESSION_NONE, 'zstd': COMPRESSION_ZSTD}
+    if isinstance(compression, int):
+        # Validate ints too, so a bad value fails the same way a bad name does
+        if compression not in known.values():
+            raise ValueError(f"Unknown compression type: {compression}")
+        return compression
+    compression = compression.lower()
+    if compression == 'auto':
+        import _remote_debugging
+        compression = 'zstd' if _remote_debugging.zstd_available() else 'none'
+    if compression not in known:
+        raise ValueError(f"Unknown compression type: {compression}")
+    return known[compression]
+
+
+class BinaryCollector(Collector):
+    """High-performance binary collector using C implementation.
+
+    This collector writes profiling data directly to a binary file format
+    with optional zstd compression. All I/O is performed in C for maximum
+    throughput.
+
+    The binary format uses string/frame deduplication and varint encoding
+    for efficient storage.
+    """
+
+    def __init__(self, filename, sample_interval_usec, *, skip_idle=False,
+                 compression='auto'):
+        """Create a new binary collector.
+
+        Args:
+            filename: Path to output binary file
+            sample_interval_usec: Sampling interval in microseconds
+            skip_idle: Stored on the instance; not passed to the C writer
+            compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd)
+        """
+        import _remote_debugging
+
+        self.filename = filename
+        self.sample_interval_usec = sample_interval_usec
+        self.skip_idle = skip_idle
+
+        compression_type = _resolve_compression(compression)
+        start_time_us = int(time.monotonic() * 1_000_000)  # seconds -> us
+        self._writer = _remote_debugging.BinaryWriter(
+            filename, sample_interval_usec, start_time_us, compression=compression_type
+        )
+
+    def collect(self, stack_frames, timestamp_us=None):
+        """Collect profiling data from stack frames.
+
+        This passes stack_frames directly to the C writer which handles
+        all encoding and buffering.
+
+        Args:
+            stack_frames: List of InterpreterInfo objects from _remote_debugging
+            timestamp_us: Optional timestamp in microseconds. If not provided,
+                          uses time.monotonic() to generate one.
+        """
+        if timestamp_us is None:
+            timestamp_us = int(time.monotonic() * 1_000_000)
+        self._writer.write_sample(stack_frames, timestamp_us)
+
+    def collect_failed_sample(self):
+        """Record a failed sample attempt (no-op for binary format)."""
+        pass
+
+    def export(self, filename=None):
+        """Finalize and close the binary file.
+
+        Args:
+            filename: Ignored (binary files are written incrementally)
+        """
+        self._writer.finalize()
+
+    @property
+    def total_samples(self):
+        return self._writer.total_samples  # count is kept by the C writer
+
+    def get_stats(self):
+        """Get encoding statistics.
+
+        Returns:
+            Dict with encoding statistics including repeat/full/suffix/pop-push
+            record counts, frames written/saved, and compression ratio.
+        """
+        return self._writer.get_stats()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Finalize on success, else just close; exceptions propagate."""
+        if exc_type is None:
+            self._writer.finalize()
+        else:
+            self._writer.close()
+        return False  # never suppress the in-flight exception
diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py
new file mode 100644
index 00000000000000..50c96668cc585b
--- /dev/null
+++ b/Lib/profiling/sampling/binary_reader.py
@@ -0,0 +1,128 @@
+"""Thin Python wrapper around C binary reader for profiling data."""
+
+
+class BinaryReader:
+    """Context-managed Python facade over the C binary profile reader.
+
+    The C reader is created on ``__enter__`` and closed on ``__exit__``;
+    the accessor methods raise RuntimeError outside that window.
+
+    Typical use:
+        with BinaryReader('profile.bin') as reader:
+            info = reader.get_info()
+            reader.replay_samples(collector, progress_callback)
+    """
+
+    def __init__(self, filename):
+        """Remember the input path; the file is not opened until __enter__.
+
+        Args:
+            filename: Path to input binary file
+        """
+        self._reader = None
+        self.filename = filename
+
+    def __enter__(self):
+        import _remote_debugging
+        self._reader = _remote_debugging.BinaryReader(self.filename)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._reader is None:
+            return False
+        self._reader.close()
+        self._reader = None
+        return False
+
+    def _require_open(self):
+        """Return the open C reader, or raise if used outside ``with``."""
+        if self._reader is None:
+            raise RuntimeError("Reader not open. Use as context manager.")
+        return self._reader
+
+    def get_info(self):
+        """Return the file's metadata dict from the C reader.
+
+        Returns:
+            dict: File metadata including:
+                - sample_count: Number of samples in the file
+                - sample_interval_us: Sampling interval in microseconds
+                - start_time_us: Start timestamp in microseconds
+                - string_count: Number of unique strings
+                - frame_count: Number of unique frames
+                - compression_type: Compression type (key as read by cli.py;
+                  TODO confirm against the C reader)
+        """
+        return self._require_open().get_info()
+
+    def replay_samples(self, collector, progress_callback=None):
+        """Feed every sample in the file to *collector*.
+
+        Replaying through a format-specific collector (flamegraph,
+        pstats, ...) is how binary profiles are converted.
+
+        Args:
+            collector: A Collector instance with a collect() method
+            progress_callback: Optional callable(current, total)
+
+        Returns:
+            int: Number of samples replayed
+        """
+        return self._require_open().replay(collector, progress_callback)
+
+    @property
+    def sample_count(self):
+        """Number of samples in the file, per get_info()."""
+        return self._require_open().get_info()['sample_count']
+
+    def get_stats(self):
+        """Get reconstruction statistics from replay.
+
+        Returns:
+            dict: Statistics reported by the C reader's get_stats()
+        """
+        return self._require_open().get_stats()
+
+
+def convert_binary_to_format(input_file, output_file, output_format,
+                             sample_interval_usec=None, progress_callback=None):
+    """Replay a binary profile through a collector and export the result.
+
+    Args:
+        input_file: Path to input binary file
+        output_file: Path passed to the collector's export()
+        output_format: 'flamegraph', 'collapsed', 'pstats' or 'gecko'
+        sample_interval_usec: Interval override; a falsy value (None or 0)
+            falls back to the interval recorded in the file
+        progress_callback: Optional callable(current, total) for progress
+
+    Returns:
+        int: Number of samples converted
+
+    Raises:
+        ValueError: If output_format is not one of the names above
+    """
+    from .gecko_collector import GeckoCollector
+    from .stack_collector import FlamegraphCollector, CollapsedStackCollector
+    from .pstats_collector import PStatsCollector
+
+    formats = (
+        ('flamegraph', FlamegraphCollector),
+        ('collapsed', CollapsedStackCollector),
+        ('pstats', PStatsCollector),
+        ('gecko', GeckoCollector),
+    )
+    with BinaryReader(input_file) as reader:
+        file_info = reader.get_info()
+        interval = sample_interval_usec or file_info['sample_interval_us']
+
+        # Scan with == (not a dict lookup) so any output_format value compares
+        collector_type = next(
+            (cls for name, cls in formats if output_format == name), None)
+        if collector_type is None:
+            raise ValueError(f"Unknown output format: {output_format}")
+        collector = collector_type(interval)
+        converted = reader.replay_samples(collector, progress_callback)
+        collector.export(output_file)
+        return converted
diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py
index e1ff3758c0d341..266a490476936d 100644
--- a/Lib/profiling/sampling/cli.py
+++ b/Lib/profiling/sampling/cli.py
@@ -15,6 +15,8 @@
 from .stack_collector import CollapsedStackCollector, FlamegraphCollector
 from .heatmap_collector import HeatmapCollector
 from .gecko_collector import GeckoCollector
+from .binary_collector import BinaryCollector
+from .binary_reader import BinaryReader, convert_binary_to_format
 from .constants import (
     PROFILING_MODE_ALL,
     PROFILING_MODE_WALL,
@@ -74,6 +76,7 @@ class CustomFormatter(
     "flamegraph": "html",
     "gecko": "json",
     "heatmap": "html",
+    "binary": "bin",
 }
 
 COLLECTOR_MAP = {
@@ -82,6 +85,7 @@ class CustomFormatter(
     "flamegraph": FlamegraphCollector,
     "gecko": GeckoCollector,
     "heatmap": HeatmapCollector,
+    "binary": BinaryCollector,
 }
 
 def _setup_child_monitor(args, parent_pid):
@@ -179,7 +183,7 @@ def _parse_mode(mode_string):
 def _check_process_died(process):
     """Check if process died and raise an error with stderr if available."""
     if process.poll() is None:
-        return  # Process still running
+        return
 
     # Process died - try to get stderr for error message
     stderr_msg = ""
@@ -364,7 +368,7 @@ def _add_mode_options(parser):
     )
 
 
-def _add_format_options(parser):
+def _add_format_options(parser, include_compression=True):
     """Add output format options to a parser."""
     output_group = parser.add_argument_group("Output options")
     format_group = output_group.add_mutually_exclusive_group()
@@ -403,8 +407,23 @@ def _add_format_options(parser):
         dest="format",
         help="Generate interactive HTML heatmap visualization with line-level sample counts",
     )
+    format_group.add_argument(
+        "--binary",
+        action="store_const",
+        const="binary",
+        dest="format",
+        help="Generate high-performance binary format (use 'replay' command to convert)",
+    )
     parser.set_defaults(format="pstats")
 
+    if include_compression:
+        output_group.add_argument(
+            "--compression",
+            choices=["auto", "zstd", "none"],
+            default="auto",
+            help="Compression for binary format: auto (use zstd if available), zstd, none",
+        )
+
     output_group.add_argument(
         "-o",
         "--output",
@@ -459,15 +478,18 @@ def _sort_to_mode(sort_choice):
     return sort_map.get(sort_choice, SORT_MODE_NSAMPLES)
 
 
-def _create_collector(format_type, interval, skip_idle, opcodes=False):
+def _create_collector(format_type, interval, skip_idle, opcodes=False,
+                      output_file=None, compression='auto'):
     """Create the appropriate collector based on format type.
 
     Args:
-        format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap')
+        format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary')
         interval: Sampling interval in microseconds
         skip_idle: Whether to skip idle samples
         opcodes: Whether to collect opcode information (only used by gecko format
                  for creating interval markers in Firefox Profiler)
+        output_file: Output file path (required for binary format)
+        compression: Compression type for binary format ('auto', 'zstd', 'none')
 
     Returns:
         A collector instance of the appropriate type
@@ -476,6 +498,13 @@ def _create_collector(format_type, interval, skip_idle, opcodes=False):
     if collector_class is None:
         raise ValueError(f"Unknown format: {format_type}")
 
+    # Binary format requires output file and compression
+    if format_type == "binary":
+        if output_file is None:
+            raise ValueError("Binary format requires an output file")
+        return collector_class(output_file, interval, skip_idle=skip_idle,
+                              compression=compression)
+
     # Gecko format never skips idle (it needs both GIL and CPU data)
     # and is the only format that uses opcodes for interval markers
     if format_type == "gecko":
@@ -511,7 +540,12 @@ def _handle_output(collector, args, pid, mode):
         pid: Process ID (for generating filenames)
         mode: Profiling mode used
     """
-    if args.format == "pstats":
+    if args.format == "binary":
+        # Binary format already wrote to file incrementally, just finalize
+        collector.export(None)
+        filename = collector.filename
+        print(f"Binary profile written to {filename} ({collector.total_samples} samples)")
+    elif args.format == "pstats":
         if args.outfile:
             # If outfile is a directory, generate filename inside it
             if os.path.isdir(args.outfile):
@@ -544,6 +578,13 @@ def _validate_args(args, parser):
         args: Parsed command-line arguments
         parser: ArgumentParser instance for error reporting
     """
+    # Replay command has minimal validation
+    if args.command == "replay":
+        # Can't replay to binary format
+        if args.format == "binary":
+            parser.error("Cannot replay to binary format. Use a different output format.")
+        return
+
     # Check if live mode is available
     if hasattr(args, 'live') and args.live and LiveStatsCollector is None:
         parser.error(
@@ -556,7 +597,7 @@ def _validate_args(args, parser):
             parser.error("--subprocesses is incompatible with --live mode.")
 
     # Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads
-    if args.async_aware:
+    if getattr(args, 'async_aware', False):
         issues = []
         if args.native:
             issues.append("--native")
@@ -573,7 +614,7 @@ def _validate_args(args, parser):
             )
 
     # --async-mode requires --async-aware
-    if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware:
+    if hasattr(args, 'async_mode') and args.async_mode != "running" and not getattr(args, 'async_aware', False):
         parser.error("--async-mode requires --async-aware to be enabled.")
 
     # Live mode is incompatible with format options
@@ -601,7 +642,7 @@ def _validate_args(args, parser):
         return
 
     # Validate gecko mode doesn't use non-wall mode
-    if args.format == "gecko" and args.mode != "wall":
+    if args.format == "gecko" and getattr(args, 'mode', 'wall') != "wall":
         parser.error(
             "--mode option is incompatible with --gecko. "
             "Gecko format automatically includes both GIL-holding and CPU status analysis."
@@ -609,7 +650,7 @@ def _validate_args(args, parser):
 
     # Validate --opcodes is only used with compatible formats
     opcodes_compatible_formats = ("live", "gecko", "flamegraph", "heatmap")
-    if args.opcodes and args.format not in opcodes_compatible_formats:
+    if getattr(args, 'opcodes', False) and args.format not in opcodes_compatible_formats:
         parser.error(
             f"--opcodes is only compatible with {', '.join('--' + f for f in opcodes_compatible_formats)}."
         )
@@ -721,6 +762,30 @@ def main():
     _add_format_options(attach_parser)
     _add_pstats_options(attach_parser)
 
+    # === REPLAY COMMAND ===
+    replay_parser = subparsers.add_parser(
+        "replay",
+        help="Replay a binary profile and convert to another format",
+        formatter_class=CustomFormatter,
+        description="""Replay a binary profile file and convert to another format
+
+Examples:
+  # Convert binary to flamegraph
+  `python -m profiling.sampling replay --flamegraph -o output.html profile.bin`
+
+  # Convert binary to pstats and print to stdout
+  `python -m profiling.sampling replay profile.bin`
+
+  # Convert binary to gecko format
+  `python -m profiling.sampling replay --gecko -o profile.json profile.bin`""",
+    )
+    replay_parser.add_argument(
+        "input_file",
+        help="Binary profile file to replay",
+    )
+    _add_format_options(replay_parser, include_compression=False)
+    _add_pstats_options(replay_parser)
+
     # Parse arguments
     args = parser.parse_args()
 
@@ -731,6 +796,7 @@ def main():
     command_handlers = {
         "run": _handle_run,
         "attach": _handle_attach,
+        "replay": _handle_replay,
     }
 
     # Execute the appropriate command
@@ -760,8 +826,16 @@ def _handle_attach(args):
         mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False
     )
 
+    output_file = None
+    if args.format == "binary":
+        output_file = args.outfile or _generate_output_filename(args.format, args.pid)
+
     # Create the appropriate collector
-    collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes)
+    collector = _create_collector(
+        args.format, args.interval, skip_idle, args.opcodes,
+        output_file=output_file,
+        compression=getattr(args, 'compression', 'auto')
+    )
 
     with _get_child_monitor_context(args, args.pid):
         collector = sample(
@@ -829,8 +903,16 @@ def _handle_run(args):
         mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False
     )
 
+    output_file = None
+    if args.format == "binary":
+        output_file = args.outfile or _generate_output_filename(args.format, process.pid)
+
     # Create the appropriate collector
-    collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes)
+    collector = _create_collector(
+        args.format, args.interval, skip_idle, args.opcodes,
+        output_file=output_file,
+        compression=getattr(args, 'compression', 'auto')
+    )
 
     with _get_child_monitor_context(args, process.pid):
         try:
@@ -949,5 +1031,52 @@ def _handle_live_run(args):
                 process.wait()
 
 
+def _handle_replay(args):
+    """Handle the 'replay' command - convert binary profile to another format.
+
+    Exits through sys.exit() when the input file is missing or the target
+    format is 'binary'; pstats without -o is printed, the rest is exported.
+    """
+    import os
+
+    if not os.path.exists(args.input_file):
+        sys.exit(f"Error: Input file not found: {args.input_file}")
+    if args.format == "binary":
+        sys.exit("Error: Cannot replay to binary format. Use a different output format.")
+
+    def progress_callback(current, total):
+        # Redraw a 40-cell bar in place; nothing is drawn when total <= 0
+        if not total > 0:
+            return
+        pct = current / total
+        filled = int(40 * pct)
+        bar = '█' * filled + '░' * (40 - filled)
+        print(f"\r  [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True)
+
+    with BinaryReader(args.input_file) as reader:
+        info = reader.get_info()
+        interval = info['sample_interval_us']
+        print(f"Replaying {info['sample_count']} samples from {args.input_file}")
+        print(f"  Sample interval: {interval} us")
+        print(f"  Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}")
+
+        collector = _create_collector(args.format, interval, skip_idle=False)
+        count = reader.replay_samples(collector, progress_callback)
+        print()  # terminate the progress-bar line
+
+        if args.format != "pstats":
+            filename = args.outfile or _generate_output_filename(args.format, os.getpid())
+            collector.export(filename)
+        elif args.outfile:
+            collector.export(args.outfile)
+        else:
+            sort_choice = "nsamples" if args.sort is None else args.sort
+            limit = 15 if args.limit is None else args.limit
+            collector.print_stats(_sort_to_mode(sort_choice), limit,
+                                  not args.no_summary, PROFILING_MODE_WALL)
+
+        print(f"Replayed {count} samples")
+
+
 if __name__ == "__main__":
     main()
diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py
index a1f6ec190f6556..0b485bbbb4c240 100644
--- a/Lib/profiling/sampling/collector.py
+++ b/Lib/profiling/sampling/collector.py
@@ -44,8 +44,15 @@ def extract_lineno(location):
 
 class Collector(ABC):
     @abstractmethod
-    def collect(self, stack_frames):
-        """Collect profiling data from stack frames."""
+    def collect(self, stack_frames, timestamp_us=None):
+        """Collect profiling data from stack frames.
+
+        Args:
+            stack_frames: List of InterpreterInfo objects
+            timestamp_us: Optional timestamp in microseconds. If provided (from
+                binary replay), use this instead of current time. If None,
+                collectors should use time.monotonic() or similar.
+        """
 
     def collect_failed_sample(self):
         """Collect data about a failed sample attempt."""
@@ -79,6 +86,17 @@ def _iter_async_frames(self, awaited_info_list):
         # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!)
         yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent)
 
+    def _iter_stacks(self, stack_frames, skip_idle=False):
+        """Yield (frames, thread_id) for each non-empty stack, sync or async."""
+        # Entries carrying an "awaited_by" attribute take the async path
+        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
+            pairs = ((f, tid) for f, tid, _ in self._iter_async_frames(stack_frames))
+        else:
+            pairs = self._iter_all_frames(stack_frames, skip_idle=skip_idle)
+        for frames, thread_id in pairs:
+            if frames:
+                yield frames, thread_id
+
     def _build_task_graph(self, awaited_info_list):
         task_map = {}
         child_to_parent = {}  # Maps child_id -> (selected_parent_id, parent_count)
diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index 608a15da483729..c1c9cfcf3b93a9 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -66,7 +66,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False):
         self.sample_interval_usec = sample_interval_usec
         self.skip_idle = skip_idle
         self.opcodes_enabled = opcodes
-        self.start_time = time.time() * 1000  # milliseconds since epoch
+        self.start_time = time.monotonic() * 1000  # milliseconds since start
 
         # Global string table (shared across all threads)
         self.global_strings = ["(root)"]  # Start with root
@@ -103,6 +103,9 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False):
         # Opcode state tracking per thread: tid -> (opcode, lineno, col_offset, funcname, filename, start_time)
         self.opcode_state = {}
 
+        # For binary replay: track base timestamp (first sample's timestamp)
+        self._replay_base_timestamp_us = None
+
     def _track_state_transition(self, tid, condition, active_dict, inactive_dict,
                                   active_name, inactive_name, category, current_time):
         """Track binary state transitions and emit markers.
@@ -138,18 +141,35 @@ def _track_state_transition(self, tid, condition, active_dict, inactive_dict,
                 self._add_marker(tid, active_name, active_dict.pop(tid),
                                current_time, category)
 
-    def collect(self, stack_frames):
-        """Collect a sample from stack frames."""
-        current_time = (time.time() * 1000) - self.start_time
+    def collect(self, stack_frames, timestamps_us=None):
+        """Collect samples from stack frames.
+
+        Args:
+            stack_frames: List of interpreter/thread frame info
+            timestamps_us: Timestamps in microseconds, one per sample sharing these stacks (None for live sampling)
+        """
+        # Handle live sampling (no timestamps provided)
+        if timestamps_us is None:
+            current_time = (time.monotonic() * 1000) - self.start_time
+            times = [current_time]
+        else:
+            if not timestamps_us:
+                return
+            # Initialize base timestamp if needed
+            if self._replay_base_timestamp_us is None:
+                self._replay_base_timestamp_us = timestamps_us[0]
+            # Convert all timestamps to times (ms relative to first sample)
+            base = self._replay_base_timestamp_us
+            times = [(ts - base) / 1000 for ts in timestamps_us]
+
+        first_time = times[0]
 
         # Update interval calculation
         if self.sample_count > 0 and self.last_sample_time > 0:
-            self.interval = (
-                current_time - self.last_sample_time
-            ) / self.sample_count
-        self.last_sample_time = current_time
+            self.interval = (times[-1] - self.last_sample_time) / self.sample_count
+        self.last_sample_time = times[-1]
 
-        # Process threads and track GC per thread
+        # Process threads
         for interpreter_info in stack_frames:
             for thread_info in interpreter_info.threads:
                 frames = thread_info.frame_info
@@ -167,92 +187,86 @@ def collect(self, stack_frames):
                 on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU)
                 gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED)
 
-                # Track GIL possession (Has GIL / No GIL)
+                # Track state transitions using first timestamp
                 self._track_state_transition(
                     tid, has_gil, self.has_gil_start, self.no_gil_start,
-                    "Has GIL", "No GIL", CATEGORY_GIL, current_time
+                    "Has GIL", "No GIL", CATEGORY_GIL, first_time
                 )
-
-                # Track CPU state (On CPU / Off CPU)
                 self._track_state_transition(
                     tid, on_cpu, self.on_cpu_start, self.off_cpu_start,
-                    "On CPU", "Off CPU", CATEGORY_CPU, current_time
+                    "On CPU", "Off CPU", CATEGORY_CPU, first_time
                 )
 
-                # Track code type (Python Code / Native Code)
-                # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither
+                # Track code type (tri-state): Python (has GIL), Native (on CPU without GIL), or neither
                 if has_gil:
                     self._track_state_transition(
                         tid, True, self.python_code_start, self.native_code_start,
-                        "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time
+                        "Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time
                     )
                 elif on_cpu:
                     self._track_state_transition(
                         tid, True, self.native_code_start, self.python_code_start,
-                        "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time
+                        "Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time
                     )
                 else:
-                    # Thread is idle (neither has GIL nor on CPU) - close any open code markers
-                    # This handles the third state that _track_state_transition doesn't cover
                     if tid in self.initialized_threads:
                         if tid in self.python_code_start:
                             self._add_marker(tid, "Python Code", self.python_code_start.pop(tid),
-                                           current_time, CATEGORY_CODE_TYPE)
+                                           first_time, CATEGORY_CODE_TYPE)
                         if tid in self.native_code_start:
                             self._add_marker(tid, "Native Code", self.native_code_start.pop(tid),
-                                           current_time, CATEGORY_CODE_TYPE)
+                                           first_time, CATEGORY_CODE_TYPE)
 
-                # Track "Waiting for GIL" intervals (one-sided tracking)
+                # Track GIL wait
                 if gil_requested:
-                    self.gil_wait_start.setdefault(tid, current_time)
+                    self.gil_wait_start.setdefault(tid, first_time)
                 elif tid in self.gil_wait_start:
                     self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid),
-                                   current_time, CATEGORY_GIL)
+                                   first_time, CATEGORY_GIL)
 
-                # Track exception state (Has Exception / No Exception)
+                # Track exception state
                 has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION)
                 self._track_state_transition(
                     tid, has_exception, self.exception_start, self.no_exception_start,
-                    "Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time
+                    "Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time
                 )
 
-                # Track GC events by detecting <GC> frames in the stack trace
-                # This leverages the improved GC frame tracking from commit 336366fd7ca
-                # which precisely identifies the thread that initiated GC collection
+                # Track GC events
                 has_gc_frame = any(frame[2] == "<GC>" for frame in frames)
                 if has_gc_frame:
-                    # This thread initiated GC collection
                     if tid not in self.gc_start_per_thread:
-                        self.gc_start_per_thread[tid] = current_time
+                        self.gc_start_per_thread[tid] = first_time
                 elif tid in self.gc_start_per_thread:
-                    # End GC marker when no more GC frames are detected
                     self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid),
-                                   current_time, CATEGORY_GC)
+                                   first_time, CATEGORY_GC)
 
-                # Mark thread as initialized after processing all state transitions
+                # Mark thread as initialized
                 self.initialized_threads.add(tid)
 
-                # Categorize: idle if neither has GIL nor on CPU
+                # Skip idle threads if requested
                 is_idle = not has_gil and not on_cpu
-
-                # Skip idle threads if skip_idle is enabled
                 if self.skip_idle and is_idle:
                     continue
 
                 if not frames:
                     continue
 
-                # Process the stack
+                # Process stack once to get stack_index
                 stack_index = self._process_stack(thread_data, frames)
 
-                # Add sample - cache references to avoid dictionary lookups
+                # Add samples with timestamps
                 samples = thread_data["samples"]
-                samples["stack"].append(stack_index)
-                samples["time"].append(current_time)
-                samples["eventDelay"].append(None)
+                samples_stack = samples["stack"]
+                samples_time = samples["time"]
+                samples_delay = samples["eventDelay"]
+
+                for t in times:
+                    samples_stack.append(stack_index)
+                    samples_time.append(t)
+                    samples_delay.append(None)
 
-                # Track opcode state changes for interval markers (leaf frame only)
-                if self.opcodes_enabled:
+                # Track opcode state changes for interval markers (leaf frame only)
+                if self.opcodes_enabled and frames:
                     leaf_frame = frames[0]
                     filename, location, funcname, opcode = leaf_frame
                     if isinstance(location, tuple):
@@ -264,18 +278,15 @@ def collect(self, stack_frames):
                     current_state = (opcode, lineno, col_offset, funcname, filename)
 
                     if tid not in self.opcode_state:
-                        # First observation - start tracking
-                        self.opcode_state[tid] = (*current_state, current_time)
+                        self.opcode_state[tid] = (*current_state, first_time)
                     elif self.opcode_state[tid][:5] != current_state:
-                        # State changed - emit marker for previous state
                         prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid]
                         self._add_opcode_interval_marker(
-                            tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time
+                            tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time
                         )
-                        # Start tracking new state
-                        self.opcode_state[tid] = (*current_state, current_time)
+                        self.opcode_state[tid] = (*current_state, first_time)
 
-        self.sample_count += 1
+        self.sample_count += len(times)
 
     def _create_thread(self, tid):
         """Create a new thread structure with processed profile format."""
diff --git a/Lib/profiling/sampling/heatmap_collector.py b/Lib/profiling/sampling/heatmap_collector.py
index 5b4c89283be08c..4e7e359bf8903b 100644
--- a/Lib/profiling/sampling/heatmap_collector.py
+++ b/Lib/profiling/sampling/heatmap_collector.py
@@ -518,7 +518,7 @@ def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=
         }
         self.stats.update(kwargs)
 
-    def process_frames(self, frames, thread_id):
+    def process_frames(self, frames, thread_id, weight=1):
         """Process stack frames and count samples per line.
 
         Args:
@@ -526,8 +526,9 @@ def process_frames(self, frames, thread_id):
                     leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset).
                     opcode is None if not gathered.
             thread_id: Thread ID for this stack trace
+            weight: Number of samples this stack represents (for batched RLE)
         """
-        self._total_samples += 1
+        self._total_samples += weight
         self._seen_lines.clear()
 
         for i, (filename, location, funcname, opcode) in enumerate(frames):
@@ -545,15 +546,16 @@ def process_frames(self, frames, thread_id):
                 self._seen_lines.add(line_key)
 
             self._record_line_sample(filename, lineno, funcname, is_leaf=is_leaf,
-                                     count_cumulative=count_cumulative)
+                                     count_cumulative=count_cumulative, weight=weight)
 
             if opcode is not None:
                 # Set opcodes_enabled flag when we first encounter opcode data
                 self.opcodes_enabled = True
                 self._record_bytecode_sample(filename, lineno, opcode,
-                                             end_lineno, col_offset, end_col_offset)
+                                             end_lineno, col_offset, end_col_offset,
+                                             weight=weight)
 
-            # Build call graph for adjacent frames
+            # Build call graph for adjacent frames (relationships are deduplicated anyway)
             if i + 1 < len(frames):
                 next_frame = frames[i + 1]
                 next_lineno = extract_lineno(next_frame[1])
@@ -575,24 +577,25 @@ def _is_valid_frame(self, filename, lineno):
         return True
 
     def _record_line_sample(self, filename, lineno, funcname, is_leaf=False,
-                            count_cumulative=True):
+                            count_cumulative=True, weight=1):
         """Record a sample for a specific line."""
         # Track cumulative samples (all occurrences in stack)
         if count_cumulative:
-            self.line_samples[(filename, lineno)] += 1
-            self.file_samples[filename][lineno] += 1
+            self.line_samples[(filename, lineno)] += weight
+            self.file_samples[filename][lineno] += weight
 
         # Track self/leaf samples (only when at top of stack)
         if is_leaf:
-            self.line_self_samples[(filename, lineno)] += 1
-            self.file_self_samples[filename][lineno] += 1
+            self.line_self_samples[(filename, lineno)] += weight
+            self.file_self_samples[filename][lineno] += weight
 
         # Record function definition location
         if funcname and (filename, funcname) not in self.function_definitions:
             self.function_definitions[(filename, funcname)] = lineno
 
     def _record_bytecode_sample(self, filename, lineno, opcode,
-                                end_lineno=None, col_offset=None, end_col_offset=None):
+                                end_lineno=None, col_offset=None, end_col_offset=None,
+                                weight=1):
         """Record a sample for a specific bytecode instruction.
 
         Args:
@@ -602,6 +605,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode,
             end_lineno: End line number (may be -1 if not available)
             col_offset: Column offset in UTF-8 bytes (may be -1 if not available)
             end_col_offset: End column offset in UTF-8 bytes (may be -1 if not available)
+            weight: Number of samples this represents (for batched RLE)
         """
         key = (filename, lineno)
 
@@ -609,7 +613,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode,
         if opcode not in self.line_opcodes[key]:
             self.line_opcodes[key][opcode] = {'count': 0, 'locations': set()}
 
-        self.line_opcodes[key][opcode]['count'] += 1
+        self.line_opcodes[key][opcode]['count'] += weight
 
         # Store unique location info if column offset is available (not -1)
         if col_offset is not None and col_offset >= 0:
diff --git a/Lib/profiling/sampling/live_collector/collector.py b/Lib/profiling/sampling/live_collector/collector.py
index 28af2e9744545a..dcb9fcabe32779 100644
--- a/Lib/profiling/sampling/live_collector/collector.py
+++ b/Lib/profiling/sampling/live_collector/collector.py
@@ -348,7 +348,7 @@ def collect_failed_sample(self):
         self.failed_samples += 1
         self.total_samples += 1
 
-    def collect(self, stack_frames):
+    def collect(self, stack_frames, timestamps_us=None):
         """Collect and display profiling data."""
         if self.start_time is None:
             self.start_time = time.perf_counter()
diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py
index 7c154e25828a8f..1b2fe6a77278ee 100644
--- a/Lib/profiling/sampling/pstats_collector.py
+++ b/Lib/profiling/sampling/pstats_collector.py
@@ -18,7 +18,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False):
         self.skip_idle = skip_idle
         self._seen_locations = set()
 
-    def _process_frames(self, frames):
+    def _process_frames(self, frames, weight=1):
         """Process a single thread's frame stack."""
         if not frames:
             return
@@ -32,12 +32,12 @@ def _process_frames(self, frames):
             location = (frame.filename, lineno, frame.funcname)
             if location not in self._seen_locations:
                 self._seen_locations.add(location)
-                self.result[location]["cumulative_calls"] += 1
+                self.result[location]["cumulative_calls"] += weight
 
         # The top frame gets counted as an inline call (directly executing)
         top_lineno = extract_lineno(frames[0].location)
         top_location = (frames[0].filename, top_lineno, frames[0].funcname)
-        self.result[top_location]["direct_calls"] += 1
+        self.result[top_location]["direct_calls"] += weight
 
         # Track caller-callee relationships for call graph
         for i in range(1, len(frames)):
@@ -49,17 +49,12 @@ def _process_frames(self, frames):
             callee = (callee_frame.filename, callee_lineno, callee_frame.funcname)
             caller = (caller_frame.filename, caller_lineno, caller_frame.funcname)
 
-            self.callers[callee][caller] += 1
+            self.callers[callee][caller] += weight
 
-    def collect(self, stack_frames):
-        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
-            # Async frame processing
-            for frames, thread_id, task_id in self._iter_async_frames(stack_frames):
-                self._process_frames(frames)
-        else:
-            # Regular frame processing
-            for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle):
-                self._process_frames(frames)
+    def collect(self, stack_frames, timestamps_us=None):
+        weight = len(timestamps_us) if timestamps_us else 1
+        for frames, _ in self._iter_stacks(stack_frames, skip_idle=self.skip_idle):
+            self._process_frames(frames, weight=weight)
 
     def export(self, filename):
         self.create_stats()
diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py
index 294ec3003fc6bc..9c0cdce93c403e 100644
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@@ -12,6 +12,7 @@
 from .stack_collector import CollapsedStackCollector, FlamegraphCollector
 from .heatmap_collector import HeatmapCollector
 from .gecko_collector import GeckoCollector
+from .binary_collector import BinaryCollector
 from .constants import (
     PROFILING_MODE_WALL,
     PROFILING_MODE_CPU,
@@ -137,6 +138,9 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
             if self.collect_stats:
                 self._print_unwinder_stats()
 
+            if isinstance(collector, BinaryCollector):
+                self._print_binary_stats(collector)
+
         # Pass stats to flamegraph collector if it's the right type
         if hasattr(collector, 'set_stats'):
             collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@@ -278,6 +282,53 @@ def _print_unwinder_stats(self):
         if stale_invalidations > 0:
             print(f"  {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
 
+    def _print_binary_stats(self, collector):
+        """Print binary I/O encoding statistics."""
+        try:
+            stats = collector.get_stats()
+        except (ValueError, RuntimeError):
+            return  # Collector closed or stats unavailable
+
+        print(f"  {ANSIColors.CYAN}Binary Encoding:{ANSIColors.RESET}")
+
+        repeat_records = stats.get('repeat_records', 0)
+        repeat_samples = stats.get('repeat_samples', 0)
+        full_records = stats.get('full_records', 0)
+        suffix_records = stats.get('suffix_records', 0)
+        pop_push_records = stats.get('pop_push_records', 0)
+        total_records = stats.get('total_records', 0)
+
+        if total_records > 0:
+            repeat_pct = repeat_records / total_records * 100
+            full_pct = full_records / total_records * 100
+            suffix_pct = suffix_records / total_records * 100
+            pop_push_pct = pop_push_records / total_records * 100
+        else:
+            repeat_pct = full_pct = suffix_pct = pop_push_pct = 0
+
+        print(f"    Records:          {total_records:,}")
+        print(f"      RLE repeat:     {repeat_records:,} ({ANSIColors.GREEN}{repeat_pct:.1f}%{ANSIColors.RESET}) [{repeat_samples:,} samples]")
+        print(f"      Full stack:     {full_records:,} ({full_pct:.1f}%)")
+        print(f"      Suffix match:   {suffix_records:,} ({suffix_pct:.1f}%)")
+        print(f"      Pop-push:       {pop_push_records:,} ({pop_push_pct:.1f}%)")
+
+        frames_written = stats.get('total_frames_written', 0)
+        frames_saved = stats.get('frames_saved', 0)
+        compression_pct = stats.get('frame_compression_pct', 0)
+
+        print(f"  {ANSIColors.CYAN}Frame Efficiency:{ANSIColors.RESET}")
+        print(f"    Frames written:   {frames_written:,}")
+        print(f"    Frames saved:     {frames_saved:,} ({ANSIColors.GREEN}{compression_pct:.1f}%{ANSIColors.RESET})")
+
+        bytes_written = stats.get('bytes_written', 0)
+        if bytes_written >= 1024 * 1024:
+            bytes_str = f"{bytes_written / (1024 * 1024):.1f} MB"
+        elif bytes_written >= 1024:
+            bytes_str = f"{bytes_written / 1024:.1f} KB"
+        else:
+            bytes_str = f"{bytes_written} B"
+        print(f"    Bytes (pre-zstd): {bytes_str}")
+
 
 def sample(
     pid,
diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py
index e437facd8bb94b..55e643d0e9c8cb 100644
--- a/Lib/profiling/sampling/stack_collector.py
+++ b/Lib/profiling/sampling/stack_collector.py
@@ -18,21 +18,12 @@ def __init__(self, sample_interval_usec, *, skip_idle=False):
         self.sample_interval_usec = sample_interval_usec
         self.skip_idle = skip_idle
 
-    def collect(self, stack_frames, skip_idle=False):
-        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
-            # Async-aware mode: process async task frames
-            for frames, thread_id, task_id in self._iter_async_frames(stack_frames):
-                if not frames:
-                    continue
-                self.process_frames(frames, thread_id)
-        else:
-            # Sync-only mode
-            for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
-                if not frames:
-                    continue
-                self.process_frames(frames, thread_id)
+    def collect(self, stack_frames, timestamps_us=None, skip_idle=False):
+        weight = len(timestamps_us) if timestamps_us else 1
+        for frames, thread_id in self._iter_stacks(stack_frames, skip_idle=skip_idle):
+            self.process_frames(frames, thread_id, weight=weight)
 
-    def process_frames(self, frames, thread_id):
+    def process_frames(self, frames, thread_id, weight=1):
         pass
 
 
@@ -41,13 +32,13 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.stack_counter = collections.Counter()
 
-    def process_frames(self, frames, thread_id):
+    def process_frames(self, frames, thread_id, weight=1):
         # Extract only (filename, lineno, funcname) - opcode not needed for collapsed stacks
         # frame is (filename, location, funcname, opcode)
         call_tree = tuple(
             (f[0], extract_lineno(f[1]), f[2]) for f in reversed(frames)
         )
-        self.stack_counter[(call_tree, thread_id)] += 1
+        self.stack_counter[(call_tree, thread_id)] += weight
 
     def export(self, filename):
         lines = []
@@ -96,23 +87,26 @@ def __init__(self, *args, **kwargs):
         # Per-thread statistics
         self.per_thread_stats = {}  # {thread_id: {has_gil, on_cpu, gil_requested, unknown, has_exception, total, gc_samples}}
 
-    def collect(self, stack_frames, skip_idle=False):
+    def collect(self, stack_frames, timestamps_us=None, skip_idle=False):
         """Override to track thread status statistics before processing frames."""
-        # Increment sample count once per sample
-        self._sample_count += 1
+        # Weight is number of timestamps (samples with identical stack)
+        weight = len(timestamps_us) if timestamps_us else 1
+
+        # Increment sample count by weight
+        self._sample_count += weight
 
         # Collect both aggregate and per-thread statistics using base method
         status_counts, has_gc_frame, per_thread_stats = self._collect_thread_status_stats(stack_frames)
 
-        # Merge aggregate status counts
+        # Merge aggregate status counts (multiply by weight)
         for key in status_counts:
-            self.thread_status_counts[key] += status_counts[key]
+            self.thread_status_counts[key] += status_counts[key] * weight
 
         # Update aggregate GC frame count
         if has_gc_frame:
-            self.samples_with_gc_frames += 1
+            self.samples_with_gc_frames += weight
 
-        # Merge per-thread statistics
+        # Merge per-thread statistics (multiply by weight)
         for thread_id, stats in per_thread_stats.items():
             if thread_id not in self.per_thread_stats:
                 self.per_thread_stats[thread_id] = {
@@ -125,10 +119,10 @@ def collect(self, stack_frames, skip_idle=False):
                     "gc_samples": 0,
                 }
             for key, value in stats.items():
-                self.per_thread_stats[thread_id][key] += value
+                self.per_thread_stats[thread_id][key] += value * weight
 
         # Call parent collect to process frames
-        super().collect(stack_frames, skip_idle=skip_idle)
+        super().collect(stack_frames, timestamps_us, skip_idle=skip_idle)
 
     def set_stats(self, sample_interval_usec, duration_sec, sample_rate,
                   error_rate=None, missed_samples=None, mode=None):
@@ -311,7 +305,7 @@ def convert_children(children, min_samples):
             "opcode_mapping": opcode_mapping
         }
 
-    def process_frames(self, frames, thread_id):
+    def process_frames(self, frames, thread_id, weight=1):
         """Process stack frames into flamegraph tree structure.
 
         Args:
@@ -319,10 +313,11 @@ def process_frames(self, frames, thread_id):
                     leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset).
                     opcode is None if not gathered.
             thread_id: Thread ID for this stack trace
+            weight: Number of samples this stack represents (for batched RLE)
         """
         # Reverse to root->leaf order for tree building
-        self._root["samples"] += 1
-        self._total_samples += 1
+        self._root["samples"] += weight
+        self._total_samples += weight
         self._root["threads"].add(thread_id)
         self._all_threads.add(thread_id)
 
@@ -336,11 +331,11 @@ def process_frames(self, frames, thread_id):
             if node is None:
                 node = {"samples": 0, "children": {}, "threads": set(), "opcodes": collections.Counter()}
                 current["children"][func] = node
-            node["samples"] += 1
+            node["samples"] += weight
             node["threads"].add(thread_id)
 
             if opcode is not None:
-                node["opcodes"][opcode] += 1
+                node["opcodes"][opcode] += weight
 
             current = node
 
diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py
new file mode 100644
index 00000000000000..64bef181da9ba2
--- /dev/null
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py
@@ -0,0 +1,1016 @@
+"""Tests for binary format round-trip functionality."""
+
+import os
+import random
+import tempfile
+import unittest
+from collections import defaultdict
+
+try:
+    import _remote_debugging
+    from _remote_debugging import (
+        InterpreterInfo,
+        ThreadInfo,
+        FrameInfo,
+        LocationInfo,
+        THREAD_STATUS_HAS_GIL,
+        THREAD_STATUS_ON_CPU,
+        THREAD_STATUS_UNKNOWN,
+        THREAD_STATUS_GIL_REQUESTED,
+        THREAD_STATUS_HAS_EXCEPTION,
+    )
+    from profiling.sampling.binary_collector import BinaryCollector
+    from profiling.sampling.binary_reader import BinaryReader
+
+    ZSTD_AVAILABLE = _remote_debugging.zstd_available()
+except ImportError:
+    raise unittest.SkipTest(
+        "Test only runs when _remote_debugging is available"
+    )
+
+
+def make_frame(filename, lineno, funcname):
+    """Build a FrameInfo whose location starts and ends on *lineno*."""
+    single_line = LocationInfo((lineno, lineno, -1, -1))
+    return FrameInfo((filename, single_line, funcname, None))
+
+
+def make_thread(thread_id, frames, status=0):
+    """Build a ThreadInfo for *thread_id*; *status* defaults to 0."""
+    return ThreadInfo((thread_id, status, frames))
+
+
+def make_interpreter(interp_id, threads):
+    """Build an InterpreterInfo holding *threads* under *interp_id*."""
+    return InterpreterInfo((interp_id, threads))
+
+
+def extract_lineno(location):
+    """Return the line number for *location*, mapping a missing one to 0."""
+    # Tuple locations keep the line number in slot 0; others are the value.
+    lineno = location[0] if isinstance(location, tuple) else location
+    if lineno is None:
+        return 0
+    return lineno
+
+
+class RawCollector:
+    """Replay target that records every thread-sample it is given."""
+
+    def __init__(self):
+        # (interpreter_id, thread_id) -> samples seen for that thread, in order
+        self.by_thread = defaultdict(list)
+        self.total_count = 0
+
+    def collect(self, stack_frames, timestamps_us=None):
+        """Record each thread's stack once per timestamp (once if None)."""
+        # A batch of N timestamps stands for N identical samples.
+        count = 1 if timestamps_us is None else len(timestamps_us)
+        for interp in stack_frames:
+            for thread in interp.threads:
+                frames = [
+                    {
+                        "filename": frame.filename,
+                        "funcname": frame.funcname,
+                        "lineno": extract_lineno(frame.location),
+                    }
+                    for frame in thread.frame_info
+                ]
+                key = (interp.interpreter_id, thread.thread_id)
+                sample = {"status": thread.status, "frames": frames}
+                # The same dict is stored for every repeat of the batch.
+                self.by_thread[key].extend([sample] * count)
+                self.total_count += count
+
+    def export(self, filename):
+        """No-op: this collector only keeps its data in memory."""
+        pass
+
+
+def samples_to_by_thread(samples):
+    """Group expected input samples by (interpreter_id, thread_id)."""
+    grouped = defaultdict(list)
+    # Flatten sample -> interpreter -> thread, preserving input order.
+    threads = (
+        (interp.interpreter_id, thread)
+        for sample in samples
+        for interp in sample
+        for thread in interp.threads
+    )
+    for interp_id, thread in threads:
+        frames = [
+            {
+                "filename": frame.filename,
+                "funcname": frame.funcname,
+                "lineno": extract_lineno(frame.location),
+            }
+            for frame in thread.frame_info
+        ]
+        # Thread ids can repeat across interpreters, so key on both.
+        entry = {"status": thread.status, "frames": frames}
+        grouped[interp_id, thread.thread_id].append(entry)
+    return grouped
+
+
+class BinaryFormatTestBase(unittest.TestCase):
+    """Shared temp-file bookkeeping and round-trip helpers for these tests."""
+
+    def setUp(self):
+        self.temp_files = []  # paths created by create_binary_file()
+
+    def tearDown(self):
+        for f in self.temp_files:
+            if os.path.exists(f):
+                os.unlink(f)
+
+    def create_binary_file(self, samples, interval=1000, compression="none"):
+        """Write *samples* through BinaryCollector; return the temp path."""
+        with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
+            filename = f.name  # only the name is kept
+        self.temp_files.append(filename)  # tearDown deletes it
+
+        collector = BinaryCollector(
+            filename, interval, compression=compression
+        )
+        for sample in samples:
+            collector.collect(sample)
+        collector.export(None)
+        return filename
+
+    def roundtrip(self, samples, interval=1000, compression="none"):
+        """Round-trip *samples*; return (RawCollector, replayed count)."""
+        filename = self.create_binary_file(samples, interval, compression)
+        collector = RawCollector()
+        with BinaryReader(filename) as reader:
+            count = reader.replay_samples(collector)
+        return collector, count
+
+    def assert_samples_equal(self, expected_samples, collector):
+        """Assert per-thread equality of status, frame count and frames."""
+        expected = samples_to_by_thread(expected_samples)
+
+        # Both sides must contain the same (interpreter_id, thread_id) keys
+        self.assertEqual(
+            set(expected.keys()),
+            set(collector.by_thread.keys()),
+            "Thread set mismatch",
+        )
+
+        # Within each thread, compare samples pairwise in recorded order
+        for key in expected:
+            exp_samples = expected[key]
+            act_samples = collector.by_thread[key]
+            interp_id, thread_id = key
+
+            self.assertEqual(
+                len(exp_samples),
+                len(act_samples),
+                f"Thread ({interp_id}, {thread_id}): sample count mismatch "
+                f"(expected {len(exp_samples)}, got {len(act_samples)})",
+            )
+
+            for i, (exp, act) in enumerate(zip(exp_samples, act_samples)):
+                self.assertEqual(
+                    exp["status"],
+                    act["status"],
+                    f"Thread ({interp_id}, {thread_id}), sample {i}: "
+                    f"status mismatch (expected {exp['status']}, got {act['status']})",
+                )
+
+                self.assertEqual(
+                    len(exp["frames"]),
+                    len(act["frames"]),
+                    f"Thread ({interp_id}, {thread_id}), sample {i}: "
+                    f"frame count mismatch",
+                )
+
+                for j, (exp_frame, act_frame) in enumerate(
+                    zip(exp["frames"], act["frames"])
+                ):
+                    self.assertEqual(
+                        exp_frame["filename"],
+                        act_frame["filename"],
+                        f"Thread ({interp_id}, {thread_id}), sample {i}, "
+                        f"frame {j}: filename mismatch",
+                    )
+                    self.assertEqual(
+                        exp_frame["funcname"],
+                        act_frame["funcname"],
+                        f"Thread ({interp_id}, {thread_id}), sample {i}, "
+                        f"frame {j}: funcname mismatch",
+                    )
+                    self.assertEqual(
+                        exp_frame["lineno"],
+                        act_frame["lineno"],
+                        f"Thread ({interp_id}, {thread_id}), sample {i}, "
+                        f"frame {j}: lineno mismatch "
+                        f"(expected {exp_frame['lineno']}, got {act_frame['lineno']})",
+                    )
+
+
+class TestBinaryRoundTrip(BinaryFormatTestBase):
+    """Write samples, replay them, and require an exact per-thread match."""
+
+    def test_single_sample_single_frame(self):
+        """One sample holding one frame comes back unchanged."""
+        samples = [
+            [
+                make_interpreter(
+                    0,
+                    [
+                        make_thread(
+                            12345, [make_frame("test.py", 42, "myfunc")]
+                        )
+                    ],
+                )
+            ]
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_single_sample_multi_frame(self):
+        """One sample with a three-frame stack comes back unchanged."""
+        frames = [
+            make_frame("inner.py", 10, "inner"),
+            make_frame("middle.py", 20, "middle"),
+            make_frame("outer.py", 30, "outer"),
+        ]
+        samples = [[make_interpreter(0, [make_thread(100, frames)])]]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_multiple_samples_same_stack(self):
+        """100 identical samples replay as 100 (intended to exercise RLE)."""
+        frame = make_frame("hot.py", 99, "hot_func")
+        samples = [
+            [make_interpreter(0, [make_thread(1, [frame])])]
+            for _ in range(100)
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 100)
+        self.assert_samples_equal(samples, collector)
+
+    def test_multiple_samples_varying_stacks(self):
+        """20 samples with stack depths cycling 1-5 come back unchanged."""
+        samples = []
+        for i in range(20):
+            depth = i % 5 + 1
+            frames = [
+                make_frame(f"f{j}.py", j * 10 + i, f"func{j}")
+                for j in range(depth)
+            ]
+            samples.append([make_interpreter(0, [make_thread(1, frames)])])
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 20)
+        self.assert_samples_equal(samples, collector)
+
+    def test_thread_ids_preserved(self):
+        """Thread IDs, including 0x7FFF12345678, come back unchanged."""
+        thread_ids = [1, 12345, 0x7FFF12345678, 999999]
+        samples = []
+        for tid in thread_ids:
+            samples.append(
+                [
+                    make_interpreter(
+                        0, [make_thread(tid, [make_frame("t.py", 10, "f")])]
+                    )
+                ]
+            )
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(thread_ids))
+        self.assert_samples_equal(samples, collector)
+
+    def test_interpreter_ids_preserved(self):
+        """Interpreter IDs 0, 1, 5 and 100 come back unchanged."""
+        interp_ids = [0, 1, 5, 100]
+        samples = []
+        for iid in interp_ids:
+            samples.append(
+                [
+                    make_interpreter(
+                        iid, [make_thread(1, [make_frame("i.py", 10, "f")])]
+                    )
+                ]
+            )
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(interp_ids))
+        self.assert_samples_equal(samples, collector)
+
+    def test_status_flags_preserved(self):
+        """Single status flags and OR-ed combinations come back unchanged."""
+        statuses = [
+            0,
+            THREAD_STATUS_HAS_GIL,
+            THREAD_STATUS_ON_CPU,
+            THREAD_STATUS_UNKNOWN,
+            THREAD_STATUS_GIL_REQUESTED,
+            THREAD_STATUS_HAS_EXCEPTION,
+            THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU,
+            THREAD_STATUS_HAS_GIL | THREAD_STATUS_HAS_EXCEPTION,
+            THREAD_STATUS_HAS_GIL
+            | THREAD_STATUS_ON_CPU
+            | THREAD_STATUS_GIL_REQUESTED,
+        ]
+        samples = []
+        for i, status in enumerate(statuses):
+            samples.append(
+                [
+                    make_interpreter(
+                        0,
+                        [
+                            make_thread(
+                                1, [make_frame("s.py", 10 + i, "f")], status
+                            )
+                        ],
+                    )
+                ]
+            )
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(statuses))
+        self.assert_samples_equal(samples, collector)
+
+    def test_multiple_threads_per_sample(self):
+        """Three threads in each of 10 samples replay as 30 thread-samples."""
+        threads = [
+            make_thread(
+                1, [make_frame("t1.py", 10, "t1")], THREAD_STATUS_HAS_GIL
+            ),
+            make_thread(
+                2, [make_frame("t2.py", 20, "t2")], THREAD_STATUS_ON_CPU
+            ),
+            make_thread(3, [make_frame("t3.py", 30, "t3")], 0),
+        ]
+        samples = [[make_interpreter(0, threads)] for _ in range(10)]
+        collector, count = self.roundtrip(samples)
+        # 10 samples × 3 threads = 30 thread-samples
+        self.assertEqual(count, 30)
+        self.assert_samples_equal(samples, collector)
+
+    def test_multiple_interpreters_per_sample(self):
+        """Two interpreters in each of 5 samples replay as 10 thread-samples."""
+        samples = [
+            [
+                make_interpreter(
+                    0, [make_thread(1, [make_frame("i0.py", 10, "i0")])]
+                ),
+                make_interpreter(
+                    1, [make_thread(2, [make_frame("i1.py", 20, "i1")])]
+                ),
+            ]
+            for _ in range(5)
+        ]
+        collector, count = self.roundtrip(samples)
+        # 5 samples × 2 interpreters × 1 thread = 10 thread-samples
+        self.assertEqual(count, 10)
+        self.assert_samples_equal(samples, collector)
+
+    def test_same_thread_id_different_interpreters(self):
+        """thread_id 1 in interpreters 0 and 1 is kept as two streams."""
+        # This test catches bugs where thread state is keyed only by thread_id
+        # without considering interpreter_id
+        samples = []
+        # Interleave samples from interpreter 0 and 1, both using thread_id=1
+        for i in range(20):
+            interp_id = i % 2  # Alternate between interpreter 0 and 1
+            frame = make_frame(
+                f"interp{interp_id}.py", 10 + i, f"func{interp_id}"
+            )
+            samples.append(
+                [make_interpreter(interp_id, [make_thread(1, [frame])])]
+            )
+
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 20)
+        self.assert_samples_equal(samples, collector)
+
+        # Verify both interpreters are present
+        keys = set(collector.by_thread.keys())
+        self.assertIn((0, 1), keys)  # interpreter 0, thread 1
+        self.assertIn((1, 1), keys)  # interpreter 1, thread 1
+
+        # Verify each interpreter got 10 samples
+        self.assertEqual(len(collector.by_thread[(0, 1)]), 10)
+        self.assertEqual(len(collector.by_thread[(1, 1)]), 10)
+
+        # Interpreter 0 received the even i values, so its linenos step by 2
+        for i, sample in enumerate(collector.by_thread[(0, 1)]):
+            expected_lineno = 10 + i * 2  # 10, 12, 14, ...
+            self.assertEqual(sample["frames"][0]["lineno"], expected_lineno)
+            self.assertEqual(sample["frames"][0]["filename"], "interp0.py")
+
+        for i, sample in enumerate(collector.by_thread[(1, 1)]):
+            expected_lineno = 11 + i * 2  # 11, 13, 15, ...
+            self.assertEqual(sample["frames"][0]["lineno"], expected_lineno)
+            self.assertEqual(sample["frames"][0]["filename"], "interp1.py")
+
+    def test_deep_call_stack(self):
+        """A single 100-frame stack comes back unchanged."""
+        depth = 100
+        frames = [
+            make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(depth)
+        ]
+        samples = [[make_interpreter(0, [make_thread(1, frames)])]]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_line_numbers_preserved(self):
+        """Line numbers from 1 up to 100000 come back unchanged."""
+        linenos = [1, 100, 1000, 65535, 100000]
+        samples = []
+        for lineno in linenos:
+            samples.append(
+                [
+                    make_interpreter(
+                        0, [make_thread(1, [make_frame("l.py", lineno, "f")])]
+                    )
+                ]
+            )
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(linenos))
+        self.assert_samples_equal(samples, collector)
+
+    @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available")
+    def test_zstd_compression_roundtrip(self):
+        """200 three-frame samples written with zstd come back unchanged."""
+        samples = []
+        for i in range(200):
+            frames = [
+                make_frame(f"z{j}.py", j * 10 + i + 1, f"zfunc{j}")
+                for j in range(3)
+            ]
+            samples.append([make_interpreter(0, [make_thread(1, frames)])])
+        collector, count = self.roundtrip(samples, compression="zstd")
+        self.assertEqual(count, 200)
+        self.assert_samples_equal(samples, collector)
+
+    def test_sample_interval_preserved(self):
+        """get_info() reports the interval the file was written with."""
+        intervals = [100, 500, 1000, 5000, 10000]
+        for interval in intervals:
+            with self.subTest(interval=interval):
+                samples = [
+                    [
+                        make_interpreter(
+                            0, [make_thread(1, [make_frame("i.py", 1, "f")])]
+                        )
+                    ]
+                ]
+                filename = self.create_binary_file(samples, interval=interval)
+                with BinaryReader(filename) as reader:
+                    info = reader.get_info()
+                    self.assertEqual(info["sample_interval_us"], interval)
+
+    def test_threads_interleaved_samples(self):
+        """Two threads whose line and status change every sample: 60 total."""
+        samples = []
+        for i in range(30):
+            threads = [
+                make_thread(
+                    1,
+                    [make_frame("t1.py", 10 + i, "t1")],
+                    THREAD_STATUS_HAS_GIL if i % 2 == 0 else 0,
+                ),
+                make_thread(
+                    2,
+                    [make_frame("t2.py", 20 + i, "t2")],
+                    THREAD_STATUS_ON_CPU if i % 3 == 0 else 0,
+                ),
+            ]
+            samples.append([make_interpreter(0, threads)])
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 60)
+        self.assert_samples_equal(samples, collector)
+
+
+class TestBinaryEdgeCases(BinaryFormatTestBase):
+    """Unusual strings, large inputs and reader lifecycle edge cases."""
+
+    def test_unicode_filenames(self):
+        """Cyrillic, Chinese, Japanese and accented paths round-trip."""
+        filenames = [
+            "/путь/файл.py",
+            "/路径/文件.py",
+            "/パス/ファイル.py",
+            "/chemin/café.py",
+        ]
+        for fname in filenames:
+            with self.subTest(filename=fname):
+                samples = [
+                    [
+                        make_interpreter(
+                            0, [make_thread(1, [make_frame(fname, 1, "func")])]
+                        )
+                    ]
+                ]
+                collector, count = self.roundtrip(samples)
+                self.assertEqual(count, 1)
+                self.assert_samples_equal(samples, collector)
+
+    def test_unicode_funcnames(self):
+        """Non-ASCII function names round-trip unchanged."""
+        funcnames = [
+            "функция",
+            "函数",
+            "関数",
+            "función",
+        ]
+        for funcname in funcnames:
+            with self.subTest(funcname=funcname):
+                samples = [
+                    [
+                        make_interpreter(
+                            0,
+                            [
+                                make_thread(
+                                    1, [make_frame("test.py", 1, funcname)]
+                                )
+                            ],
+                        )
+                    ]
+                ]
+                collector, count = self.roundtrip(samples)
+                self.assertEqual(count, 1)
+                self.assert_samples_equal(samples, collector)
+
+    def test_special_char_filenames(self):
+        """Paths with spaces, tabs, quotes and backslashes round-trip."""
+        filenames = [
+            "/path/with spaces/file.py",
+            "/path/with\ttab/file.py",
+            "/path/with'quote/file.py",
+            '/path/with"double/file.py',
+            "/path/with\\backslash/file.py",
+        ]
+        for fname in filenames:
+            with self.subTest(filename=fname):
+                samples = [
+                    [
+                        make_interpreter(
+                            0, [make_thread(1, [make_frame(fname, 1, "func")])]
+                        )
+                    ]
+                ]
+                collector, count = self.roundtrip(samples)
+                self.assertEqual(count, 1)
+                self.assert_samples_equal(samples, collector)
+
+    def test_special_funcnames(self):
+        """Angle-bracket, dunder and dotted function names round-trip."""
+        funcnames = [
+            "<lambda>",
+            "<listcomp>",
+            "<genexpr>",
+            "<module>",
+            "__init__",
+            "func.inner",
+        ]
+        for funcname in funcnames:
+            with self.subTest(funcname=funcname):
+                samples = [
+                    [
+                        make_interpreter(
+                            0,
+                            [
+                                make_thread(
+                                    1, [make_frame("test.py", 1, funcname)]
+                                )
+                            ],
+                        )
+                    ]
+                ]
+                collector, count = self.roundtrip(samples)
+                self.assertEqual(count, 1)
+                self.assert_samples_equal(samples, collector)
+
+    def test_long_filename(self):
+        """A path with 50 repeated "sub/" components round-trips."""
+        long_file = "/very/long/path/" + "sub/" * 50 + "file.py"
+        samples = [
+            [
+                make_interpreter(
+                    0, [make_thread(1, [make_frame(long_file, 1, "func")])]
+                )
+            ]
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_long_funcname(self):
+        """A function name padded with 200 "x" characters round-trips."""
+        long_func = "very_long_function_name_" + "x" * 200
+        samples = [
+            [
+                make_interpreter(
+                    0, [make_thread(1, [make_frame("test.py", 1, long_func)])]
+                )
+            ]
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_empty_funcname(self):
+        """An empty-string function name round-trips as an empty string."""
+        samples = [
+            [
+                make_interpreter(
+                    0, [make_thread(1, [make_frame("test.py", 1, "")])]
+                )
+            ]
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available")
+    def test_large_sample_count(self):
+        """5000 samples cycling through 100 line numbers round-trip (zstd)."""
+        num = 5000
+        samples = [
+            [
+                make_interpreter(
+                    0,
+                    [
+                        make_thread(
+                            1, [make_frame("test.py", (i % 100) + 1, "func")]
+                        )
+                    ],
+                )
+            ]
+            for i in range(num)
+        ]
+        collector, count = self.roundtrip(samples, compression="zstd")
+        self.assertEqual(count, num)
+        self.assert_samples_equal(samples, collector)
+
+    def test_context_manager_cleanup(self):
+        """Replaying after the with-block has exited raises RuntimeError."""
+        samples = [
+            [
+                make_interpreter(
+                    0, [make_thread(1, [make_frame("t.py", 1, "f")])]
+                )
+            ]
+        ]
+        filename = self.create_binary_file(samples)
+        reader = BinaryReader(filename)
+        with reader:
+            collector = RawCollector()
+            count = reader.replay_samples(collector)
+            self.assertEqual(count, 1)
+        with self.assertRaises(RuntimeError):
+            reader.replay_samples(collector)
+
+    def test_invalid_file_path(self):
+        """A missing file raises FileNotFoundError, OSError or ValueError."""
+        with self.assertRaises((FileNotFoundError, OSError, ValueError)):
+            with BinaryReader("/nonexistent/path/file.bin") as reader:
+                reader.replay_samples(RawCollector())
+
+
+class TestBinaryEncodings(BinaryFormatTestBase):
+    """Sample sequences shaped to reach each stack encoding in the writer."""
+
+    def test_stack_full_encoding(self):
+        """A lone 5-frame sample round-trips (expected to use STACK_FULL)."""
+        frames = [make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(5)]
+        samples = [[make_interpreter(0, [make_thread(1, frames)])]]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1)
+        self.assert_samples_equal(samples, collector)
+
+    def test_stack_repeat_encoding(self):
+        """1000 identical consecutive samples round-trip (expected RLE)."""
+        frame = make_frame("repeat.py", 42, "repeat_func")
+        samples = [
+            [make_interpreter(0, [make_thread(1, [frame])])]
+            for _ in range(1000)
+        ]
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 1000)
+        self.assert_samples_equal(samples, collector)
+
+    def test_stack_suffix_encoding(self):
+        """A new first frame over 5 shared frames (expected STACK_SUFFIX)."""
+        samples = []
+        for i in range(10):
+            frames = [make_frame(f"new{i}.py", i + 1, f"new{i}")]
+            frames.extend(
+                [
+                    make_frame(f"shared{j}.py", j + 1, f"shared{j}")
+                    for j in range(5)
+                ]
+            )
+            samples.append([make_interpreter(0, [make_thread(1, frames)])])
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 10)
+        self.assert_samples_equal(samples, collector)
+
+    def test_stack_pop_push_encoding(self):
+        """A stack that grows by two calls and unwinds again round-trips."""
+        samples = []
+        base_frames = [make_frame("base.py", 10, "base")]
+
+        # Call deeper: base, then call1 on top, then call2 on top of call1
+        samples.append([make_interpreter(0, [make_thread(1, base_frames)])])
+        samples.append(
+            [
+                make_interpreter(
+                    0,
+                    [
+                        make_thread(
+                            1,
+                            [make_frame("call1.py", 20, "call1")]
+                            + base_frames,
+                        )
+                    ],
+                )
+            ]
+        )
+        samples.append(
+            [
+                make_interpreter(
+                    0,
+                    [
+                        make_thread(
+                            1,
+                            [
+                                make_frame("call2.py", 30, "call2"),
+                                make_frame("call1.py", 20, "call1"),
+                            ]
+                            + base_frames,
+                        )
+                    ],
+                )
+            ]
+        )
+        # Return: call1 is now at line 25 rather than 20, then back to base
+        samples.append(
+            [
+                make_interpreter(
+                    0,
+                    [
+                        make_thread(
+                            1,
+                            [make_frame("call1.py", 25, "call1")]
+                            + base_frames,
+                        )
+                    ],
+                )
+            ]
+        )
+        samples.append([make_interpreter(0, [make_thread(1, base_frames)])])
+
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 5)
+        self.assert_samples_equal(samples, collector)
+
+    def test_mixed_encodings(self):
+        """20 repeated, 20 distinct, then 20 repeated samples round-trip."""
+        samples = []
+        # Some repeated samples (RLE)
+        frame1 = make_frame("hot.py", 1, "hot")
+        for _ in range(20):
+            samples.append([make_interpreter(0, [make_thread(1, [frame1])])])
+        # Some varying samples
+        for i in range(20):
+            frames = [make_frame(f"vary{i}.py", i + 1, f"vary{i}")]
+            samples.append([make_interpreter(0, [make_thread(1, frames)])])
+        # More repeated
+        for _ in range(20):
+            samples.append([make_interpreter(0, [make_thread(1, [frame1])])])
+
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 60)
+        self.assert_samples_equal(samples, collector)
+
+    def test_alternating_threads_status_changes(self):
+        """Fixed stacks whose status changes between samples round-trip."""
+        samples = []
+        for i in range(50):
+            status1 = THREAD_STATUS_HAS_GIL if i % 2 == 0 else 0
+            status2 = (
+                THREAD_STATUS_ON_CPU if i % 3 == 0 else THREAD_STATUS_HAS_GIL
+            )
+            threads = [
+                make_thread(1, [make_frame("t1.py", 10, "t1")], status1),
+                make_thread(2, [make_frame("t2.py", 20, "t2")], status2),
+            ]
+            samples.append([make_interpreter(0, threads)])
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, 100)
+        self.assert_samples_equal(samples, collector)
+
+
+class TestBinaryStress(BinaryFormatTestBase):
+    """Randomized stress tests for binary format."""
+
+    @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available")
+    def test_random_samples_stress(self):
+        """Round-trip 1000 randomly built samples using zstd compression."""
+        rng = random.Random(42)  # Reproducible; global RNG left untouched
+
+        # Large pools to force hash table resizing (exceeds initial 8192/4096 sizes)
+        filenames = [f"file{i}.py" for i in range(200)]
+        funcnames = [f"func{i}" for i in range(300)]
+        thread_ids = list(range(1, 50))
+        interp_ids = list(range(10))
+        statuses = [
+            0,
+            THREAD_STATUS_HAS_GIL,
+            THREAD_STATUS_ON_CPU,
+            THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU,
+            THREAD_STATUS_HAS_EXCEPTION,
+        ]
+
+        samples = []
+        for _ in range(1000):
+            num_interps = rng.randint(1, 3)
+            interps = []
+            for _ in range(num_interps):
+                iid = rng.choice(interp_ids)
+                num_threads = rng.randint(1, 5)
+                threads = []
+                for _ in range(num_threads):
+                    tid = rng.choice(thread_ids)
+                    status = rng.choice(statuses)
+                    depth = rng.randint(1, 15)
+                    frames = []
+                    for _ in range(depth):
+                        fname = rng.choice(filenames)
+                        func = rng.choice(funcnames)
+                        # Wide line number range to create many unique frames
+                        lineno = rng.randint(1, 5000)
+                        frames.append(make_frame(fname, lineno, func))
+                    threads.append(make_thread(tid, frames, status))
+                interps.append(make_interpreter(iid, threads))
+            samples.append(interps)
+
+        collector, count = self.roundtrip(samples, compression="zstd")
+        self.assertGreater(count, 0)
+        self.assert_samples_equal(samples, collector)
+
+    def test_rle_stress(self):
+        """Stress test RLE encoding with long runs of identical samples.
+
+        Five fixed stacks are built, then 100 runs are emitted; each run
+        repeats one randomly chosen stack 1-50 times on thread 1 of
+        interpreter 0.  The replayed sample count must equal the number
+        written.
+        """
+        # Private generator: same draws as seeding the module-level RNG with
+        # 123, without altering global random state for other tests.
+        rng = random.Random(123)
+
+        # Create a few distinct stacks
+        stacks = []
+        for _ in range(5):
+            depth = rng.randint(1, 8)
+            frames = [
+                make_frame(f"rle{j}.py", j * 10, f"rle{j}")
+                for j in range(depth)
+            ]
+            stacks.append(frames)
+
+        # Generate samples with repeated stacks (should trigger RLE)
+        samples = []
+        for _ in range(100):
+            stack = rng.choice(stacks)
+            repeat = rng.randint(1, 50)
+            for _ in range(repeat):
+                samples.append([make_interpreter(0, [make_thread(1, stack)])])
+
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(samples))
+        self.assert_samples_equal(samples, collector)
+
+    @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available")
+    def test_multi_thread_stress(self):
+        """Stress test with many threads and interleaved samples.
+
+        Each of the 300 samples carries 1-5 distinct threads picked from 19
+        thread ids, so consecutive samples mix different thread subsets.
+        Line numbers depend on the sample index, which changes every stack
+        from one sample to the next.
+        """
+        # Private generator: same draws as seeding the module-level RNG with
+        # 456, without altering global random state for other tests.
+        rng = random.Random(456)
+
+        thread_ids = list(range(1, 20))
+        samples = []
+
+        for i in range(300):
+            # Randomly select 1-5 threads for this sample
+            num_threads = rng.randint(1, 5)
+            selected = rng.sample(thread_ids, num_threads)
+            threads = []
+            for tid in selected:
+                status = rng.choice(
+                    [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU]
+                )
+                depth = rng.randint(1, 5)
+                frames = [
+                    make_frame(f"mt{tid}_{j}.py", i + j, f"f{j}")
+                    for j in range(depth)
+                ]
+                threads.append(make_thread(tid, frames, status))
+            samples.append([make_interpreter(0, threads)])
+
+        collector, count = self.roundtrip(samples, compression="zstd")
+        self.assertGreater(count, 0)
+        self.assert_samples_equal(samples, collector)
+
+    def test_encoding_transitions_stress(self):
+        """Stress test stack encoding transitions.
+
+        Emits 200 single-thread samples whose stacks alternate at random
+        between unrelated stacks, prefixes of a shared base stack, and base
+        prefixes with fresh frames placed in front, so consecutive samples
+        relate to each other in different ways.
+        """
+        # Private generator: same draws as seeding the module-level RNG with
+        # 789, without altering global random state for other tests.
+        rng = random.Random(789)
+
+        base_frames = [
+            make_frame(f"base{i}.py", i, f"base{i}") for i in range(5)
+        ]
+        samples = []
+
+        for i in range(200):
+            choice = rng.randint(0, 4)
+            if choice == 0:
+                # Full new stack
+                depth = rng.randint(1, 8)
+                frames = [
+                    make_frame(f"new{i}_{j}.py", j, f"new{j}")
+                    for j in range(depth)
+                ]
+            elif choice == 1:
+                # Prefix of the base stack; two consecutive identical picks
+                # give the writer a chance to use RLE
+                frames = base_frames[: rng.randint(1, 5)]
+            elif choice == 2:
+                # Add frames on top (aimed at suffix encoding)
+                extra = rng.randint(1, 3)
+                frames = [
+                    make_frame(f"top{i}_{j}.py", j, f"top{j}")
+                    for j in range(extra)
+                ]
+                frames.extend(base_frames[: rng.randint(2, 4)])
+            else:
+                # choice 3 or 4: pop and push (aimed at pop-push encoding);
+                # push may be 0, leaving only a base prefix
+                keep = rng.randint(1, 3)
+                push = rng.randint(0, 2)
+                frames = [
+                    make_frame(f"push{i}_{j}.py", j, f"push{j}")
+                    for j in range(push)
+                ]
+                frames.extend(base_frames[:keep])
+
+            samples.append([make_interpreter(0, [make_thread(1, frames)])])
+
+        collector, count = self.roundtrip(samples)
+        self.assertEqual(count, len(samples))
+        self.assert_samples_equal(samples, collector)
+
+    @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available")
+    def test_same_thread_id_multiple_interpreters_stress(self):
+        """Stress test: one thread_id reused across several interpreters.
+
+        This test catches bugs where thread state is keyed only by thread_id
+        without considering interpreter_id (both in writer and reader).
+        Each of the 1000 samples targets one random interpreter and 1-3 of
+        the shared thread ids, so samples for the same thread_id arrive
+        interleaved from different interpreters.
+        """
+        # Private generator: same draws as seeding the module-level RNG with
+        # 999, without altering global random state for other tests.
+        rng = random.Random(999)
+
+        # Multiple interpreters, each with overlapping thread_ids
+        interp_ids = [0, 1, 2, 3]
+        # Same thread_ids used across all interpreters
+        shared_thread_ids = [1, 2, 3]
+
+        filenames = [f"file{i}.py" for i in range(10)]
+        funcnames = [f"func{i}" for i in range(15)]
+        statuses = [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU]
+
+        samples = []
+        for i in range(1000):
+            # Randomly pick an interpreter
+            iid = rng.choice(interp_ids)
+            # Randomly pick 1-3 threads (from shared pool)
+            num_threads = rng.randint(1, 3)
+            selected_tids = rng.sample(shared_thread_ids, num_threads)
+
+            threads = []
+            for tid in selected_tids:
+                status = rng.choice(statuses)
+                depth = rng.randint(1, 6)
+                frames = []
+                for d in range(depth):
+                    # Include interpreter and thread info in frame data for verification
+                    fname = f"i{iid}_t{tid}_{rng.choice(filenames)}"
+                    func = rng.choice(funcnames)
+                    lineno = i * 10 + d + 1  # Unique per sample
+                    frames.append(make_frame(fname, lineno, func))
+                threads.append(make_thread(tid, frames, status))
+
+            samples.append([make_interpreter(iid, threads)])
+
+        collector, count = self.roundtrip(samples, compression="zstd")
+        self.assertGreater(count, 0)
+        self.assert_samples_equal(samples, collector)
+
+        # Verify that we have samples from multiple (interpreter, thread) combinations
+        # with the same thread_id
+        keys = set(collector.by_thread.keys())
+        # Should have samples for same thread_id in different interpreters
+        for tid in shared_thread_ids:
+            interps_with_tid = [iid for (iid, t) in keys if t == tid]
+            self.assertGreater(
+                len(interps_with_tid),
+                1,
+                f"Thread {tid} should appear in multiple interpreters",
+            )
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst
new file mode 100644
index 00000000000000..f9c2cee51d1dcd
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst
@@ -0,0 +1,4 @@
+Add binary output format to :mod:`profiling.sampling` for compact storage of
+profiling data. The new ``--binary`` option captures samples to a file that
+can be converted to other formats using the ``replay`` command. Patch by
+Pablo Galindo.
diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in
index acb08400e24e2e..2a4b937ce6bf80 100644
--- a/Modules/Setup.stdlib.in
+++ b/Modules/Setup.stdlib.in
@@ -41,7 +41,7 @@
 @MODULE__PICKLE_TRUE@_pickle _pickle.c
 @MODULE__QUEUE_TRUE@_queue _queuemodule.c
 @MODULE__RANDOM_TRUE@_random _randommodule.c
-@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/subprocess.c
+@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c _remote_debugging/subprocess.c
 @MODULE__STRUCT_TRUE@_struct _struct.c
 
 # build supports subinterpreters
diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h
index 2f3efedd1e0ed5..9557a200b237e9 100644
--- a/Modules/_remote_debugging/_remote_debugging.h
+++ b/Modules/_remote_debugging/_remote_debugging.h
@@ -16,7 +16,9 @@ extern "C" {
 #endif
 
 #ifndef Py_BUILD_CORE_BUILTIN
+#  ifndef Py_BUILD_CORE_MODULE
 #    define Py_BUILD_CORE_MODULE 1
+#  endif
 #endif
 
 #include "Python.h"
@@ -205,6 +207,8 @@ typedef struct {
     PyTypeObject *ThreadInfo_Type;
     PyTypeObject *InterpreterInfo_Type;
     PyTypeObject *AwaitedInfo_Type;
+    PyTypeObject *BinaryWriter_Type;
+    PyTypeObject *BinaryReader_Type;
 } RemoteDebuggingState;
 
 enum _ThreadState {
diff --git a/Modules/_remote_debugging/binary_io.h b/Modules/_remote_debugging/binary_io.h
new file mode 100644
index 00000000000000..e730fa8d9ace5c
--- /dev/null
+++ b/Modules/_remote_debugging/binary_io.h
@@ -0,0 +1,585 @@
+/******************************************************************************
+ * Python Remote Debugging Module - Binary I/O Header
+ *
+ * This header provides declarations for high-performance binary file I/O
+ * for profiling data with optional zstd streaming compression.
+ ******************************************************************************/
+
+#ifndef Py_BINARY_IO_H
+#define Py_BINARY_IO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "Python.h"
+#include "pycore_hashtable.h"
+#include <stdint.h>
+#include <stdio.h>
+
+/* ============================================================================
+ * BINARY FORMAT CONSTANTS
+ * ============================================================================ */
+
+/* File signature and format revision.  The reader compares the first four
+ * header bytes against the magic and raises ValueError on a mismatch. */
+#define BINARY_FORMAT_MAGIC     0x54414348  /* "TACH" (Tachyon) */
+#define BINARY_FORMAT_VERSION   2
+
+/* Buffer sizes: 512KB balances syscall amortization against memory use,
+ * and aligns well with filesystem block sizes and zstd dictionary windows */
+#define WRITE_BUFFER_SIZE       (512 * 1024)
+#define COMPRESSED_BUFFER_SIZE  (512 * 1024)
+
+/* Compression types (values accepted as compression_type by
+ * binary_writer_create) */
+#define COMPRESSION_NONE        0
+#define COMPRESSION_ZSTD        1
+
+/* Stack encoding types for delta compression.  Each sample record is encoded
+ * relative to the previous stack of the same thread using one of these. */
+#define STACK_REPEAT            0x00  /* RLE: identical to previous, with count */
+#define STACK_FULL              0x01  /* Full stack (first sample or no match) */
+#define STACK_SUFFIX            0x02  /* Shares N frames from bottom */
+#define STACK_POP_PUSH          0x03  /* Remove M frames, add N frames */
+
+/* Maximum stack depth we'll buffer for delta encoding */
+#define MAX_STACK_DEPTH         256
+
+/* Initial capacity for RLE pending buffer */
+#define INITIAL_RLE_CAPACITY    64
+
+/* Initial capacities for dynamic arrays - sized to reduce reallocations.
+ * These are element counts, not byte sizes. */
+#define INITIAL_STRING_CAPACITY 4096
+#define INITIAL_FRAME_CAPACITY  4096
+#define INITIAL_THREAD_CAPACITY 256
+
+/* ============================================================================
+ * STATISTICS STRUCTURES
+ * ============================================================================ */
+
+/* Writer statistics - tracks encoding efficiency.
+ *
+ * Held in BinaryWriter.stats.  The *_records counters correspond to the
+ * STACK_REPEAT, STACK_FULL, STACK_SUFFIX and STACK_POP_PUSH record types. */
+typedef struct {
+    uint64_t repeat_records;      /* Number of RLE repeat records written */
+    uint64_t repeat_samples;      /* Total samples encoded via RLE */
+    uint64_t full_records;        /* Number of full stack records */
+    uint64_t suffix_records;      /* Number of suffix match records */
+    uint64_t pop_push_records;    /* Number of pop-push records */
+    uint64_t total_frames_written;/* Total frame indices written */
+    uint64_t frames_saved;        /* Frames avoided due to delta encoding */
+    uint64_t bytes_written;       /* Total bytes written (before compression) */
+} BinaryWriterStats;
+
+/* Reader statistics - tracks reconstruction performance.
+ *
+ * Held in BinaryReader.stats.  The *_records counters correspond to the
+ * STACK_REPEAT, STACK_FULL, STACK_SUFFIX and STACK_POP_PUSH record types. */
+typedef struct {
+    uint64_t repeat_records;      /* RLE records decoded */
+    uint64_t repeat_samples;      /* Samples decoded from RLE */
+    uint64_t full_records;        /* Full stack records decoded */
+    uint64_t suffix_records;      /* Suffix match records decoded */
+    uint64_t pop_push_records;    /* Pop-push records decoded */
+    uint64_t total_samples;       /* Total samples reconstructed */
+    uint64_t stack_reconstructions; /* Number of stack array reconstructions */
+} BinaryReaderStats;
+
+/* ============================================================================
+ * PLATFORM ABSTRACTION
+ * ============================================================================ */
+
+/* USE_MMAP selects which set of file fields BinaryReader declares: an fd plus
+ * mapped region when 1, a FILE* plus in-memory copy when 0.  It is enabled
+ * only on Linux and macOS. */
+#if defined(__linux__) || defined(__APPLE__)
+    #include <sys/mman.h>
+    #include <unistd.h>
+    #include <sys/stat.h>
+    #include <fcntl.h>
+    #define USE_MMAP 1
+#else
+    #define USE_MMAP 0
+#endif
+
+/* 64-bit file position support for files larger than 2GB.
+ * On POSIX: use ftello/fseeko with off_t (already 64-bit on 64-bit systems)
+ * On Windows: use _ftelli64/_fseeki64 with __int64 */
+#if defined(_WIN32) || defined(_WIN64)
+    #include <io.h>
+    typedef __int64 file_offset_t;
+    #define FTELL64(fp) _ftelli64(fp)
+    #define FSEEK64(fp, offset, whence) _fseeki64(fp, offset, whence)
+#else
+    /* POSIX - off_t is 64-bit on 64-bit systems, ftello/fseeko handle large files */
+    /* NOTE(review): on 32-bit POSIX builds off_t is only 64-bit when large-file
+     * support is enabled at compile time - confirm the build guarantees it. */
+    typedef off_t file_offset_t;
+    #define FTELL64(fp) ftello(fp)
+    #define FSEEK64(fp, offset, whence) fseeko(fp, offset, whence)
+#endif
+
+/* Pull in the zstd API (ZSTD_CCtx and friends) when the build has zstd */
+#ifdef HAVE_ZSTD
+#include <zstd.h>
+#endif
+
+/* Branch prediction hints - same as Objects/obmalloc.c.
+ * They expand to __builtin_expect only for optimized GCC/Clang builds and to
+ * the bare expression everywhere else, so they never change semantics. */
+#if (defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))) && defined(__OPTIMIZE__)
+#  define UNLIKELY(value) __builtin_expect((value), 0)
+#  define LIKELY(value) __builtin_expect((value), 1)
+#else
+#  define UNLIKELY(value) (value)
+#  define LIKELY(value) (value)
+#endif
+
+/* ============================================================================
+ * BINARY WRITER STRUCTURES
+ * ============================================================================ */
+
+/* zstd compression state (only used if HAVE_ZSTD defined).
+ *
+ * Embedded by value in BinaryWriter as its `zstd` member.  Without HAVE_ZSTD
+ * the context is an untyped placeholder so the struct keeps the same set of
+ * members in both configurations. */
+typedef struct {
+#ifdef HAVE_ZSTD
+    ZSTD_CCtx *cctx;  /* Modern API: CCtx and CStream are the same since v1.3.0 */
+#else
+    void *cctx;  /* Placeholder */
+#endif
+    uint8_t *compressed_buffer;     /* Staging area for compressed output */
+    size_t compressed_buffer_size;  /* presumably COMPRESSED_BUFFER_SIZE - confirm in writer */
+} ZstdCompressor;
+
+/* Frame entry - combines all frame data for better cache locality.
+ * Element type of BinaryWriter.frame_entries; has the same three members as
+ * FrameKey. */
+typedef struct {
+    uint32_t filename_idx;  /* Index of the file name (presumably into the string table) */
+    uint32_t funcname_idx;  /* Index of the function name (presumably into the string table) */
+    int32_t lineno;         /* Line number; signed 32-bit */
+} FrameEntry;
+
+/* Frame key for hash table lookup.
+ * Key type of BinaryWriter.frame_hash (FrameKey* -> uint32_t frame index);
+ * has the same three members as FrameEntry. */
+typedef struct {
+    uint32_t filename_idx;  /* Index of the file name (presumably into the string table) */
+    uint32_t funcname_idx;  /* Index of the function name (presumably into the string table) */
+    int32_t lineno;         /* Line number; signed 32-bit */
+} FrameKey;
+
+/* Pending RLE sample - buffered for run-length encoding.
+ * Element type of ThreadEntry.pending_rle; it carries no stack of its own. */
+typedef struct {
+    uint64_t timestamp_delta;  /* presumably microseconds since the thread's previous sample - confirm in writer */
+    uint8_t status;            /* Thread status value recorded for this sample */
+} PendingRLESample;
+
+/* Thread entry - tracks per-thread state for delta encoding.
+ *
+ * Element type of BinaryWriter.thread_entries.  Each entry records both a
+ * thread_id and an interpreter_id; presumably the pair identifies the entry,
+ * since thread ids can repeat across interpreters - confirm in writer. */
+typedef struct {
+    uint64_t thread_id;
+    uint64_t prev_timestamp;   /* Timestamp of this thread's previous sample */
+    uint32_t interpreter_id;
+
+    /* Previous stack for delta encoding (frame indices, innermost first) */
+    uint32_t *prev_stack;
+    size_t prev_stack_depth;     /* Number of valid entries in prev_stack */
+    size_t prev_stack_capacity;  /* Allocated entries in prev_stack */
+
+    /* RLE pending buffer - samples waiting to be written as a repeat group */
+    PendingRLESample *pending_rle;
+    size_t pending_rle_count;     /* Number of buffered samples */
+    size_t pending_rle_capacity;  /* Allocated entries in pending_rle */
+    int has_pending_rle;  /* Flag: do we have buffered repeats? */
+} ThreadEntry;
+
+/* Main binary writer structure.
+ *
+ * Created by binary_writer_create(), fed by binary_writer_write_sample(),
+ * completed by binary_writer_finalize() and released by
+ * binary_writer_destroy().  Every dynamic array below is tracked as a
+ * pointer / count / capacity triple. */
+typedef struct {
+    FILE *fp;
+    char *filename;
+
+    /* Write buffer for batched I/O */
+    uint8_t *write_buffer;
+    size_t buffer_pos;   /* Current fill position within write_buffer */
+    size_t buffer_size;  /* Allocated size of write_buffer in bytes */
+
+    /* Compression */
+    int compression_type;  /* COMPRESSION_NONE or COMPRESSION_ZSTD */
+    ZstdCompressor zstd;
+
+    /* Metadata */
+    uint64_t start_time_us;
+    uint64_t sample_interval_us;
+    /* NOTE(review): 32-bit counter while the stats counters are 64-bit;
+     * confirm the writer handles runs exceeding UINT32_MAX samples. */
+    uint32_t total_samples;
+
+    /* String hash table: PyObject* -> uint32_t index */
+    _Py_hashtable_t *string_hash;
+    /* String storage: array of UTF-8 encoded strings */
+    char **strings;
+    size_t *string_lengths;  /* Parallel to strings: length of each entry */
+    size_t string_count;
+    size_t string_capacity;
+
+    /* Frame hash table: FrameKey* -> uint32_t index */
+    _Py_hashtable_t *frame_hash;
+    /* Frame storage: combined struct for better cache locality */
+    FrameEntry *frame_entries;
+    size_t frame_count;
+    size_t frame_capacity;
+
+    /* Thread timestamp tracking for delta encoding - combined for cache locality */
+    ThreadEntry *thread_entries;
+    size_t thread_count;
+    size_t thread_capacity;
+
+    /* Statistics */
+    BinaryWriterStats stats;
+} BinaryWriter;
+
+/* ============================================================================
+ * BINARY READER STRUCTURES
+ * ============================================================================ */
+
+/* Per-thread state for stack reconstruction during replay.
+ * Element type of BinaryReader.thread_states; the reader-side counterpart of
+ * the writer's ThreadEntry, without any RLE buffering. */
+typedef struct {
+    uint64_t thread_id;
+    uint32_t interpreter_id;
+    uint64_t prev_timestamp;  /* Timestamp of this thread's previous sample */
+
+    /* Reconstructed stack buffer (frame indices, innermost first) */
+    uint32_t *current_stack;
+    size_t current_stack_depth;     /* Number of valid entries in current_stack */
+    size_t current_stack_capacity;  /* Allocated entries in current_stack */
+} ReaderThreadState;
+
+/* Main binary reader structure.
+ *
+ * Created by binary_reader_open() and released by binary_reader_close().
+ * The raw file bytes live either in a memory mapping (USE_MMAP) or in a
+ * heap copy read through stdio, depending on the platform. */
+typedef struct {
+    char *filename;
+
+#if USE_MMAP
+    int fd;
+    uint8_t *mapped_data;  /* Start of the mapped file contents */
+    size_t mapped_size;    /* Length of the mapping in bytes */
+#else
+    FILE *fp;
+    uint8_t *file_data;    /* In-memory copy of the file contents */
+    size_t file_size;      /* Length of file_data in bytes */
+#endif
+
+    /* Decompression state */
+    int compression_type;  /* COMPRESSION_NONE or COMPRESSION_ZSTD */
+    /* Note: ZSTD_DCtx is not stored - created/freed during decompression */
+    uint8_t *decompressed_data;
+    size_t decompressed_size;
+
+    /* Header metadata */
+    uint64_t start_time_us;
+    uint64_t sample_interval_us;
+    uint32_t sample_count;
+    uint32_t thread_count;
+    uint64_t string_table_offset;  /* presumably a byte offset from file start - confirm in reader */
+    uint64_t frame_table_offset;   /* presumably a byte offset from file start - confirm in reader */
+
+    /* Parsed string table: array of Python string objects */
+    PyObject **strings;
+    uint32_t strings_count;
+
+    /* Parsed frame table: packed as [filename_idx, funcname_idx, lineno] */
+    uint32_t *frame_data;
+    uint32_t frames_count;
+
+    /* Sample data region */
+    uint8_t *sample_data;
+    size_t sample_data_size;
+
+    /* Per-thread state for stack reconstruction (used during replay) */
+    ReaderThreadState *thread_states;
+    size_t thread_state_count;
+    size_t thread_state_capacity;
+
+    /* Statistics */
+    BinaryReaderStats stats;
+} BinaryReader;
+
+/* ============================================================================
+ * VARINT ENCODING/DECODING (INLINE FOR PERFORMANCE)
+ * ============================================================================ */
+
+/* Encode an unsigned 64-bit value as a LEB128 varint.
+ *
+ * Arguments:
+ *   buf: Destination; needs room for up to 10 bytes (64 bits / 7 per byte)
+ *   value: Value to encode
+ *
+ * Returns:
+ *   Number of bytes stored in buf (1 for values 0-127) */
+static inline size_t
+encode_varint_u64(uint8_t *buf, uint64_t value)
+{
+    uint8_t *out = buf;
+
+    /* Emit seven payload bits per byte, least-significant group first; a set
+     * high bit means more bytes follow.  Values below 0x80 skip the loop and
+     * take only the final store. */
+    while (value > 0x7F) {
+        *out++ = (uint8_t)(0x80 | (value & 0x7F));
+        value >>= 7;
+    }
+    *out++ = (uint8_t)value;
+
+    return (size_t)(out - buf);
+}
+
+/* Encode an unsigned 32-bit value as a LEB128 varint.
+ *
+ * Arguments:
+ *   buf: Destination; needs room for up to 5 bytes (32 bits / 7 per byte)
+ *   value: Value to encode
+ *
+ * Returns:
+ *   Number of bytes stored in buf */
+static inline size_t
+encode_varint_u32(uint8_t *buf, uint32_t value)
+{
+    /* Widening is lossless, so the 64-bit encoder produces the same bytes */
+    const uint64_t widened = value;
+    return encode_varint_u64(buf, widened);
+}
+
+/* Encode a signed 32-bit value as a zigzag varint.
+ *
+ * Zigzag maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... so that values of small
+ * magnitude, positive or negative, encode in few bytes.
+ *
+ * Arguments:
+ *   buf: Destination; needs room for up to 5 bytes
+ *   value: Value to encode
+ *
+ * Returns:
+ *   Number of bytes stored in buf */
+static inline size_t
+encode_varint_i32(uint8_t *buf, int32_t value)
+{
+    /* All-ones for negative input, zero otherwise.  Built from a comparison
+     * rather than `value >> 31`, because right-shifting a negative signed
+     * integer is implementation-defined in C. */
+    uint32_t sign_mask = (value < 0) ? UINT32_MAX : 0;
+    uint32_t zigzag = ((uint32_t)value << 1) ^ sign_mask;
+    return encode_varint_u32(buf, zigzag);
+}
+
+/* Decode unsigned 64-bit varint (LEB128). Updates offset only on success.
+ * On error (overflow or incomplete), offset is NOT updated, allowing callers
+ * to detect errors via (offset == prev_offset) check. Sets PyErr on error.
+ *
+ * Arguments:
+ *   data: Buffer holding the encoded bytes
+ *   offset: In/out read position within data
+ *   max_size: Exclusive upper bound on readable positions in data
+ *
+ * Returns:
+ *   The decoded value, or 0 with ValueError set when the input ends before a
+ *   terminating byte or encodes more than 64 bits of payload */
+static inline uint64_t
+decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size)
+{
+    size_t pos = *offset;
+    uint64_t result = 0;
+    int shift = 0;
+
+    /* Fast path for single-byte varints (0-127) - most common case */
+    if (LIKELY(pos < max_size && (data[pos] & 0x80) == 0)) {
+        *offset = pos + 1;
+        return data[pos];
+    }
+
+    while (pos < max_size) {
+        uint8_t byte = data[pos++];
+        uint64_t payload = byte & 0x7F;
+        /* The 10th byte lands at shift 63 and can contribute a single bit;
+         * higher payload bits would be shifted out silently, so treat them
+         * as overflow instead of returning a truncated value. */
+        if (UNLIKELY(shift == 63 && payload > 1)) {
+            break;
+        }
+        result |= payload << shift;
+        if ((byte & 0x80) == 0) {
+            *offset = pos;
+            return result;
+        }
+        shift += 7;
+        if (UNLIKELY(shift >= 64)) {
+            /* Continuation bit still set after 10 bytes */
+            break;
+        }
+    }
+
+    /* Reached on overflow or when the data ran out mid-varint */
+    PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data");
+    return 0;
+}
+
+/* Decode an unsigned varint that must fit in 32 bits.
+ *
+ * Arguments:
+ *   data: Buffer holding the encoded bytes
+ *   offset: In/out read position; restored to its entry value when the
+ *           decoded number is too large for uint32_t
+ *   max_size: Exclusive upper bound on readable positions in data
+ *
+ * Returns:
+ *   The decoded value, or 0 with an exception set on failure */
+static inline uint32_t
+decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size)
+{
+    const size_t start = *offset;
+    const uint64_t wide = decode_varint_u64(data, offset, max_size);
+
+    if (PyErr_Occurred()) {
+        return 0;
+    }
+    if (LIKELY(wide <= UINT32_MAX)) {
+        return (uint32_t)wide;
+    }
+
+    /* Well-formed varint, but out of range: undo the advance and report it */
+    *offset = start;
+    PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data");
+    return 0;
+}
+
+/* Decode a zigzag-encoded signed 32-bit varint.
+ *
+ * Arguments:
+ *   data: Buffer holding the encoded bytes
+ *   offset: In/out read position within data
+ *   max_size: Exclusive upper bound on readable positions in data
+ *
+ * Returns:
+ *   The decoded value, or 0 with an exception set on failure */
+static inline int32_t
+decode_varint_i32(const uint8_t *data, size_t *offset, size_t max_size)
+{
+    const uint32_t raw = decode_varint_u32(data, offset, max_size);
+    if (PyErr_Occurred()) {
+        return 0;
+    }
+
+    /* Bit 0 carries the sign, the remaining bits the magnitude; XOR with an
+     * all-ones mask flips the magnitude back for negative values. */
+    const uint32_t magnitude = raw >> 1;
+    const uint32_t sign_mask = 0U - (raw & 1U);
+    return (int32_t)(magnitude ^ sign_mask);
+}
+
+/* ============================================================================
+ * SHARED UTILITY FUNCTIONS
+ * ============================================================================ */
+
+/* Generic array growth - doubles *capacity and reallocates ptr to match.
+ * Includes overflow checking for capacity doubling and allocation size.
+ *
+ * Arguments:
+ *   ptr: Current allocation (from the PyMem allocator), or NULL
+ *   capacity: In/out element capacity; updated only on success.  A capacity
+ *             of 0 grows to 1 so the caller always gains room.
+ *   elem_size: Size of one element in bytes
+ *
+ * Returns:
+ *   The new pointer on success.  On failure returns NULL with OverflowError
+ *   or MemoryError set; ptr is still valid and *capacity is unchanged. */
+static inline void *
+grow_array(void *ptr, size_t *capacity, size_t elem_size)
+{
+    size_t old_cap = *capacity;
+
+    /* Check for overflow when doubling capacity */
+    if (old_cap > SIZE_MAX / 2) {
+        PyErr_SetString(PyExc_OverflowError, "Array capacity overflow");
+        return NULL;
+    }
+    /* Doubling zero stays zero, which would report success without adding
+     * any room; start from a single element instead. */
+    size_t new_cap = old_cap ? old_cap * 2 : 1;
+
+    /* Check for overflow when calculating allocation size (the elem_size
+     * guard keeps the division well-defined) */
+    if (elem_size != 0 && new_cap > SIZE_MAX / elem_size) {
+        PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow");
+        return NULL;
+    }
+
+    void *new_ptr = PyMem_Realloc(ptr, new_cap * elem_size);
+    if (new_ptr) {
+        *capacity = new_cap;
+    } else {
+        PyErr_NoMemory();
+    }
+    return new_ptr;
+}
+
+/* Macro wrapper for type safety with grow_array.
+ *
+ * Evaluates to 0 when (count) < (capacity) or when growth succeeds, and to -1
+ * when grow_array() returns NULL (an exception is then already set).
+ *
+ * NOTE(review): the result of grow_array() is assigned to (ptr) before it is
+ * tested, so on failure (ptr) becomes NULL while the old buffer is still
+ * allocated; a caller that must free it needs its own copy of the pointer.
+ * (ptr) and (capacity) are expanded more than once - pass side-effect-free
+ * lvalues. */
+#define GROW_ARRAY(ptr, count, capacity, type) \
+    ((count) < (capacity) ? 0 : \
+     ((ptr) = grow_array((ptr), &(capacity), sizeof(type))) ? 0 : -1)
+
+/* ============================================================================
+ * BINARY WRITER API
+ * ============================================================================ */
+
+/*
+ * Create a new binary writer.
+ *
+ * Arguments:
+ *   filename: Path to output file
+ *   sample_interval_us: Sampling interval in microseconds
+ *   compression_type: COMPRESSION_NONE or COMPRESSION_ZSTD
+ *   start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)
+ *
+ * Returns:
+ *   New BinaryWriter* on success, NULL on failure (PyErr set)
+ */
+BinaryWriter *binary_writer_create(
+    const char *filename,
+    uint64_t sample_interval_us,
+    int compression_type,
+    uint64_t start_time_us
+);
+
+/*
+ * Write a sample to the binary file.
+ *
+ * Arguments:
+ *   writer: Writer from binary_writer_create
+ *   stack_frames: List of InterpreterInfo struct sequences
+ *   timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)
+ *
+ * Returns:
+ *   0 on success, -1 on failure (PyErr set)
+ */
+int binary_writer_write_sample(
+    BinaryWriter *writer,
+    PyObject *stack_frames,
+    uint64_t timestamp_us
+);
+
+/*
+ * Finalize and close the binary file.
+ * Writes string/frame tables, footer, and updates header.
+ *
+ * Arguments:
+ *   writer: Writer to finalize
+ *
+ * Returns:
+ *   0 on success, -1 on failure (PyErr set)
+ */
+int binary_writer_finalize(BinaryWriter *writer);
+
+/*
+ * Destroy a binary writer and free all resources.
+ * Safe to call even if writer is partially initialized.
+ *
+ * Arguments:
+ *   writer: Writer to destroy (may be NULL)
+ */
+void binary_writer_destroy(BinaryWriter *writer);
+
+/* ============================================================================
+ * BINARY READER API
+ * ============================================================================ */
+
+/*
+ * Open a binary file for reading.
+ *
+ * Arguments:
+ *   filename: Path to input file
+ *
+ * Returns:
+ *   New BinaryReader* on success, NULL on failure (PyErr set)
+ */
+BinaryReader *binary_reader_open(const char *filename);
+
+/*
+ * Replay samples from binary file through a collector.
+ *
+ * Arguments:
+ *   reader: Reader from binary_reader_open
+ *   collector: Python collector with collect() method
+ *   progress_callback: Optional callable(current, total) or NULL
+ *
+ * Returns:
+ *   Number of samples replayed on success, -1 on failure (PyErr set)
+ */
+Py_ssize_t binary_reader_replay(
+    BinaryReader *reader,
+    PyObject *collector,
+    PyObject *progress_callback
+);
+
+/*
+ * Get metadata about the binary file.
+ *
+ * Arguments:
+ *   reader: Reader from binary_reader_open
+ *
+ * Returns:
+ *   Dict with file metadata on success, NULL on failure (PyErr set)
+ */
+PyObject *binary_reader_get_info(BinaryReader *reader);
+
+/*
+ * Close a binary reader and free all resources.
+ *
+ * Arguments:
+ *   reader: Reader to close (may be NULL)
+ */
+void binary_reader_close(BinaryReader *reader);
+
+/* ============================================================================
+ * STATISTICS FUNCTIONS
+ * ============================================================================ */
+
+/*
+ * Get writer statistics as a Python dict.
+ *
+ * Arguments:
+ *   writer: Writer to get stats from
+ *
+ * Returns:
+ *   Dict with statistics on success, NULL on failure (PyErr set)
+ */
+PyObject *binary_writer_get_stats(BinaryWriter *writer);
+
+/*
+ * Get reader statistics as a Python dict.
+ *
+ * Arguments:
+ *   reader: Reader to get stats from
+ *
+ * Returns:
+ *   Dict with statistics on success, NULL on failure (PyErr set)
+ */
+PyObject *binary_reader_get_stats(BinaryReader *reader);
+
+/* ============================================================================
+ * UTILITY FUNCTIONS
+ * ============================================================================ */
+
+/*
+ * Check if zstd compression is available.
+ *
+ * Returns:
+ *   1 if zstd available, 0 otherwise
+ */
+int binary_io_zstd_available(void);
+
+/*
+ * Get the best available compression type.
+ *
+ * Returns:
+ *   COMPRESSION_ZSTD if available, COMPRESSION_NONE otherwise
+ */
+int binary_io_get_best_compression(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* Py_BINARY_IO_H */
diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c
new file mode 100644
index 00000000000000..10381e8b62bc00
--- /dev/null
+++ b/Modules/_remote_debugging/binary_io_reader.c
@@ -0,0 +1,1139 @@
+/******************************************************************************
+ * Python Remote Debugging Module - Binary Reader Implementation
+ *
+ * High-performance binary file reader for profiling data with optional zstd
+ * decompression.
+ ******************************************************************************/
+
+#ifndef Py_BUILD_CORE_MODULE
+#  define Py_BUILD_CORE_MODULE
+#endif
+
+#include "binary_io.h"
+#include "_remote_debugging.h"
+#include <string.h>
+
+#ifdef HAVE_ZSTD
+#include <zstd.h>
+#endif
+
+/* ============================================================================
+ * CONSTANTS FOR BINARY FORMAT SIZES
+ * ============================================================================ */
+
+/* File structure sizes */
+#define FILE_HEADER_PLACEHOLDER_SIZE 64  /* Placeholder written at file start */
+#define FILE_HEADER_SIZE 52              /* Actual header content size */
+#define FILE_FOOTER_SIZE 32              /* Footer size */
+#define MIN_DECOMPRESS_BUFFER_SIZE (64 * 1024)  /* Minimum decompression buffer */
+
+/* Progress callback frequency */
+#define PROGRESS_CALLBACK_INTERVAL 1000
+
+/* Maximum decompression size limit (1GB) */
+#define MAX_DECOMPRESS_SIZE (1ULL << 30)
+
+/* ============================================================================
+ * BINARY READER IMPLEMENTATION
+ * ============================================================================ */
+
+/* Decode the fixed-layout file header at the start of `data` into `reader`.
+ *
+ * The magic number and format version are verified first; the remaining
+ * fields are then copied out of their fixed byte offsets.  All reads go
+ * through memcpy so they are safe with respect to alignment and strict
+ * aliasing.
+ *
+ * Returns 0 on success, -1 with ValueError set on failure.
+ */
+static inline int
+reader_parse_header(BinaryReader *reader, const uint8_t *data, size_t file_size)
+{
+    uint32_t file_magic;
+    uint32_t file_version;
+
+    if (file_size < FILE_HEADER_PLACEHOLDER_SIZE) {
+        PyErr_SetString(PyExc_ValueError, "File too small for header");
+        return -1;
+    }
+
+    memcpy(&file_magic, data, sizeof(file_magic));
+    if (file_magic != BINARY_FORMAT_MAGIC) {
+        PyErr_Format(PyExc_ValueError, "Invalid magic number: 0x%08x", file_magic);
+        return -1;
+    }
+
+    memcpy(&file_version, data + 4, sizeof(file_version));
+    if (file_version != BINARY_FORMAT_VERSION) {
+        PyErr_Format(PyExc_ValueError, "Unsupported version: %u", file_version);
+        return -1;
+    }
+
+    /* Timing information */
+    memcpy(&reader->start_time_us, data + 8, sizeof(reader->start_time_us));
+    memcpy(&reader->sample_interval_us, data + 16, sizeof(reader->sample_interval_us));
+
+    /* Counters */
+    memcpy(&reader->sample_count, data + 24, sizeof(reader->sample_count));
+    memcpy(&reader->thread_count, data + 28, sizeof(reader->thread_count));
+
+    /* Table locations and compression */
+    memcpy(&reader->string_table_offset, data + 32, sizeof(reader->string_table_offset));
+    memcpy(&reader->frame_table_offset, data + 40, sizeof(reader->frame_table_offset));
+    memcpy(&reader->compression_type, data + 48, sizeof(reader->compression_type));
+
+    return 0;
+}
+
+/* Read the string and frame counts from the footer, which occupies the last
+ * FILE_FOOTER_SIZE bytes of the file.
+ *
+ * Returns 0 on success, -1 with ValueError set if the file is too small.
+ */
+static inline int
+reader_parse_footer(BinaryReader *reader, const uint8_t *data, size_t file_size)
+{
+    if (file_size >= FILE_FOOTER_SIZE) {
+        const uint8_t *tail = &data[file_size - FILE_FOOTER_SIZE];
+
+        /* memcpy keeps the reads free of alignment/aliasing assumptions */
+        memcpy(&reader->strings_count, tail, sizeof(reader->strings_count));
+        memcpy(&reader->frames_count, tail + 4, sizeof(reader->frames_count));
+        return 0;
+    }
+
+    PyErr_SetString(PyExc_ValueError, "File too small for footer");
+    return -1;
+}
+
+#ifdef HAVE_ZSTD
+/* Decompress the zstd-compressed sample region into a heap buffer.
+ *
+ * The compressed bytes run from the end of the header placeholder up to
+ * reader->string_table_offset.  When the frame header records the content
+ * size (and it is within MAX_DECOMPRESS_SIZE, defined with the other format
+ * constants near the top of this file) the buffer is allocated exactly;
+ * otherwise it starts small and doubles on demand, never exceeding
+ * MAX_DECOMPRESS_SIZE.
+ *
+ * On success reader->decompressed_data owns the buffer and
+ * reader->sample_data / reader->sample_data_size describe its contents.
+ *
+ * Returns 0 on success, -1 with an exception set on failure; any partially
+ * filled buffer is released and reader->decompressed_data is reset to NULL.
+ */
+static inline int
+reader_decompress_samples(BinaryReader *reader, const uint8_t *data)
+{
+    /* Validate the compressed region before deriving its size so the
+     * subtraction below can never wrap around. */
+    if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) {
+        PyErr_SetString(PyExc_ValueError, "Invalid string table offset");
+        return -1;
+    }
+    size_t compressed_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE;
+    const uint8_t *compressed_data = data + FILE_HEADER_PLACEHOLDER_SIZE;
+
+    ZSTD_DCtx *dctx = ZSTD_createDCtx();
+    if (!dctx) {
+        PyErr_SetString(PyExc_MemoryError, "Failed to create zstd decompression context");
+        return -1;
+    }
+
+    ZSTD_inBuffer input = { compressed_data, compressed_size, 0 };
+    size_t alloc_size;
+    size_t total_output = 0;
+    size_t last_result = 0;
+    /* Set when zstd filled the whole output window without finishing the
+     * frame: it may still hold decoded bytes that need another call. */
+    int flush_pending = 0;
+
+    /* Try to get exact decompressed size from frame header for optimal allocation */
+    unsigned long long frame_content_size = ZSTD_getFrameContentSize(compressed_data, compressed_size);
+
+    if (frame_content_size == ZSTD_CONTENTSIZE_ERROR) {
+        /* Corrupted frame header - fail early */
+        PyErr_SetString(PyExc_ValueError, "Corrupted zstd frame header");
+        goto error;
+    }
+    if (frame_content_size != ZSTD_CONTENTSIZE_UNKNOWN &&
+        frame_content_size <= SIZE_MAX &&
+        frame_content_size <= MAX_DECOMPRESS_SIZE) {
+        alloc_size = (size_t)frame_content_size;
+    }
+    else {
+        alloc_size = ZSTD_DStreamOutSize() * 4;
+        if (alloc_size < MIN_DECOMPRESS_BUFFER_SIZE) {
+            alloc_size = MIN_DECOMPRESS_BUFFER_SIZE;
+        }
+    }
+
+    reader->decompressed_data = PyMem_Malloc(alloc_size);
+    if (!reader->decompressed_data) {
+        PyErr_NoMemory();
+        goto error;
+    }
+
+    /* Keep going while there is input left *or* zstd still has buffered
+     * output to hand over.  Stopping as soon as the input is consumed would
+     * drop the tail of a frame whose last block did not fit in the remaining
+     * buffer space and misreport it as truncated. */
+    while (input.pos < input.size || flush_pending) {
+        if (total_output >= alloc_size) {
+            /* Check for overflow before doubling */
+            if (alloc_size > SIZE_MAX / 2 || alloc_size * 2 > MAX_DECOMPRESS_SIZE) {
+                PyErr_SetString(PyExc_MemoryError, "Decompressed data exceeds maximum size");
+                goto error;
+            }
+            /* A frame that declares zero content starts with an empty
+             * buffer; doubling zero would never make room. */
+            size_t new_size = alloc_size ? alloc_size * 2 : MIN_DECOMPRESS_BUFFER_SIZE;
+            uint8_t *new_buf = PyMem_Realloc(reader->decompressed_data, new_size);
+            if (!new_buf) {
+                PyErr_NoMemory();
+                goto error;
+            }
+            reader->decompressed_data = new_buf;
+            alloc_size = new_size;
+        }
+
+        ZSTD_outBuffer output = {
+            reader->decompressed_data + total_output,
+            alloc_size - total_output,
+            0
+        };
+
+        last_result = ZSTD_decompressStream(dctx, &output, &input);
+        if (ZSTD_isError(last_result)) {
+            PyErr_Format(PyExc_ValueError, "zstd decompression error: %s",
+                         ZSTD_getErrorName(last_result));
+            goto error;
+        }
+
+        total_output += output.pos;
+        /* output.size is always > 0 here, so a call that produces nothing
+         * clears the flag and the loop cannot spin on a truncated frame. */
+        flush_pending = (last_result != 0 && output.pos == output.size);
+    }
+
+    /* Verify decompression is complete (last_result == 0 means frame is complete) */
+    if (last_result != 0) {
+        PyErr_SetString(PyExc_ValueError, "Incomplete zstd frame: data may be truncated");
+        goto error;
+    }
+
+    ZSTD_freeDCtx(dctx);
+    reader->decompressed_size = total_output;
+    reader->sample_data = reader->decompressed_data;
+    reader->sample_data_size = reader->decompressed_size;
+
+    return 0;
+
+error:
+    /* PyMem_Free(NULL) is a no-op, so this is safe before the first malloc */
+    PyMem_Free(reader->decompressed_data);
+    reader->decompressed_data = NULL;
+    ZSTD_freeDCtx(dctx);
+    return -1;
+}
+#endif
+
+/* Decode the string table into reader->strings.
+ *
+ * The table starts at reader->string_table_offset and holds
+ * reader->strings_count entries, each a varint byte length followed by that
+ * many bytes of UTF-8 (undecodable bytes use the "replace" error handler).
+ *
+ * Returns 0 on success, -1 with an exception set on failure.  Strings decoded
+ * before a failure stay in reader->strings; binary_reader_open() hands the
+ * reader to binary_reader_close() in that case.
+ */
+static inline int
+reader_parse_string_table(BinaryReader *reader, const uint8_t *data, size_t file_size)
+{
+    /* Every entry occupies at least one byte (its length varint), so a count
+     * larger than the bytes remaining in the file cannot be genuine.
+     * Checking this first keeps a corrupt footer from driving a huge
+     * allocation. */
+    if (reader->string_table_offset > file_size ||
+        reader->strings_count > file_size - (size_t)reader->string_table_offset) {
+        PyErr_SetString(PyExc_ValueError, "String table overflow");
+        return -1;
+    }
+
+    reader->strings = PyMem_Calloc(reader->strings_count, sizeof(PyObject *));
+    if (!reader->strings && reader->strings_count > 0) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    size_t offset = (size_t)reader->string_table_offset;
+    for (uint32_t i = 0; i < reader->strings_count; i++) {
+        size_t prev_offset = offset;
+        uint32_t str_len = decode_varint_u32(data, &offset, file_size);
+        /* The decoder leaves the offset untouched when it fails */
+        if (offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in string table");
+            return -1;
+        }
+        /* Compare against the space left instead of computing
+         * offset + str_len, which can wrap where size_t is 32 bits. */
+        if (offset > file_size || str_len > file_size - offset) {
+            PyErr_SetString(PyExc_ValueError, "String table overflow");
+            return -1;
+        }
+
+        reader->strings[i] = PyUnicode_DecodeUTF8((const char *)&data[offset],
+                                                  (Py_ssize_t)str_len, "replace");
+        if (!reader->strings[i]) {
+            return -1;
+        }
+        offset += str_len;
+    }
+
+    return 0;
+}
+
+/* Decode the frame table into reader->frame_data.
+ *
+ * The table starts at reader->frame_table_offset and holds
+ * reader->frames_count entries of three varints each: filename string index,
+ * function-name string index and (signed) line number.  They are stored as
+ * three consecutive uint32_t slots per frame.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static inline int
+reader_parse_frame_table(BinaryReader *reader, const uint8_t *data, size_t file_size)
+{
+    /* Each frame needs at least three bytes (one per varint), so a count the
+     * remaining file cannot possibly hold is rejected before allocating
+     * 12 bytes per claimed frame. */
+    if (reader->frame_table_offset > file_size ||
+        reader->frames_count > (file_size - (size_t)reader->frame_table_offset) / 3) {
+        PyErr_SetString(PyExc_ValueError, "Frame count exceeds available data");
+        return -1;
+    }
+
+    /* Check for integer overflow in allocation size calculation.
+       Only needed on 32-bit where SIZE_MAX can be exceeded by uint32_t * 12. */
+#if SIZEOF_SIZE_T < 8
+    if (reader->frames_count > SIZE_MAX / (3 * sizeof(uint32_t))) {
+        PyErr_SetString(PyExc_OverflowError, "Frame count too large for allocation");
+        return -1;
+    }
+#endif
+
+    size_t alloc_size = (size_t)reader->frames_count * 3 * sizeof(uint32_t);
+    reader->frame_data = PyMem_Malloc(alloc_size);
+    if (!reader->frame_data && reader->frames_count > 0) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    size_t offset = (size_t)reader->frame_table_offset;
+    for (uint32_t i = 0; i < reader->frames_count; i++) {
+        size_t base = (size_t)i * 3;
+        size_t prev_offset;
+
+        /* The decoder leaves the offset untouched when it fails, which is
+         * how each of the three fields detects a malformed varint. */
+        prev_offset = offset;
+        reader->frame_data[base] = decode_varint_u32(data, &offset, file_size);
+        if (offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (filename)");
+            return -1;
+        }
+
+        prev_offset = offset;
+        reader->frame_data[base + 1] = decode_varint_u32(data, &offset, file_size);
+        if (offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (funcname)");
+            return -1;
+        }
+
+        /* Line numbers are signed; the bit pattern is kept in a uint32_t
+         * slot and cast back when frames are built. */
+        prev_offset = offset;
+        reader->frame_data[base + 2] = (uint32_t)decode_varint_i32(data, &offset, file_size);
+        if (offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (lineno)");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Open a binary profile file and prepare it for replay.
+ *
+ * The file is memory-mapped where USE_MMAP is enabled and read fully into a
+ * heap buffer otherwise.  The header and footer are parsed, the table offsets
+ * are validated against the file size, the sample region is decompressed if
+ * the header says it is zstd-compressed, and finally the string and frame
+ * tables are decoded.
+ *
+ * Arguments:
+ *   filename: Path of the file to open
+ *
+ * Returns:
+ *   A new reader on success (release with binary_reader_close), or NULL with
+ *   an exception set on failure.  On failure everything acquired so far is
+ *   released through binary_reader_close.
+ */
+BinaryReader *
+binary_reader_open(const char *filename)
+{
+    BinaryReader *reader = PyMem_Calloc(1, sizeof(BinaryReader));
+    if (!reader) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+#if USE_MMAP
+    reader->fd = -1;  /* Explicit initialization for cleanup safety */
+#endif
+
+    reader->filename = PyMem_Malloc(strlen(filename) + 1);
+    if (!reader->filename) {
+        PyMem_Free(reader);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    strcpy(reader->filename, filename);
+
+#if USE_MMAP
+    /* Open with mmap on Unix */
+    reader->fd = open(filename, O_RDONLY);
+    if (reader->fd < 0) {
+        PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
+        goto error;
+    }
+
+    struct stat st;
+    if (fstat(reader->fd, &st) < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        goto error;
+    }
+
+    /* mmap() rejects a zero-length mapping with EINVAL, so an empty file
+     * would surface as an opaque OSError.  Reject files that cannot even
+     * hold the header up front, with the same error the stdio path reports
+     * through reader_parse_header(). */
+    if (st.st_size < FILE_HEADER_PLACEHOLDER_SIZE) {
+        PyErr_SetString(PyExc_ValueError, "File too small for header");
+        goto error;
+    }
+    reader->mapped_size = st.st_size;
+
+    /* Map the file into memory.
+     * MAP_POPULATE (Linux-only) pre-faults all pages at mmap time, which:
+     * - Catches issues (e.g., file truncation) immediately rather than as SIGBUS during reads
+     * - Eliminates page faults during subsequent reads for better performance
+     */
+#ifdef __linux__
+    reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ,
+                               MAP_PRIVATE | MAP_POPULATE, reader->fd, 0);
+#else
+    reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ,
+                               MAP_PRIVATE, reader->fd, 0);
+#endif
+    if (reader->mapped_data == MAP_FAILED) {
+        /* Reset so cleanup does not try to unmap MAP_FAILED */
+        reader->mapped_data = NULL;
+        PyErr_SetFromErrno(PyExc_IOError);
+        goto error;
+    }
+
+    /* Hint sequential access pattern - failures are non-fatal */
+    (void)madvise(reader->mapped_data, reader->mapped_size, MADV_SEQUENTIAL);
+
+    /* Pre-fetch pages into memory - failures are non-fatal.
+     * Complements MAP_POPULATE on Linux, provides benefit on macOS. */
+    (void)madvise(reader->mapped_data, reader->mapped_size, MADV_WILLNEED);
+
+    /* Use transparent huge pages for large files to reduce TLB misses.
+     * Only beneficial for files >= 32MB where TLB pressure matters. */
+#ifdef MADV_HUGEPAGE
+    if (reader->mapped_size >= (32 * 1024 * 1024)) {
+        (void)madvise(reader->mapped_data, reader->mapped_size, MADV_HUGEPAGE);
+    }
+#endif
+
+    /* Add file descriptor-level hints for better kernel I/O scheduling */
+#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL)
+    (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+    if (reader->mapped_size > (64 * 1024 * 1024)) {
+        (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_WILLNEED);
+    }
+#endif
+
+    uint8_t *data = reader->mapped_data;
+    size_t file_size = reader->mapped_size;
+#else
+    /* Use stdio on Windows */
+    reader->fp = fopen(filename, "rb");
+    if (!reader->fp) {
+        PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
+        goto error;
+    }
+
+    /* Determine the file size by seeking to the end, then rewind */
+    if (FSEEK64(reader->fp, 0, SEEK_END) != 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        goto error;
+    }
+    file_offset_t file_size_off = FTELL64(reader->fp);
+    if (file_size_off < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        goto error;
+    }
+    reader->file_size = (size_t)file_size_off;
+    if (FSEEK64(reader->fp, 0, SEEK_SET) != 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        goto error;
+    }
+
+    reader->file_data = PyMem_Malloc(reader->file_size);
+    if (!reader->file_data) {
+        PyErr_NoMemory();
+        goto error;
+    }
+
+    if (fread(reader->file_data, 1, reader->file_size, reader->fp) != reader->file_size) {
+        /* A short read is not necessarily an I/O error: errno is only
+         * meaningful when the stream's error indicator is set. */
+        if (ferror(reader->fp)) {
+            PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
+        }
+        else {
+            PyErr_SetString(PyExc_IOError,
+                "Unexpected end of file while reading binary profile");
+        }
+        goto error;
+    }
+
+    uint8_t *data = reader->file_data;
+    size_t file_size = reader->file_size;
+#endif
+
+    /* Parse header and footer */
+    if (reader_parse_header(reader, data, file_size) < 0) {
+        goto error;
+    }
+    if (reader_parse_footer(reader, data, file_size) < 0) {
+        goto error;
+    }
+
+    /* Validate table offsets are within file bounds */
+    if (reader->string_table_offset > file_size) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid string table offset: %llu exceeds file size %zu",
+            (unsigned long long)reader->string_table_offset, file_size);
+        goto error;
+    }
+    if (reader->frame_table_offset > file_size) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid frame table offset: %llu exceeds file size %zu",
+            (unsigned long long)reader->frame_table_offset, file_size);
+        goto error;
+    }
+    if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid string table offset: %llu is before data section",
+            (unsigned long long)reader->string_table_offset);
+        goto error;
+    }
+    if (reader->frame_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid frame table offset: %llu is before data section",
+            (unsigned long long)reader->frame_table_offset);
+        goto error;
+    }
+    if (reader->string_table_offset > reader->frame_table_offset) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid table offsets: string table (%llu) is after frame table (%llu)",
+            (unsigned long long)reader->string_table_offset,
+            (unsigned long long)reader->frame_table_offset);
+        goto error;
+    }
+
+    /* Handle compressed data */
+    if (reader->compression_type == COMPRESSION_ZSTD) {
+#ifdef HAVE_ZSTD
+        if (reader_decompress_samples(reader, data) < 0) {
+            goto error;
+        }
+#else
+        PyErr_SetString(PyExc_RuntimeError,
+            "File uses zstd compression but zstd support not compiled in");
+        goto error;
+#endif
+    } else {
+        /* Any other compression type is read as raw sample data located
+         * between the header placeholder and the string table. */
+        reader->sample_data = data + FILE_HEADER_PLACEHOLDER_SIZE;
+        reader->sample_data_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE;
+    }
+
+    /* Parse string and frame tables */
+    if (reader_parse_string_table(reader, data, file_size) < 0) {
+        goto error;
+    }
+    if (reader_parse_frame_table(reader, data, file_size) < 0) {
+        goto error;
+    }
+
+    return reader;
+
+error:
+    binary_reader_close(reader);
+    return NULL;
+}
+
+/* Get or create reader thread state for stack reconstruction.
+ *
+ * States are keyed by (thread_id, interpreter_id) and looked up with a linear
+ * scan of reader->thread_states.  A new state starts with an empty stack of
+ * capacity MAX_STACK_DEPTH and prev_timestamp set to the file's start time.
+ *
+ * Returns a pointer into reader->thread_states (invalidated the next time the
+ * array grows), or NULL with an exception set on allocation failure.  A
+ * failed call leaves the existing states untouched.
+ */
+static ReaderThreadState *
+reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id,
+                                   uint32_t interpreter_id)
+{
+    /* Search existing threads (key is thread_id + interpreter_id) */
+    for (size_t i = 0; i < reader->thread_state_count; i++) {
+        if (reader->thread_states[i].thread_id == thread_id &&
+            reader->thread_states[i].interpreter_id == interpreter_id) {
+            return &reader->thread_states[i];
+        }
+    }
+
+    if (!reader->thread_states) {
+        reader->thread_state_capacity = 16;
+        reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState));
+        if (!reader->thread_states) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+    } else if (reader->thread_state_count >= reader->thread_state_capacity) {
+        /* Grow through a temporary so a failure does not overwrite the only
+         * pointer to the existing states (and their stack buffers).
+         * NOTE(review): assumes grow_array() has realloc semantics, i.e. the
+         * original block stays valid when it returns NULL -- confirm. */
+        ReaderThreadState *grown = grow_array(reader->thread_states,
+                                              &reader->thread_state_capacity,
+                                              sizeof(ReaderThreadState));
+        if (!grown) {
+            return NULL;
+        }
+        reader->thread_states = grown;
+    }
+
+    /* Allocate the stack buffer before publishing the new entry: if this
+     * fails, no half-initialized state (NULL stack but non-zero capacity)
+     * is left behind for a later lookup to find. */
+    uint32_t *stack = PyMem_Malloc(MAX_STACK_DEPTH * sizeof(uint32_t));
+    if (!stack) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ReaderThreadState *ts = &reader->thread_states[reader->thread_state_count];
+    memset(ts, 0, sizeof(ReaderThreadState));
+    ts->thread_id = thread_id;
+    ts->interpreter_id = interpreter_id;
+    ts->prev_timestamp = reader->start_time_us;
+    ts->current_stack_capacity = MAX_STACK_DEPTH;
+    ts->current_stack = stack;
+    reader->thread_state_count++;
+    return ts;
+}
+
+/* ============================================================================
+ * STACK DECODING HELPERS
+ * ============================================================================ */
+
+/* Decode a full stack from sample data.
+ *
+ * Layout: [depth: varint] followed by `depth` frame indices (varints).
+ * Updates ts->current_stack and ts->current_stack_depth and advances *offset
+ * past the consumed bytes.
+ *
+ * Returns 0 on success, -1 with ValueError set if the depth exceeds the
+ * stack capacity or a varint cannot be decoded (malformed or truncated
+ * data). */
+static inline int
+decode_stack_full(ReaderThreadState *ts, const uint8_t *data,
+                  size_t *offset, size_t max_size)
+{
+    /* The varint decoder leaves *offset untouched when it fails; that is the
+     * signal used throughout this file to detect bad input. */
+    size_t prev_offset = *offset;
+    uint32_t depth = decode_varint_u32(data, offset, max_size);
+    if (*offset == prev_offset) {
+        PyErr_SetString(PyExc_ValueError, "Malformed varint for stack depth");
+        return -1;
+    }
+
+    /* Validate depth against capacity to prevent buffer overflow */
+    if (depth > ts->current_stack_capacity) {
+        PyErr_Format(PyExc_ValueError,
+            "Stack depth %u exceeds capacity %zu", depth, ts->current_stack_capacity);
+        return -1;
+    }
+
+    ts->current_stack_depth = depth;
+    for (uint32_t i = 0; i < depth; i++) {
+        prev_offset = *offset;
+        ts->current_stack[i] = decode_varint_u32(data, offset, max_size);
+        if (*offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in stack frame data");
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Decode a suffix-encoded stack from sample data.
+ *
+ * Layout: [shared: varint] [new_count: varint] followed by `new_count` frame
+ * indices.  The new stack consists of the `new_count` new frames at indices
+ * 0..new_count-1 followed by the last `shared` frames of the previous stack.
+ *
+ * Returns 0 on success, -1 with ValueError set on a bounds violation or a
+ * varint that cannot be decoded (malformed or truncated data). */
+static inline int
+decode_stack_suffix(ReaderThreadState *ts, const uint8_t *data,
+                    size_t *offset, size_t max_size)
+{
+    /* The varint decoder leaves *offset untouched when it fails */
+    size_t prev_offset = *offset;
+    uint32_t shared = decode_varint_u32(data, offset, max_size);
+    if (*offset == prev_offset) {
+        PyErr_SetString(PyExc_ValueError, "Malformed varint for shared frame count");
+        return -1;
+    }
+
+    prev_offset = *offset;
+    uint32_t new_count = decode_varint_u32(data, offset, max_size);
+    if (*offset == prev_offset) {
+        PyErr_SetString(PyExc_ValueError, "Malformed varint for new frame count");
+        return -1;
+    }
+
+    /* Validate shared doesn't exceed current stack depth */
+    if (shared > ts->current_stack_depth) {
+        PyErr_Format(PyExc_ValueError,
+            "Shared count %u exceeds current stack depth %zu",
+            shared, ts->current_stack_depth);
+        return -1;
+    }
+
+    /* Validate final depth doesn't exceed capacity */
+    size_t final_depth = (size_t)shared + new_count;
+    if (final_depth > ts->current_stack_capacity) {
+        PyErr_Format(PyExc_ValueError,
+            "Final stack depth %zu exceeds capacity %zu",
+            final_depth, ts->current_stack_capacity);
+        return -1;
+    }
+
+    /* Relocate the shared frames (the bottom of the previous stack) so they
+     * sit directly after the new frames.  This must also happen when
+     * new_count == 0: the shared frames then have to slide down to index 0,
+     * otherwise the stale top-of-stack entries would be kept instead.
+     * memmove is a no-op when source and destination coincide. */
+    if (shared > 0) {
+        size_t prev_shared_start = ts->current_stack_depth - shared;
+        memmove(&ts->current_stack[new_count],
+                &ts->current_stack[prev_shared_start],
+                shared * sizeof(uint32_t));
+    }
+
+    for (uint32_t i = 0; i < new_count; i++) {
+        prev_offset = *offset;
+        ts->current_stack[i] = decode_varint_u32(data, offset, max_size);
+        if (*offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in stack frame data");
+            return -1;
+        }
+    }
+    ts->current_stack_depth = final_depth;
+    return 0;
+}
+
+/* Decode a pop-push encoded stack from sample data.
+ *
+ * Layout: [pop: varint] [push: varint] followed by `push` frame indices.
+ * The first `pop` entries (the top of the stack) are dropped, the remaining
+ * entries are kept, and the `push` new frames are placed in front of them.
+ * A `pop` larger than the current depth simply empties the stack.
+ *
+ * Returns 0 on success, -1 with ValueError set on a bounds violation or a
+ * varint that cannot be decoded (malformed or truncated data). */
+static inline int
+decode_stack_pop_push(ReaderThreadState *ts, const uint8_t *data,
+                      size_t *offset, size_t max_size)
+{
+    /* The varint decoder leaves *offset untouched when it fails */
+    size_t prev_offset = *offset;
+    uint32_t pop = decode_varint_u32(data, offset, max_size);
+    if (*offset == prev_offset) {
+        PyErr_SetString(PyExc_ValueError, "Malformed varint for pop count");
+        return -1;
+    }
+
+    prev_offset = *offset;
+    uint32_t push = decode_varint_u32(data, offset, max_size);
+    if (*offset == prev_offset) {
+        PyErr_SetString(PyExc_ValueError, "Malformed varint for push count");
+        return -1;
+    }
+
+    size_t keep = (ts->current_stack_depth > pop) ? ts->current_stack_depth - pop : 0;
+
+    /* Validate final depth doesn't exceed capacity */
+    size_t final_depth = keep + push;
+    if (final_depth > ts->current_stack_capacity) {
+        PyErr_Format(PyExc_ValueError,
+            "Final stack depth %zu exceeds capacity %zu",
+            final_depth, ts->current_stack_capacity);
+        return -1;
+    }
+
+    /* Move kept frames (from bottom of stack) to make room for new frames at the top.
+     * Even when push == 0, we need to move kept frames to index 0 if pop > 0. */
+    if (keep > 0) {
+        memmove(&ts->current_stack[push],
+                &ts->current_stack[pop],
+                keep * sizeof(uint32_t));
+    }
+
+    for (uint32_t i = 0; i < push; i++) {
+        prev_offset = *offset;
+        ts->current_stack[i] = decode_varint_u32(data, offset, max_size);
+        if (*offset == prev_offset) {
+            PyErr_SetString(PyExc_ValueError, "Malformed varint in stack frame data");
+            return -1;
+        }
+    }
+    ts->current_stack_depth = final_depth;
+    return 0;
+}
+
+/* Build a Python list of FrameInfo objects from frame indices.
+ *
+ * Each index selects a (filename, funcname, lineno) triple in
+ * reader->frame_data; the two string indices are resolved through
+ * reader->strings.  A FrameInfo's location is a (lineno, lineno, 0, 0) tuple
+ * when lineno > 0 and None otherwise; its fourth field is always None.
+ *
+ * Returns a new list on success, NULL with an exception set if an index is
+ * out of range or an allocation fails. */
+static PyObject *
+build_frame_list(RemoteDebuggingState *state, BinaryReader *reader,
+                 const uint32_t *frame_indices, size_t stack_depth)
+{
+    PyObject *frame_list = PyList_New(stack_depth);
+    if (!frame_list) {
+        return NULL;
+    }
+
+    for (size_t k = 0; k < stack_depth; k++) {
+        uint32_t frame_idx = frame_indices[k];
+        if (frame_idx >= reader->frames_count) {
+            PyErr_Format(PyExc_ValueError, "Invalid frame index: %u", frame_idx);
+            goto error;
+        }
+
+        /* Widen before multiplying: frame_idx * 3 evaluated in 32 bits wraps
+         * for large indices and would address the wrong triple. */
+        size_t base = (size_t)frame_idx * 3;
+        uint32_t filename_idx = reader->frame_data[base];
+        uint32_t funcname_idx = reader->frame_data[base + 1];
+        int32_t lineno = (int32_t)reader->frame_data[base + 2];
+
+        if (filename_idx >= reader->strings_count ||
+            funcname_idx >= reader->strings_count) {
+            PyErr_SetString(PyExc_ValueError, "Invalid string index in frame");
+            goto error;
+        }
+
+        PyObject *frame_info = PyStructSequence_New(state->FrameInfo_Type);
+        if (!frame_info) {
+            goto error;
+        }
+
+        PyObject *location;
+        if (lineno > 0) {
+            location = Py_BuildValue("(iiii)", lineno, lineno, 0, 0);
+            if (!location) {
+                Py_DECREF(frame_info);
+                goto error;
+            }
+        }
+        else {
+            location = Py_NewRef(Py_None);
+        }
+
+        /* PyStructSequence_SetItem and PyList_SET_ITEM steal references */
+        PyStructSequence_SetItem(frame_info, 0, Py_NewRef(reader->strings[filename_idx]));
+        PyStructSequence_SetItem(frame_info, 1, location);
+        PyStructSequence_SetItem(frame_info, 2, Py_NewRef(reader->strings[funcname_idx]));
+        PyStructSequence_SetItem(frame_info, 3, Py_NewRef(Py_None));
+        PyList_SET_ITEM(frame_list, k, frame_info);
+    }
+
+    return frame_list;
+
+error:
+    /* Slots not yet filled are NULL, which list deallocation tolerates */
+    Py_DECREF(frame_list);
+    return NULL;
+}
+
+/* Wrap one thread's stack in the nested structure passed to collect():
+ * a one-element list holding an InterpreterInfo, whose thread list holds a
+ * single ThreadInfo(thread_id, status, frames).
+ *
+ * Returns a new list on success, NULL with an exception set on failure. */
+static PyObject *
+build_sample_list(RemoteDebuggingState *state, BinaryReader *reader,
+                  uint64_t thread_id, uint32_t interpreter_id, uint8_t status,
+                  const uint32_t *frame_indices, size_t stack_depth)
+{
+    PyObject *frames = NULL;
+    PyObject *thread = NULL;
+    PyObject *threads = NULL;
+    PyObject *interp = NULL;
+    PyObject *samples = NULL;
+
+    /* --- ThreadInfo(thread_id, status, frames) --- */
+    frames = build_frame_list(state, reader, frame_indices, stack_depth);
+    if (frames == NULL) {
+        goto fail;
+    }
+    thread = PyStructSequence_New(state->ThreadInfo_Type);
+    if (thread == NULL) {
+        goto fail;
+    }
+    PyObject *tid_obj = PyLong_FromUnsignedLongLong(thread_id);
+    if (tid_obj == NULL) {
+        goto fail;
+    }
+    PyObject *status_obj = PyLong_FromLong(status);
+    if (status_obj == NULL) {
+        Py_DECREF(tid_obj);
+        goto fail;
+    }
+    /* SetItem steals each reference; clear our handle once it is given away */
+    PyStructSequence_SetItem(thread, 0, tid_obj);
+    PyStructSequence_SetItem(thread, 1, status_obj);
+    PyStructSequence_SetItem(thread, 2, frames);
+    frames = NULL;
+
+    /* --- [ThreadInfo] --- */
+    threads = PyList_New(1);
+    if (threads == NULL) {
+        goto fail;
+    }
+    PyList_SET_ITEM(threads, 0, thread);
+    thread = NULL;
+
+    /* --- InterpreterInfo(interpreter_id, threads) --- */
+    interp = PyStructSequence_New(state->InterpreterInfo_Type);
+    if (interp == NULL) {
+        goto fail;
+    }
+    PyObject *iid_obj = PyLong_FromUnsignedLong(interpreter_id);
+    if (iid_obj == NULL) {
+        goto fail;
+    }
+    PyStructSequence_SetItem(interp, 0, iid_obj);
+    PyStructSequence_SetItem(interp, 1, threads);
+    threads = NULL;
+
+    /* --- [InterpreterInfo] --- */
+    samples = PyList_New(1);
+    if (samples == NULL) {
+        goto fail;
+    }
+    PyList_SET_ITEM(samples, 0, interp);
+    return samples;
+
+fail:
+    /* Only handles still owned here are non-NULL at this point */
+    Py_XDECREF(samples);
+    Py_XDECREF(interp);
+    Py_XDECREF(threads);
+    Py_XDECREF(thread);
+    Py_XDECREF(frames);
+    return NULL;
+}
+
+/* Build the sample structure for one thread and pass it, together with the
+ * borrowed timestamps_list, to collector.collect().
+ *
+ * Returns 0 on success, -1 with an exception set if building the sample or
+ * the collect() call fails. */
+static int
+emit_sample(RemoteDebuggingState *state, PyObject *collector,
+            uint64_t thread_id, uint32_t interpreter_id, uint8_t status,
+            const uint32_t *frame_indices, size_t stack_depth,
+            BinaryReader *reader, PyObject *timestamps_list)
+{
+    int rc = -1;
+    PyObject *samples = build_sample_list(state, reader, thread_id,
+                                          interpreter_id, status,
+                                          frame_indices, stack_depth);
+
+    if (samples != NULL) {
+        PyObject *ret = PyObject_CallMethod(collector, "collect", "OO",
+                                            samples, timestamps_list);
+        Py_DECREF(samples);
+        if (ret != NULL) {
+            /* The return value of collect() is not used */
+            Py_DECREF(ret);
+            rc = 0;
+        }
+    }
+    return rc;
+}
+
+/* Shrink timestamps_list to its first actual_size entries, then emit the
+ * batch through emit_sample().
+ *
+ * Returns 0 on success, -1 with an exception set on failure. */
+static int
+emit_batch(RemoteDebuggingState *state, PyObject *collector,
+           uint64_t thread_id, uint32_t interpreter_id, uint8_t status,
+           const uint32_t *frame_indices, size_t stack_depth,
+           BinaryReader *reader, PyObject *timestamps_list, Py_ssize_t actual_size)
+{
+    /* Deleting the slice [actual_size:] drops the unused tail slots */
+    Py_ssize_t allocated = PyList_GET_SIZE(timestamps_list);
+    int trimmed = PyList_SetSlice(timestamps_list, actual_size, allocated, NULL);
+    if (trimmed < 0) {
+        return -1;
+    }
+
+    return emit_sample(state, collector, thread_id, interpreter_id, status,
+                       frame_indices, stack_depth, reader, timestamps_list);
+}
+
+/* Call callback(current, total) if a callback was supplied (neither NULL nor
+ * None).  Progress reporting is best-effort: any exception raised by the
+ * callback is cleared. */
+static inline void
+invoke_progress_callback(PyObject *callback, Py_ssize_t current, uint32_t total)
+{
+    if (callback == NULL || callback == Py_None) {
+        return;
+    }
+
+    PyObject *ret = PyObject_CallFunction(callback, "nI", current, total);
+    if (ret == NULL) {
+        PyErr_Clear();
+        return;
+    }
+    Py_DECREF(ret);
+}
+
+Py_ssize_t
+binary_reader_replay(BinaryReader *reader, PyObject *collector, PyObject *progress_callback)
+{
+    if (!PyObject_HasAttrString(collector, "collect")) {
+        PyErr_SetString(PyExc_TypeError, "Collector must have a collect() method");
+        return -1;
+    }
+
+    /* Get module state for struct sequence types */
+    PyObject *module = PyImport_ImportModule("_remote_debugging");
+    if (!module) {
+        return -1;
+    }
+    RemoteDebuggingState *state = RemoteDebugging_GetState(module);
+    Py_DECREF(module);
+
+    if (!state) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to get module state");
+        return -1;
+    }
+
+    size_t offset = 0;
+    Py_ssize_t replayed = 0;
+
+    /* Initial progress callback at 0% */
+    invoke_progress_callback(progress_callback, 0, reader->sample_count);
+
+    while (offset < reader->sample_data_size) {
+        /* Read thread_id (8 bytes) + interpreter_id (4 bytes) */
+        if (offset + 13 > reader->sample_data_size) {
+            break;  /* End of data */
+        }
+
+        /* Use memcpy to avoid strict aliasing violations */
+        uint64_t thread_id;
+        uint32_t interpreter_id;
+        memcpy(&thread_id, &reader->sample_data[offset], sizeof(thread_id));
+        offset += 8;
+
+        memcpy(&interpreter_id, &reader->sample_data[offset], sizeof(interpreter_id));
+        offset += 4;
+
+        /* Get or create thread state for reconstruction */
+        ReaderThreadState *ts = reader_get_or_create_thread_state(reader, thread_id, interpreter_id);
+        if (!ts) {
+            return -1;
+        }
+
+        /* Read encoding byte */
+        uint8_t encoding = reader->sample_data[offset++];
+
+        switch (encoding) {
+        case STACK_REPEAT: {
+            /* RLE repeat: [count: varint] [delta: varint, status: 1]... */
+            size_t prev_offset = offset;
+            uint32_t count = decode_varint_u32(reader->sample_data, &offset, reader->sample_data_size);
+            /* Detect varint decode failure */
+            if (offset == prev_offset) {
+                PyErr_SetString(PyExc_ValueError, "Malformed varint for RLE count");
+                return -1;
+            }
+
+            /* Validate RLE count to prevent DoS from malicious files.
+             * Each RLE sample needs at least 2 bytes (1 byte min varint + 1 status byte).
+             * Also reject absurdly large counts that would exhaust memory. */
+            size_t remaining_data = reader->sample_data_size - offset;
+            size_t max_possible_samples = remaining_data / 2;
+            if (count > max_possible_samples) {
+                PyErr_Format(PyExc_ValueError,
+                    "Invalid RLE count %u exceeds maximum possible %zu for remaining data",
+                    count, max_possible_samples);
+                return -1;
+            }
+
+            reader->stats.repeat_records++;
+            reader->stats.repeat_samples += count;
+
+            /* Process RLE samples, batching by status */
+            PyObject *timestamps_list = NULL;
+            uint8_t batch_status = 0;
+            Py_ssize_t batch_idx = 0;
+
+            for (uint32_t i = 0; i < count; i++) {
+                size_t delta_prev_offset = offset;
+                uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size);
+                if (offset == delta_prev_offset) {
+                    Py_XDECREF(timestamps_list);
+                    PyErr_SetString(PyExc_ValueError, "Malformed varint in RLE sample data");
+                    return -1;
+                }
+                if (offset >= reader->sample_data_size) {
+                    Py_XDECREF(timestamps_list);
+                    PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data in RLE");
+                    return -1;
+                }
+                uint8_t status = reader->sample_data[offset++];
+                ts->prev_timestamp += delta;
+
+                /* Start new batch on first sample or status change */
+                if (i == 0 || status != batch_status) {
+                    if (timestamps_list) {
+                        int rc = emit_batch(state, collector, thread_id, interpreter_id,
+                                            batch_status, ts->current_stack, ts->current_stack_depth,
+                                            reader, timestamps_list, batch_idx);
+                        Py_DECREF(timestamps_list);
+                        if (rc < 0) {
+                            return -1;
+                        }
+                    }
+                    timestamps_list = PyList_New(count - i);
+                    if (!timestamps_list) {
+                        return -1;
+                    }
+                    batch_status = status;
+                    batch_idx = 0;
+                }
+
+                PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp);
+                if (!ts_obj) {
+                    Py_DECREF(timestamps_list);
+                    return -1;
+                }
+                PyList_SET_ITEM(timestamps_list, batch_idx++, ts_obj);
+            }
+
+            /* Emit final batch */
+            if (timestamps_list) {
+                int rc = emit_batch(state, collector, thread_id, interpreter_id,
+                                    batch_status, ts->current_stack, ts->current_stack_depth,
+                                    reader, timestamps_list, batch_idx);
+                Py_DECREF(timestamps_list);
+                if (rc < 0) {
+                    return -1;
+                }
+            }
+
+            replayed += count;
+            reader->stats.total_samples += count;
+
+            /* Progress callback after batch */
+            if (replayed % PROGRESS_CALLBACK_INTERVAL < count) {
+                invoke_progress_callback(progress_callback, replayed, reader->sample_count);
+            }
+            break;
+        }
+
+        case STACK_FULL:
+        case STACK_SUFFIX:
+        case STACK_POP_PUSH: {
+            /* All three encodings share: [delta: varint] [status: 1] ... */
+            size_t prev_offset = offset;
+            uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size);
+            /* Detect varint decode failure: offset unchanged means error */
+            if (offset == prev_offset) {
+                PyErr_SetString(PyExc_ValueError, "Malformed varint in sample data");
+                return -1;
+            }
+            if (offset >= reader->sample_data_size) {
+                PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data");
+                return -1;
+            }
+            uint8_t status = reader->sample_data[offset++];
+            ts->prev_timestamp += delta;
+
+            if (encoding == STACK_FULL) {
+                if (decode_stack_full(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) {
+                    return -1;
+                }
+                reader->stats.full_records++;
+            } else if (encoding == STACK_SUFFIX) {
+                if (decode_stack_suffix(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) {
+                    return -1;
+                }
+                reader->stats.suffix_records++;
+            } else { /* STACK_POP_PUSH */
+                if (decode_stack_pop_push(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) {
+                    return -1;
+                }
+                reader->stats.pop_push_records++;
+            }
+            reader->stats.stack_reconstructions++;
+
+            /* Build single-element timestamp list */
+            PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp);
+            if (!ts_obj) {
+                return -1;
+            }
+            PyObject *timestamps_list = PyList_New(1);
+            if (!timestamps_list) {
+                Py_DECREF(ts_obj);
+                return -1;
+            }
+            PyList_SET_ITEM(timestamps_list, 0, ts_obj);
+
+            if (emit_sample(state, collector, thread_id, interpreter_id, status,
+                           ts->current_stack, ts->current_stack_depth, reader,
+                           timestamps_list) < 0) {
+                Py_DECREF(timestamps_list);
+                return -1;
+            }
+            Py_DECREF(timestamps_list);
+            replayed++;
+            reader->stats.total_samples++;
+            break;
+        }
+
+        default:
+            PyErr_Format(PyExc_ValueError, "Unknown stack encoding: %u", encoding);
+            return -1;
+        }
+
+        /* Progress callback */
+        if (replayed % PROGRESS_CALLBACK_INTERVAL == 0) {
+            invoke_progress_callback(progress_callback, replayed, reader->sample_count);
+        }
+    }
+
+    /* Final progress callback at 100% */
+    invoke_progress_callback(progress_callback, replayed, reader->sample_count);
+
+    return replayed;
+}
+
+/* Build a dict describing the header metadata of an opened binary profile.
+ *
+ * Keys: "version", "start_time_us", "sample_interval_us", "sample_count",
+ * "thread_count", "string_count", "frame_count" and "compression_type".
+ *
+ * Note that "version" is the compile-time BINARY_FORMAT_VERSION constant,
+ * not a value taken from `reader`.
+ *
+ * Returns whatever Py_BuildValue() returns: a new reference on success, or
+ * NULL with an exception set on failure. */
+PyObject *
+binary_reader_get_info(BinaryReader *reader)
+{
+    /* The format units must stay paired with the C types of the arguments:
+     * I = unsigned int, K = unsigned long long, i = int. */
+    return Py_BuildValue(
+        "{s:I, s:K, s:K, s:I, s:I, s:I, s:I, s:i}",
+        "version", BINARY_FORMAT_VERSION,
+        "start_time_us", reader->start_time_us,
+        "sample_interval_us", reader->sample_interval_us,
+        "sample_count", reader->sample_count,
+        "thread_count", reader->thread_count,
+        "string_count", reader->strings_count,
+        "frame_count", reader->frames_count,
+        "compression_type", reader->compression_type
+    );
+}
+
+/* Snapshot the writer's encoding statistics as a Python dict.
+ *
+ * In addition to the raw counters, three derived values are reported:
+ *   - "total_records": sum of the four per-encoding record counters
+ *   - "total_samples": the writer-level sample counter
+ *   - "frame_compression_pct": frames_saved as a percentage of
+ *     (total_frames_written + frames_saved); 0.0 when that sum is zero
+ *
+ * Returns a new reference, or NULL with an exception set. */
+PyObject *
+binary_writer_get_stats(BinaryWriter *writer)
+{
+    BinaryWriterStats *stats = &writer->stats;
+
+    uint64_t record_total = stats->repeat_records + stats->full_records +
+                            stats->suffix_records + stats->pop_push_records;
+    uint64_t sample_total = writer->total_samples;
+
+    /* Frames that would have been written had no delta encoding been used */
+    uint64_t frame_total = stats->total_frames_written + stats->frames_saved;
+    double saved_pct = 0.0;
+    if (frame_total > 0) {
+        saved_pct = (double)stats->frames_saved / frame_total * 100.0;
+    }
+
+    PyObject *result = Py_BuildValue(
+        "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:d}",
+        "repeat_records", stats->repeat_records,
+        "repeat_samples", stats->repeat_samples,
+        "full_records", stats->full_records,
+        "suffix_records", stats->suffix_records,
+        "pop_push_records", stats->pop_push_records,
+        "total_records", record_total,
+        "total_samples", sample_total,
+        "total_frames_written", stats->total_frames_written,
+        "frames_saved", stats->frames_saved,
+        "bytes_written", stats->bytes_written,
+        "frame_compression_pct", saved_pct
+    );
+    return result;
+}
+
+/* Snapshot the reader's replay statistics as a Python dict.
+ *
+ * Reports the per-encoding record counters, their sum ("total_records"),
+ * the number of samples replayed and the number of stack reconstructions.
+ *
+ * Returns a new reference, or NULL with an exception set. */
+PyObject *
+binary_reader_get_stats(BinaryReader *reader)
+{
+    BinaryReaderStats *stats = &reader->stats;
+
+    uint64_t record_total = stats->repeat_records + stats->full_records +
+                            stats->suffix_records + stats->pop_push_records;
+
+    PyObject *result = Py_BuildValue(
+        "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K}",
+        "repeat_records", stats->repeat_records,
+        "repeat_samples", stats->repeat_samples,
+        "full_records", stats->full_records,
+        "suffix_records", stats->suffix_records,
+        "pop_push_records", stats->pop_push_records,
+        "total_records", record_total,
+        "total_samples", stats->total_samples,
+        "stack_reconstructions", stats->stack_reconstructions
+    );
+    return result;
+}
+
+/* Release every resource owned by `reader`, then free the reader itself.
+ *
+ * A NULL `reader` is accepted and ignored. After this call the pointer must
+ * not be used again: the struct is freed on the last line.
+ *
+ * Released here: the filename copy, the file backing (mmap + fd when
+ * USE_MMAP is set, otherwise the FILE* and the in-memory file copy), the
+ * decompression buffer, the string table (one reference per entry), the
+ * frame table and the per-thread reconstruction stacks. */
+void
+binary_reader_close(BinaryReader *reader)
+{
+    if (!reader) {
+        return;
+    }
+
+    PyMem_Free(reader->filename);
+
+#if USE_MMAP
+    if (reader->mapped_data) {
+        munmap(reader->mapped_data, reader->mapped_size);
+        reader->mapped_data = NULL;  /* Prevent use-after-free */
+        reader->mapped_size = 0;
+    }
+    if (reader->fd >= 0) {
+        close(reader->fd);
+        reader->fd = -1;  /* Mark as closed */
+    }
+#else
+    if (reader->fp) {
+        fclose(reader->fp);
+        reader->fp = NULL;
+    }
+    if (reader->file_data) {
+        PyMem_Free(reader->file_data);
+        reader->file_data = NULL;
+        reader->file_size = 0;
+    }
+#endif
+
+    PyMem_Free(reader->decompressed_data);
+
+    if (reader->strings) {
+        /* Entries may be NULL, hence Py_XDECREF rather than Py_DECREF */
+        for (uint32_t i = 0; i < reader->strings_count; i++) {
+            Py_XDECREF(reader->strings[i]);
+        }
+        PyMem_Free(reader->strings);
+    }
+
+    PyMem_Free(reader->frame_data);
+
+    if (reader->thread_states) {
+        /* Each thread state owns its own reconstructed-stack buffer */
+        for (size_t i = 0; i < reader->thread_state_count; i++) {
+            PyMem_Free(reader->thread_states[i].current_stack);
+        }
+        PyMem_Free(reader->thread_states);
+    }
+
+    PyMem_Free(reader);
+}
diff --git a/Modules/_remote_debugging/binary_io_writer.c b/Modules/_remote_debugging/binary_io_writer.c
new file mode 100644
index 00000000000000..fbcdea5cbe526b
--- /dev/null
+++ b/Modules/_remote_debugging/binary_io_writer.c
@@ -0,0 +1,1149 @@
+/******************************************************************************
+ * Python Remote Debugging Module - Binary Writer Implementation
+ *
+ * High-performance binary file writer for profiling data with optional zstd
+ * streaming compression.
+ ******************************************************************************/
+
+#ifndef Py_BUILD_CORE_MODULE
+#  define Py_BUILD_CORE_MODULE
+#endif
+
+#include "binary_io.h"
+#include "_remote_debugging.h"
+#include <string.h>
+
+#ifdef HAVE_ZSTD
+#include <zstd.h>
+#endif
+
+/* ============================================================================
+ * CONSTANTS FOR BINARY FORMAT SIZES
+ * ============================================================================ */
+
+/* Sample header sizes */
+#define SAMPLE_HEADER_FIXED_SIZE 13      /* thread_id(8) + interpreter_id(4) + encoding(1) */
+#define SAMPLE_HEADER_MAX_SIZE 26        /* fixed + max_varint(10) + status(1) + margin */
+#define MAX_VARINT_SIZE 10               /* Maximum bytes for a varint64 */
+#define MAX_VARINT_SIZE_U32 5            /* Maximum bytes for a varint32 */
+/* Frame buffer: depth varint (max 2 bytes for 256) + 256 frames * 5 bytes/varint + margin */
+#define MAX_FRAME_BUFFER_SIZE ((MAX_STACK_DEPTH * MAX_VARINT_SIZE_U32) + MAX_VARINT_SIZE_U32 + 16)
+
+/* File structure sizes */
+#define FILE_HEADER_PLACEHOLDER_SIZE 64  /* Placeholder written at file start */
+#define FILE_HEADER_SIZE 52              /* Actual header content size */
+#define FILE_FOOTER_SIZE 32              /* Footer size */
+
+/* ============================================================================
+ * WRITER-SPECIFIC UTILITY HELPERS
+ * ============================================================================ */
+
+/* Grow two parallel arrays together (e.g., strings and string_lengths).
+ *
+ * *capacity is doubled (a capacity of 0 grows to a small non-zero value) and
+ * both arrays are moved into freshly allocated blocks of the new capacity.
+ * The first old-capacity elements of each array are preserved.
+ *
+ * Parameters:
+ *   array1, array2  - addresses of the two array pointers; updated on success
+ *   capacity        - shared element capacity; updated on success
+ *   elem_size1/2    - element size in bytes of array1 / array2
+ *
+ * Returns 0 on success, -1 on error (sets PyErr).
+ * On error, original arrays are preserved (truly atomic update). */
+static inline int
+grow_parallel_arrays(void **array1, void **array2, size_t *capacity,
+                     size_t elem_size1, size_t elem_size2)
+{
+    size_t old_cap = *capacity;
+
+    if (old_cap > SIZE_MAX / 2) {
+        PyErr_SetString(PyExc_OverflowError, "Array capacity overflow");
+        return -1;
+    }
+    /* Doubling 0 yields 0: the call would "succeed" while leaving no room,
+     * and the caller's next store would land outside the allocation.
+     * Start from a small non-zero capacity instead. */
+    size_t new_cap = (old_cap == 0) ? 8 : old_cap * 2;
+
+    if (new_cap > SIZE_MAX / elem_size1 || new_cap > SIZE_MAX / elem_size2) {
+        PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow");
+        return -1;
+    }
+
+    size_t new_size1 = new_cap * elem_size1;
+    size_t new_size2 = new_cap * elem_size2;
+    size_t old_size1 = old_cap * elem_size1;
+    size_t old_size2 = old_cap * elem_size2;
+
+    /* Allocate fresh memory blocks (not realloc) to ensure atomicity.
+     * If either allocation fails, original arrays are completely unchanged. */
+    void *new_array1 = PyMem_Malloc(new_size1);
+    if (!new_array1) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    void *new_array2 = PyMem_Malloc(new_size2);
+    if (!new_array2) {
+        /* Second allocation failed - free first and return with no state change */
+        PyMem_Free(new_array1);
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    /* Both allocations succeeded - copy data and update pointers atomically.
+     * Skip the copy for empty arrays: memcpy() must not be given a NULL
+     * source even when the length is zero. */
+    if (old_size1 > 0) {
+        memcpy(new_array1, *array1, old_size1);
+    }
+    if (old_size2 > 0) {
+        memcpy(new_array2, *array2, old_size2);
+    }
+
+    PyMem_Free(*array1);
+    PyMem_Free(*array2);
+
+    *array1 = new_array1;
+    *array2 = new_array2;
+    *capacity = new_cap;
+    return 0;
+}
+
+/* fwrite() wrapper that drops the GIL for the duration of the write so other
+ * Python threads can make progress while the I/O may block.
+ *
+ * Returns 0 when all `size` bytes were written. On a short write, raises
+ * IOError from errno and returns -1. */
+static inline int
+fwrite_checked_allow_threads(const void *data, size_t size, FILE *fp)
+{
+    size_t n_written;
+
+    Py_BEGIN_ALLOW_THREADS
+    n_written = fwrite(data, 1, size, fp);
+    Py_END_ALLOW_THREADS
+
+    if (n_written == size) {
+        return 0;
+    }
+    PyErr_SetFromErrno(PyExc_IOError);
+    return -1;
+}
+
+/* Forward declaration for writer_write_bytes */
+static inline int writer_write_bytes(BinaryWriter *writer, const void *data, size_t size);
+
+/* Varint-encode a 32-bit value and append it to the writer's output.
+ * Returns 0 on success, -1 on error. */
+static inline int
+writer_write_varint_u32(BinaryWriter *writer, uint32_t value)
+{
+    uint8_t encoded[MAX_VARINT_SIZE];
+    size_t encoded_len = encode_varint_u32(encoded, value);
+
+    return writer_write_bytes(writer, encoded, encoded_len);
+}
+
+/* Varint-encode a 64-bit value and append it to the writer's output.
+ * Returns 0 on success, -1 on error. */
+static inline int
+writer_write_varint_u64(BinaryWriter *writer, uint64_t value)
+{
+    uint8_t encoded[MAX_VARINT_SIZE];
+    size_t encoded_len = encode_varint_u64(encoded, value);
+
+    return writer_write_bytes(writer, encoded, encoded_len);
+}
+
+
+/* ============================================================================
+ * UTILITY FUNCTIONS
+ * ============================================================================ */
+
+/* Report whether this build was compiled with zstd support.
+ * Returns 1 when HAVE_ZSTD is defined, 0 otherwise. */
+int
+binary_io_zstd_available(void)
+{
+    int available = 0;
+#ifdef HAVE_ZSTD
+    available = 1;
+#endif
+    return available;
+}
+
+/* Return the preferred compression type for this build:
+ * COMPRESSION_ZSTD when zstd is compiled in, COMPRESSION_NONE otherwise. */
+int
+binary_io_get_best_compression(void)
+{
+    int best = COMPRESSION_NONE;
+#ifdef HAVE_ZSTD
+    best = COMPRESSION_ZSTD;
+#endif
+    return best;
+}
+
+/* ============================================================================
+ * BINARY WRITER IMPLEMENTATION
+ * ============================================================================ */
+
+/* Set up the writer's zstd streaming state: a compression context at
+ * level 5 plus an output buffer of COMPRESSED_BUFFER_SIZE bytes.
+ *
+ * On any failure the context is released again and writer->zstd.cctx is
+ * reset to NULL. When the build has no zstd support the call always fails
+ * with RuntimeError.
+ *
+ * Returns 0 on success, -1 on failure (with an exception set). */
+static int
+writer_init_zstd(BinaryWriter *writer)
+{
+#ifdef HAVE_ZSTD
+    writer->zstd.cctx = ZSTD_createCCtx();
+    if (writer->zstd.cctx == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Failed to create zstd compression context");
+        return -1;
+    }
+
+    /* Compression level 5: better ratio for repetitive profiling data */
+    size_t status = ZSTD_CCtx_setParameter(writer->zstd.cctx,
+                                           ZSTD_c_compressionLevel, 5);
+    if (ZSTD_isError(status)) {
+        PyErr_Format(PyExc_RuntimeError, "Failed to set zstd compression level: %s",
+                     ZSTD_getErrorName(status));
+        goto release_context;
+    }
+
+    /* Use large buffer (512KB) for fewer I/O syscalls */
+    writer->zstd.compressed_buffer = PyMem_Malloc(COMPRESSED_BUFFER_SIZE);
+    if (writer->zstd.compressed_buffer == NULL) {
+        PyErr_NoMemory();
+        goto release_context;
+    }
+    writer->zstd.compressed_buffer_size = COMPRESSED_BUFFER_SIZE;
+    return 0;
+
+release_context:
+    /* Shared failure path: drop the context so no half-initialized state remains */
+    ZSTD_freeCCtx(writer->zstd.cctx);
+    writer->zstd.cctx = NULL;
+    return -1;
+#else
+    PyErr_SetString(PyExc_RuntimeError,
+        "zstd compression requested but not available (HAVE_ZSTD not defined)");
+    return -1;
+#endif
+}
+
+/* Drain the writer's staging buffer to writer->fp.
+ *
+ * With zstd compression the buffered bytes are fed through the streaming
+ * compressor and whatever compressed output each step produces is written
+ * to the file; otherwise the raw bytes are written directly.
+ *
+ * buffer_pos is reset to 0 only when everything succeeded; on failure the
+ * buffer contents and position are left as they were.
+ *
+ * Returns 0 on success (including when the buffer is empty), -1 on error
+ * with an exception set. */
+static int
+writer_flush_buffer(BinaryWriter *writer)
+{
+    if (writer->buffer_pos == 0) {
+        return 0;
+    }
+
+#ifdef HAVE_ZSTD
+    if (writer->compression_type == COMPRESSION_ZSTD) {
+        ZSTD_inBuffer input = { writer->write_buffer, writer->buffer_pos, 0 };
+
+        /* Loop until the compressor has consumed the whole staging buffer */
+        while (input.pos < input.size) {
+            ZSTD_outBuffer output = {
+                writer->zstd.compressed_buffer,
+                writer->zstd.compressed_buffer_size,
+                0
+            };
+
+            /* ZSTD_e_continue: keep the frame open; the compressor may hold
+             * data back internally, so output.pos can legitimately be 0. */
+            size_t result = ZSTD_compressStream2(
+                writer->zstd.cctx, &output, &input, ZSTD_e_continue
+            );
+
+            if (ZSTD_isError(result)) {
+                PyErr_Format(PyExc_IOError, "zstd compression error: %s",
+                             ZSTD_getErrorName(result));
+                return -1;
+            }
+
+            if (output.pos > 0) {
+                if (fwrite_checked_allow_threads(writer->zstd.compressed_buffer, output.pos, writer->fp) < 0) {
+                    return -1;
+                }
+            }
+        }
+    } else
+#endif
+    /* Uncompressed path; also the only path when built without zstd */
+    {
+        if (fwrite_checked_allow_threads(writer->write_buffer, writer->buffer_pos, writer->fp) < 0) {
+            return -1;
+        }
+    }
+
+    writer->buffer_pos = 0;
+    return 0;
+}
+
+/* Append `size` bytes from `data` to the writer's staging buffer, flushing
+ * the buffer each time it becomes full.
+ *
+ * stats.bytes_written is advanced by `size` only when the whole request
+ * succeeded.
+ *
+ * Returns 0 on success, -1 if a flush failed. */
+static inline int
+writer_write_bytes(BinaryWriter *writer, const void *data, size_t size)
+{
+    const uint8_t *cursor = (const uint8_t *)data;
+    size_t remaining = size;
+
+    while (remaining != 0) {
+        /* Copy as much as fits into the free tail of the buffer */
+        size_t room = writer->buffer_size - writer->buffer_pos;
+        size_t chunk = remaining;
+        if (chunk > room) {
+            chunk = room;
+        }
+
+        memcpy(writer->write_buffer + writer->buffer_pos, cursor, chunk);
+        writer->buffer_pos += chunk;
+        cursor += chunk;
+        remaining -= chunk;
+
+        if (writer->buffer_pos == writer->buffer_size
+            && writer_flush_buffer(writer) < 0) {
+            return -1;
+        }
+    }
+
+    writer->stats.bytes_written += size;
+    return 0;
+}
+
+/* ============================================================================
+ * HASH TABLE SUPPORT FUNCTIONS (using _Py_hashtable)
+ * ============================================================================ */
+
+/* Hash callback for the string table: delegates to PyObject_Hash().
+ * The callback cannot report failure, so a hashing error is cleared and
+ * mapped to hash value 0. */
+static Py_uhash_t
+string_hash_func(const void *key)
+{
+    Py_hash_t value = PyObject_Hash((PyObject *)key);
+    if (value != -1) {
+        return (Py_uhash_t)value;
+    }
+
+    /* -1 means PyObject_Hash() raised */
+    PyErr_Clear();
+    return 0;
+}
+
+/* Equality callback for the string table.
+ * Returns 1 when both keys are the same object or compare equal, else 0.
+ * A comparison error is cleared and treated as "not equal". */
+static int
+string_compare_func(const void *key1, const void *key2)
+{
+    /* Identity implies equality; skip the rich comparison */
+    if (key1 == key2) {
+        return 1;
+    }
+
+    int outcome = PyObject_RichCompareBool((PyObject *)key1,
+                                           (PyObject *)key2, Py_EQ);
+    if (outcome != -1) {
+        return outcome;
+    }
+
+    PyErr_Clear();
+    return 0;
+}
+
+/* Key destructor for the string table: drops the reference the table held
+ * on the Python string (NULL keys are tolerated). */
+static void
+string_key_destroy(void *key)
+{
+    PyObject *held = (PyObject *)key;
+    Py_XDECREF(held);
+}
+
+/* Hash callback for the frame table.
+ * Mixes filename index, function-name index and line number with an
+ * xor-then-multiply step per field (FNV-1a style constants). */
+static Py_uhash_t
+frame_key_hash_func(const void *key)
+{
+    const FrameKey *frame = (const FrameKey *)key;
+    Py_uhash_t acc = 2166136261u;
+
+    acc = (acc ^ frame->filename_idx) * 16777619u;
+    acc = (acc ^ frame->funcname_idx) * 16777619u;
+    /* Go through uint32_t so a negative lineno is not sign-extended */
+    acc = (acc ^ (uint32_t)frame->lineno) * 16777619u;
+
+    return acc;
+}
+
+/* Equality callback for the frame table.
+ * Returns 1 when filename index, function-name index and line number all
+ * match, 0 otherwise. */
+static int
+frame_key_compare_func(const void *key1, const void *key2)
+{
+    const FrameKey *lhs = (const FrameKey *)key1;
+    const FrameKey *rhs = (const FrameKey *)key2;
+
+    if (lhs->filename_idx != rhs->filename_idx) {
+        return 0;
+    }
+    if (lhs->funcname_idx != rhs->funcname_idx) {
+        return 0;
+    }
+    return lhs->lineno == rhs->lineno;
+}
+
+/* Key destructor for the frame table: keys are individually allocated
+ * FrameKey blocks, so releasing one is a plain PyMem_Free(). */
+static void
+frame_key_destroy(void *key)
+{
+    FrameKey *owned = (FrameKey *)key;
+    PyMem_Free(owned);
+}
+
+/* Map a Python string to its index in the writer's string table, adding it
+ * when it has not been seen before.
+ *
+ * New strings are stored as a private NUL-terminated UTF-8 copy together
+ * with their byte length; the hash table keeps a strong reference to the
+ * Python object as its key.
+ *
+ * On success stores the index in *index and returns 0.
+ * Returns -1 with an exception set on failure. */
+static inline int
+writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index)
+{
+    /* Table values hold index+1 so that index 0 is distinguishable from the
+     * NULL that signals "not found". */
+    void *hit = _Py_hashtable_get(writer->string_hash, string);
+    if (hit != NULL) {
+        *index = (uint32_t)(uintptr_t)hit - 1;
+        return 0;
+    }
+
+    /* Make room for one more entry in both parallel arrays */
+    if (writer->string_count >= writer->string_capacity
+        && grow_parallel_arrays((void **)&writer->strings,
+                                (void **)&writer->string_lengths,
+                                &writer->string_capacity,
+                                sizeof(char *), sizeof(size_t)) < 0) {
+        return -1;
+    }
+
+    Py_ssize_t utf8_len;
+    const char *utf8 = PyUnicode_AsUTF8AndSize(string, &utf8_len);
+    if (utf8 == NULL) {
+        return -1;
+    }
+
+    /* Private copy, including the trailing NUL */
+    char *owned = PyMem_Malloc(utf8_len + 1);
+    if (owned == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    memcpy(owned, utf8, utf8_len + 1);
+
+    uint32_t new_index = (uint32_t)writer->string_count;
+    *index = new_index;
+
+    /* Register in the hash table before touching the arrays: if this step
+     * fails there is nothing in the arrays to roll back. */
+    Py_INCREF(string);
+    if (_Py_hashtable_set(writer->string_hash, string,
+                          (void *)(uintptr_t)(new_index + 1)) < 0) {
+        Py_DECREF(string);
+        PyMem_Free(owned);
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    writer->strings[writer->string_count] = owned;
+    writer->string_lengths[writer->string_count] = utf8_len;
+    writer->string_count++;
+
+    return 0;
+}
+
+/* Map a (filename_idx, funcname_idx, lineno) triple to its index in the
+ * writer's frame table, adding a new entry when the triple is unseen.
+ *
+ * On success stores the index in *index and returns 0.
+ * Returns -1 with an exception set on failure; frame_count is only
+ * advanced once the hash table insertion has succeeded. */
+static inline int
+writer_intern_frame(BinaryWriter *writer, uint32_t filename_idx, uint32_t funcname_idx,
+                    int32_t lineno, uint32_t *index)
+{
+    FrameKey probe = {filename_idx, funcname_idx, lineno};
+
+    /* Table values hold index+1 so that index 0 is distinguishable from the
+     * NULL that signals "not found". */
+    void *hit = _Py_hashtable_get(writer->frame_hash, &probe);
+    if (hit != NULL) {
+        *index = (uint32_t)(uintptr_t)hit - 1;
+        return 0;
+    }
+
+    if (GROW_ARRAY(writer->frame_entries, writer->frame_count,
+                   writer->frame_capacity, FrameEntry) < 0) {
+        return -1;
+    }
+
+    /* The table owns its keys, so it needs a heap copy of the probe */
+    FrameKey *owned_key = PyMem_Malloc(sizeof(FrameKey));
+    if (owned_key == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    *owned_key = probe;
+
+    uint32_t new_index = (uint32_t)writer->frame_count;
+    *index = new_index;
+
+    FrameEntry *slot = &writer->frame_entries[writer->frame_count];
+    slot->filename_idx = filename_idx;
+    slot->funcname_idx = funcname_idx;
+    slot->lineno = lineno;
+
+    if (_Py_hashtable_set(writer->frame_hash, owned_key,
+                          (void *)(uintptr_t)(new_index + 1)) < 0) {
+        PyMem_Free(owned_key);
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    writer->frame_count++;
+    return 0;
+}
+
+/* Get or create a thread entry for the given (thread_id, interpreter_id).
+ *
+ * A new entry starts with prev_timestamp set to the writer's start time and
+ * with freshly allocated prev_stack (MAX_STACK_DEPTH slots) and pending_rle
+ * (INITIAL_RLE_CAPACITY slots) buffers.
+ *
+ * Returns pointer to ThreadEntry, or NULL on allocation failure; in that
+ * case the writer's existing entries and thread_count are left untouched.
+ * If is_new is non-NULL, sets it to 1 if this is a new thread, 0 otherwise.
+ *
+ * The returned pointer refers into writer->thread_entries and may be
+ * invalidated by a later call that grows the array. */
+static ThreadEntry *
+writer_get_or_create_thread_entry(BinaryWriter *writer, uint64_t thread_id,
+                                   uint32_t interpreter_id, int *is_new)
+{
+    /* Linear search is OK for small number of threads.
+     * Key is (thread_id, interpreter_id) since same thread_id can exist in different interpreters. */
+    for (size_t i = 0; i < writer->thread_count; i++) {
+        if (writer->thread_entries[i].thread_id == thread_id &&
+            writer->thread_entries[i].interpreter_id == interpreter_id) {
+            if (is_new) {
+                *is_new = 0;
+            }
+            return &writer->thread_entries[i];
+        }
+    }
+
+    if (writer->thread_count >= writer->thread_capacity) {
+        /* Grow through a temporary: storing a NULL result directly into
+         * writer->thread_entries would drop the only pointer to the live
+         * array (leaking every entry's buffers) while thread_count still
+         * claims entries exist.
+         * NOTE(review): assumes grow_array() has realloc semantics, i.e. the
+         * old block stays valid when it returns NULL - confirm. */
+        ThreadEntry *grown = grow_array(writer->thread_entries,
+                                        &writer->thread_capacity,
+                                        sizeof(ThreadEntry));
+        if (!grown) {
+            return NULL;
+        }
+        writer->thread_entries = grown;
+    }
+
+    ThreadEntry *entry = &writer->thread_entries[writer->thread_count];
+    memset(entry, 0, sizeof(ThreadEntry));
+    entry->thread_id = thread_id;
+    entry->interpreter_id = interpreter_id;
+    entry->prev_timestamp = writer->start_time_us;
+    entry->prev_stack_capacity = MAX_STACK_DEPTH;
+    entry->pending_rle_capacity = INITIAL_RLE_CAPACITY;
+
+    entry->prev_stack = PyMem_Malloc(entry->prev_stack_capacity * sizeof(uint32_t));
+    if (!entry->prev_stack) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    entry->pending_rle = PyMem_Malloc(entry->pending_rle_capacity * sizeof(PendingRLESample));
+    if (!entry->pending_rle) {
+        PyMem_Free(entry->prev_stack);
+        /* Do not leave a dangling pointer in the (unpublished) slot */
+        entry->prev_stack = NULL;
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* Publish the entry only once it is fully initialized */
+    writer->thread_count++;
+    if (is_new) {
+        *is_new = 1;
+    }
+    return entry;
+}
+
+/* Compare the previous and current stacks of a thread and pick the record
+ * encoding for the current one.
+ *
+ * Stacks are stored innermost-first, so frames shared between the two
+ * samples sit at the END of both arrays.
+ *
+ * Outputs:
+ *   - shared_count: number of frames matching from the bottom of the stack
+ *   - pop_count:    frames to remove from the previous stack
+ *   - push_count:   new frames to add
+ *
+ * Returns:
+ *   STACK_REPEAT   - both stacks are identical
+ *   STACK_FULL     - nothing is shared, or less than half of the current
+ *                    stack is shared
+ *   STACK_SUFFIX   - the whole previous stack is kept and frames are added
+ *   STACK_POP_PUSH - frames are popped (and possibly pushed) on top of a
+ *                    shared part covering at least half the current stack */
+static int
+compare_stacks(const uint32_t *prev_stack, size_t prev_depth,
+               const uint32_t *curr_stack, size_t curr_depth,
+               size_t *shared_count, size_t *pop_count, size_t *push_count)
+{
+    /* Equal depth: scan for the first differing frame */
+    if (prev_depth == curr_depth) {
+        size_t pos = 0;
+        while (pos < prev_depth && prev_stack[pos] == curr_stack[pos]) {
+            pos++;
+        }
+        if (pos == prev_depth) {
+            *shared_count = prev_depth;
+            *pop_count = 0;
+            *push_count = 0;
+            return STACK_REPEAT;
+        }
+    }
+
+    /* Length of the common tail (outermost frames) of the two stacks */
+    size_t limit = (curr_depth > prev_depth) ? prev_depth : curr_depth;
+    size_t common = 0;
+    while (common < limit
+           && prev_stack[prev_depth - 1 - common] == curr_stack[curr_depth - 1 - common]) {
+        common++;
+    }
+
+    size_t to_pop = prev_depth - common;
+    size_t to_push = curr_depth - common;
+    *shared_count = common;
+    *pop_count = to_pop;
+    *push_count = to_push;
+
+    /* Byte cost of each candidate:
+     *   STACK_FULL:     1 (type) + 1-2 (depth) + sum(frame varints)
+     *   STACK_SUFFIX:   1 (type) + 1-2 (shared) + 1-2 (new_count) + sum(new frame varints)
+     *   STACK_POP_PUSH: 1 (type) + 1-2 (pop) + 1-2 (push) + sum(new frame varints) */
+
+    if (common == 0) {
+        /* Nothing to reuse */
+        return STACK_FULL;
+    }
+    if (to_pop == 0) {
+        /* Previous stack fully retained: pure growth, or (defensively) no
+         * change at all, which falls back to a full record */
+        return (to_push > 0) ? STACK_SUFFIX : STACK_FULL;
+    }
+
+    /* Frames were popped. Heuristic: a delta only pays off when at least
+     * half of the current stack is shared. */
+    return (common < curr_depth / 2) ? STACK_FULL : STACK_POP_PUSH;
+}
+
+/* Emit the fixed 13-byte prefix shared by all sample records:
+ * thread_id (8 bytes), interpreter_id (4 bytes), encoding (1 byte).
+ * The two ids are copied in their in-memory byte order.
+ * Returns 0 on success, -1 on failure. */
+static inline int
+write_sample_header(BinaryWriter *writer, ThreadEntry *entry, uint8_t encoding)
+{
+    uint8_t bytes[SAMPLE_HEADER_FIXED_SIZE];
+    uint8_t *out = bytes;
+
+    memcpy(out, &entry->thread_id, 8);
+    out += 8;
+    memcpy(out, &entry->interpreter_id, 4);
+    out += 4;
+    *out = encoding;
+
+    return writer_write_bytes(writer, bytes, SAMPLE_HEADER_FIXED_SIZE);
+}
+
+/* Emit the buffered run of repeated-stack samples for one thread as a
+ * single STACK_REPEAT record and reset the thread's pending state.
+ *
+ * Record layout:
+ *   [thread_id: 8] [interpreter_id: 4] [STACK_REPEAT: 1] [count: varint]
+ *   [timestamp_delta_1: varint] [status_1: 1] ... [timestamp_delta_N: varint] [status_N: 1]
+ *
+ * Does nothing when no samples are pending.
+ * Returns 0 on success, -1 on failure; on failure the pending samples are
+ * left in place. */
+static int
+flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry)
+{
+    if (entry->pending_rle_count == 0 || !entry->has_pending_rle) {
+        return 0;
+    }
+
+    /* Record prefix followed by the number of samples in the run */
+    if (write_sample_header(writer, entry, STACK_REPEAT) < 0
+        || writer_write_varint_u32(writer, (uint32_t)entry->pending_rle_count) < 0) {
+        return -1;
+    }
+
+    for (size_t n = 0; n < entry->pending_rle_count; n++) {
+        if (writer_write_varint_u64(writer, entry->pending_rle[n].timestamp_delta) < 0
+            || writer_write_bytes(writer, &entry->pending_rle[n].status, 1) < 0) {
+            return -1;
+        }
+        /* Counted per sample, as each one is handed to the output */
+        writer->total_samples++;
+    }
+
+    writer->stats.repeat_records++;
+    writer->stats.repeat_samples += entry->pending_rle_count;
+    /* Each RLE sample saves writing the entire stack */
+    writer->stats.frames_saved += entry->pending_rle_count * entry->prev_stack_depth;
+
+    entry->has_pending_rle = 0;
+    entry->pending_rle_count = 0;
+
+    return 0;
+}
+
+/* Write a single (non-repeat) sample record with the specified encoding.
+ *
+ * Record layout:
+ *   [thread_id: 8] [interpreter_id: 4] [encoding: 1] [delta: varint] [status: 1]
+ * followed by an encoding-specific payload:
+ *   STACK_FULL:     [depth: varint] [frame_idx: varint]...
+ *   STACK_SUFFIX:   [shared_count: varint] [new_count: varint] [new_frame_idx: varint]...
+ *   STACK_POP_PUSH: [pop_count: varint] [push_count: varint] [new_frame_idx: varint]...
+ *
+ * New frames are at the top (beginning) of the current stack, i.e. they are
+ * the first push_count entries of frame_indices.
+ *
+ * The request is validated before any byte is emitted, so an invalid
+ * encoding type or an oversized stack never leaves a truncated record in
+ * the output and never overruns the on-stack frame buffer. Statistics are
+ * updated only after the complete record has been accepted by
+ * writer_write_bytes().
+ *
+ * Returns 0 on success, -1 on failure (with an exception set). */
+static int
+write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry,
+                           uint64_t timestamp_delta, uint8_t status,
+                           int encoding_type,
+                           const uint32_t *frame_indices, size_t stack_depth,
+                           size_t shared_count, size_t pop_count, size_t push_count)
+{
+    size_t frames_written;   /* number of frame indices in the payload */
+    uint32_t lead_count;     /* first varint of the payload */
+
+    switch (encoding_type) {
+    case STACK_FULL:
+        frames_written = stack_depth;
+        lead_count = (uint32_t)stack_depth;
+        break;
+    case STACK_SUFFIX:
+        frames_written = push_count;
+        lead_count = (uint32_t)shared_count;
+        break;
+    case STACK_POP_PUSH:
+        frames_written = push_count;
+        lead_count = (uint32_t)pop_count;
+        break;
+    default:
+        PyErr_SetString(PyExc_RuntimeError, "Invalid stack encoding type");
+        return -1;
+    }
+
+    /* frame_buf below is sized for at most MAX_STACK_DEPTH frame varints,
+     * and frame_indices is read for frames_written entries. */
+    if (stack_depth > MAX_STACK_DEPTH || frames_written > stack_depth) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Stack depth exceeds maximum encodable depth");
+        return -1;
+    }
+
+    /* Header: thread_id(8) + interpreter_id(4) + encoding(1) + delta(varint) + status(1) */
+    uint8_t header_buf[SAMPLE_HEADER_MAX_SIZE];
+    memcpy(header_buf, &entry->thread_id, 8);
+    memcpy(header_buf + 8, &entry->interpreter_id, 4);
+    header_buf[12] = (uint8_t)encoding_type;
+    size_t header_len = SAMPLE_HEADER_FIXED_SIZE;
+    header_len += encode_varint_u64(header_buf + header_len, timestamp_delta);
+    header_buf[header_len++] = status;
+
+    /* Payload: one or two count varints, then the frame indices */
+    uint8_t frame_buf[MAX_FRAME_BUFFER_SIZE];
+    size_t frame_buf_pos = encode_varint_u32(frame_buf, lead_count);
+    if (encoding_type != STACK_FULL) {
+        frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, (uint32_t)push_count);
+    }
+    for (size_t i = 0; i < frames_written; i++) {
+        frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]);
+    }
+
+    if (writer_write_bytes(writer, header_buf, header_len) < 0) {
+        return -1;
+    }
+    if (writer_write_bytes(writer, frame_buf, frame_buf_pos) < 0) {
+        return -1;
+    }
+
+    /* Record accepted: account for it */
+    if (encoding_type == STACK_FULL) {
+        writer->stats.full_records++;
+    }
+    else {
+        if (encoding_type == STACK_SUFFIX) {
+            writer->stats.suffix_records++;
+        }
+        else {
+            writer->stats.pop_push_records++;
+        }
+        /* Saved writing shared_count frames compared to a full record */
+        writer->stats.frames_saved += shared_count;
+    }
+    writer->stats.total_frames_written += frames_written;
+    writer->total_samples++;
+    return 0;
+}
+
+/* Create a writer for `filename` and reserve room for the file header.
+ *
+ * Parameters:
+ *   filename            Output path; the string is copied into the writer.
+ *   sample_interval_us  Stored in the writer as given.
+ *   compression_type    Stored as given; COMPRESSION_ZSTD additionally sets up
+ *                       the zstd state through writer_init_zstd().
+ *   start_time_us       Stored in the writer as given.
+ *
+ * The file is opened in "wb" mode and FILE_HEADER_PLACEHOLDER_SIZE zero bytes
+ * are written at its start.
+ *
+ * Returns the new writer, or NULL with a Python exception set.  Every
+ * allocation failure in this function raises MemoryError; fopen() failures
+ * raise from errno.  writer_init_zstd() and fwrite_checked_allow_threads()
+ * are assumed to set their own exception on failure -- TODO confirm.
+ * On failure everything acquired so far is released with
+ * binary_writer_destroy(); an already created output file is not removed. */
+BinaryWriter *
+binary_writer_create(const char *filename, uint64_t sample_interval_us, int compression_type,
+                     uint64_t start_time_us)
+{
+    /* Calloc so that every pointer and counter starts out as NULL/0, which
+     * lets binary_writer_destroy() run on a half-built writer. */
+    BinaryWriter *writer = PyMem_Calloc(1, sizeof(BinaryWriter));
+    if (!writer) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    size_t filename_size = strlen(filename) + 1;
+    writer->filename = PyMem_Malloc(filename_size);
+    if (!writer->filename) {
+        goto no_memory;
+    }
+    memcpy(writer->filename, filename, filename_size);
+
+    writer->start_time_us = start_time_us;
+    writer->sample_interval_us = sample_interval_us;
+    writer->compression_type = compression_type;
+
+    writer->write_buffer = PyMem_Malloc(WRITE_BUFFER_SIZE);
+    if (!writer->write_buffer) {
+        goto no_memory;
+    }
+    writer->buffer_size = WRITE_BUFFER_SIZE;
+
+    writer->string_hash = _Py_hashtable_new_full(
+        string_hash_func,
+        string_compare_func,
+        string_key_destroy,  /* Key destroy: decref the Python string */
+        NULL,                /* Value destroy: values are just indices, not pointers */
+        NULL                 /* Use default allocator */
+    );
+    if (!writer->string_hash) {
+        /* The hashtable constructor reports failure by returning NULL only;
+         * the exception has to be raised here. */
+        goto no_memory;
+    }
+    writer->strings = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(char *));
+    if (!writer->strings) {
+        goto no_memory;
+    }
+    writer->string_lengths = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(size_t));
+    if (!writer->string_lengths) {
+        goto no_memory;
+    }
+    writer->string_capacity = INITIAL_STRING_CAPACITY;
+
+    writer->frame_hash = _Py_hashtable_new_full(
+        frame_key_hash_func,
+        frame_key_compare_func,
+        frame_key_destroy,   /* Key destroy: free the FrameKey */
+        NULL,                /* Value destroy: values are just indices, not pointers */
+        NULL                 /* Use default allocator */
+    );
+    if (!writer->frame_hash) {
+        goto no_memory;
+    }
+    writer->frame_entries = PyMem_Malloc(INITIAL_FRAME_CAPACITY * sizeof(FrameEntry));
+    if (!writer->frame_entries) {
+        goto no_memory;
+    }
+    writer->frame_capacity = INITIAL_FRAME_CAPACITY;
+
+    writer->thread_entries = PyMem_Malloc(INITIAL_THREAD_CAPACITY * sizeof(ThreadEntry));
+    if (!writer->thread_entries) {
+        goto no_memory;
+    }
+    writer->thread_capacity = INITIAL_THREAD_CAPACITY;
+
+    if (compression_type == COMPRESSION_ZSTD) {
+        if (writer_init_zstd(writer) < 0) {
+            goto error;
+        }
+    }
+
+    /* Open the file last so that no file is created when setup fails early. */
+    writer->fp = fopen(filename, "wb");
+    if (!writer->fp) {
+        PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
+        goto error;
+    }
+
+    /* Hint sequential write pattern to kernel for better I/O scheduling */
+#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL)
+    {
+        int fd = fileno(writer->fp);
+        if (fd >= 0) {
+            /* Purely advisory: the result is deliberately ignored. */
+            (void)posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+        }
+    }
+#endif
+
+    /* Zero-filled placeholder occupying the start of the file. */
+    uint8_t header[FILE_HEADER_PLACEHOLDER_SIZE] = {0};
+    if (fwrite_checked_allow_threads(header, FILE_HEADER_PLACEHOLDER_SIZE, writer->fp) < 0) {
+        goto error;
+    }
+
+    return writer;
+
+no_memory:
+    /* Allocation failures above do not set an exception by themselves. */
+    PyErr_NoMemory();
+error:
+    binary_writer_destroy(writer);
+    return NULL;
+}
+
+/* Build a frame stack from a Python frame list by interning all strings and
+ * frames.
+ *
+ * Parameters:
+ *   writer       Writer whose string and frame tables receive new entries.
+ *   frame_list   Python list of frame records.  Items 0, 1 and 2 of each
+ *                record are read as filename, location and funcname.
+ *   curr_stack   Output array; receives one frame index per processed frame.
+ *                The caller must provide room for MAX_STACK_DEPTH entries.
+ *   curr_depth   Output; number of entries stored in curr_stack.
+ *
+ * At most MAX_STACK_DEPTH frames are processed; list items past that index
+ * are silently ignored.
+ *
+ * Returns 0 on success, -1 on error with an exception set (TypeError when
+ * frame_list is not a list; otherwise whatever the interning helpers set). */
+static int
+build_frame_stack(BinaryWriter *writer, PyObject *frame_list,
+                  uint32_t *curr_stack, size_t *curr_depth)
+{
+    /* Validate before taking the size: a failed size query would otherwise
+     * store a negative value into the unsigned *curr_depth and still report
+     * success to the caller. */
+    if (!PyList_Check(frame_list)) {
+        PyErr_SetString(PyExc_TypeError, "thread frames must be a list");
+        return -1;
+    }
+    Py_ssize_t stack_depth = PyList_GET_SIZE(frame_list);
+    *curr_depth = (stack_depth < MAX_STACK_DEPTH) ? (size_t)stack_depth
+                                                  : (size_t)MAX_STACK_DEPTH;
+
+    for (Py_ssize_t k = 0; k < (Py_ssize_t)*curr_depth; k++) {
+        /* Use unchecked accessors since we control the data structures */
+        PyObject *frame_info = PyList_GET_ITEM(frame_list, k);
+
+        /* Get filename, location, funcname from FrameInfo using unchecked access */
+        PyObject *filename = PyStructSequence_GET_ITEM(frame_info, 0);
+        PyObject *location = PyStructSequence_GET_ITEM(frame_info, 1);
+        PyObject *funcname = PyStructSequence_GET_ITEM(frame_info, 2);
+
+        /* Extract lineno from location (can be None for synthetic frames) */
+        int32_t lineno = 0;
+        if (location != Py_None) {
+            /* Use unchecked access - first element is lineno */
+            PyObject *lineno_obj = PyTuple_Check(location) ?
+                PyTuple_GET_ITEM(location, 0) :
+                PyStructSequence_GET_ITEM(location, 0);
+            lineno = (int32_t)PyLong_AsLong(lineno_obj);
+            /* Best effort: a line number that cannot be converted is
+             * recorded as 0 instead of failing the whole sample. */
+            if (UNLIKELY(PyErr_Occurred() != NULL)) {
+                PyErr_Clear();
+                lineno = 0;
+            }
+        }
+
+        /* Intern filename */
+        uint32_t filename_idx;
+        if (writer_intern_string(writer, filename, &filename_idx) < 0) {
+            return -1;
+        }
+
+        /* Intern funcname */
+        uint32_t funcname_idx;
+        if (writer_intern_string(writer, funcname, &funcname_idx) < 0) {
+            return -1;
+        }
+
+        /* Intern frame */
+        uint32_t frame_idx;
+        if (writer_intern_frame(writer, filename_idx, funcname_idx, lineno, &frame_idx) < 0) {
+            return -1;
+        }
+
+        curr_stack[k] = frame_idx;
+    }
+    return 0;
+}
+
+/* Process a single thread's sample.
+ *
+ * Parameters:
+ *   writer          Destination writer.
+ *   thread_info     Record whose items 0, 1 and 2 are read as thread id,
+ *                   status and frame list (unchecked access).
+ *   interpreter_id  Interpreter the thread belongs to.
+ *   timestamp_us    Timestamp of this sample; the value written is the
+ *                   difference to the thread's previous timestamp.
+ *
+ * A sample whose stack equals the thread's previous one is buffered for
+ * run-length encoding; any other sample first flushes that buffer and is then
+ * written with the encoding chosen by compare_stacks().
+ *
+ * Returns 0 on success, -1 on error with an exception set (OverflowError when
+ * the status does not fit in the single byte reserved for it). */
+static int
+process_thread_sample(BinaryWriter *writer, PyObject *thread_info,
+                      uint32_t interpreter_id, uint64_t timestamp_us)
+{
+    PyObject *thread_id_obj = PyStructSequence_GET_ITEM(thread_info, 0);
+    PyObject *status_obj = PyStructSequence_GET_ITEM(thread_info, 1);
+    PyObject *frame_list = PyStructSequence_GET_ITEM(thread_info, 2);
+
+    uint64_t thread_id = PyLong_AsUnsignedLongLong(thread_id_obj);
+    if (thread_id == (uint64_t)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    long status_long = PyLong_AsLong(status_obj);
+    if (status_long == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+    /* Bounds check: status is stored as uint8_t in the binary format, so
+     * reject values that would otherwise be truncated silently. */
+    if (status_long < 0 || status_long > UINT8_MAX) {
+        PyErr_Format(PyExc_OverflowError,
+            "thread status %ld does not fit in an unsigned byte",
+            status_long);
+        return -1;
+    }
+    uint8_t status = (uint8_t)status_long;
+
+    int is_new_thread = 0;
+    ThreadEntry *entry = writer_get_or_create_thread_entry(
+        writer, thread_id, interpreter_id, &is_new_thread);
+    if (!entry) {
+        return -1;
+    }
+
+    /* Calculate timestamp delta */
+    uint64_t delta = timestamp_us - entry->prev_timestamp;
+    entry->prev_timestamp = timestamp_us;
+
+    /* Process frames and build current stack */
+    uint32_t curr_stack[MAX_STACK_DEPTH];
+    size_t curr_depth;
+    if (build_frame_stack(writer, frame_list, curr_stack, &curr_depth) < 0) {
+        return -1;
+    }
+
+    /* Compare with previous stack to determine encoding */
+    size_t shared_count, pop_count, push_count;
+    int encoding = compare_stacks(
+        entry->prev_stack, entry->prev_stack_depth,
+        curr_stack, curr_depth,
+        &shared_count, &pop_count, &push_count);
+
+    /* The first sample of a thread has nothing to repeat.  The record writer
+     * accepts only the full/suffix/pop-push encodings, so write the complete
+     * stack rather than passing STACK_REPEAT on to it.
+     * NOTE(review): assumes compare_stacks() can report STACK_REPEAT for a new
+     * thread (e.g. previous and current stacks both empty) -- confirm. */
+    if (encoding == STACK_REPEAT && is_new_thread) {
+        encoding = STACK_FULL;
+    }
+
+    if (encoding == STACK_REPEAT) {
+        /* Buffer this sample for RLE */
+        if (GROW_ARRAY(entry->pending_rle, entry->pending_rle_count,
+                       entry->pending_rle_capacity, PendingRLESample) < 0) {
+            return -1;
+        }
+        entry->pending_rle[entry->pending_rle_count].timestamp_delta = delta;
+        entry->pending_rle[entry->pending_rle_count].status = status;
+        entry->pending_rle_count++;
+        entry->has_pending_rle = 1;
+    } else {
+        /* Stack changed - flush any pending RLE first */
+        if (entry->has_pending_rle) {
+            if (flush_pending_rle(writer, entry) < 0) {
+                return -1;
+            }
+        }
+
+        if (write_sample_with_encoding(writer, entry, delta, status, encoding,
+                                       curr_stack, curr_depth,
+                                       shared_count, pop_count, push_count) < 0) {
+            return -1;
+        }
+
+        /* Remember this stack as the reference for the next comparison. */
+        memcpy(entry->prev_stack, curr_stack, curr_depth * sizeof(uint32_t));
+        entry->prev_stack_depth = curr_depth;
+    }
+
+    return 0;
+}
+
+/* Write one sample covering every thread of every interpreter.
+ *
+ * Parameters:
+ *   writer        Destination writer.
+ *   stack_frames  Python list of interpreter records.  Item 0 of each record
+ *                 is the interpreter id, item 1 the list of thread records.
+ *   timestamp_us  Timestamp applied to every thread in this sample.
+ *
+ * Returns 0 on success, -1 on error with an exception set: TypeError for a
+ * malformed container, OverflowError for an interpreter id that does not fit
+ * in uint32_t, or whatever process_thread_sample() raised.  Threads handled
+ * before the failing one have already been passed to the writer. */
+int
+binary_writer_write_sample(BinaryWriter *writer, PyObject *stack_frames, uint64_t timestamp_us)
+{
+    if (!PyList_Check(stack_frames)) {
+        PyErr_SetString(PyExc_TypeError, "stack_frames must be a list");
+        return -1;
+    }
+
+    Py_ssize_t num_interpreters = PyList_GET_SIZE(stack_frames);
+    for (Py_ssize_t i = 0; i < num_interpreters; i++) {
+        PyObject *interp_info = PyList_GET_ITEM(stack_frames, i);
+
+        /* The list contents come from the caller.  Struct sequences are tuple
+         * subclasses, so this check makes the unchecked item access below
+         * safe for them and for plain tuples, and rejects everything else. */
+        if (!PyTuple_Check(interp_info) || PyTuple_GET_SIZE(interp_info) < 2) {
+            PyErr_SetString(PyExc_TypeError,
+                "stack_frames items must be (interpreter_id, threads) records");
+            return -1;
+        }
+
+        PyObject *interp_id_obj = PyStructSequence_GET_ITEM(interp_info, 0);
+        PyObject *threads = PyStructSequence_GET_ITEM(interp_info, 1);
+
+        unsigned long interp_id_long = PyLong_AsUnsignedLong(interp_id_obj);
+        if (interp_id_long == (unsigned long)-1 && PyErr_Occurred()) {
+            return -1;
+        }
+        /* Bounds check: interpreter_id is stored as uint32_t in binary format */
+        if (interp_id_long > UINT32_MAX) {
+            PyErr_Format(PyExc_OverflowError,
+                "interpreter_id %lu exceeds maximum value %lu",
+                interp_id_long, (unsigned long)UINT32_MAX);
+            return -1;
+        }
+        uint32_t interpreter_id = (uint32_t)interp_id_long;
+
+        /* Same reasoning as for the outer list: PyList_GET_SIZE and
+         * PyList_GET_ITEM are only valid on real lists. */
+        if (!PyList_Check(threads)) {
+            PyErr_SetString(PyExc_TypeError, "interpreter threads must be a list");
+            return -1;
+        }
+
+        Py_ssize_t num_threads = PyList_GET_SIZE(threads);
+        for (Py_ssize_t j = 0; j < num_threads; j++) {
+            PyObject *thread_info = PyList_GET_ITEM(threads, j);
+            if (process_thread_sample(writer, thread_info, interpreter_id, timestamp_us) < 0) {
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* Complete the output file and close it.
+ *
+ * Steps, in file order:
+ *   1. flush every thread's buffered run-length samples and the write buffer;
+ *   2. with zstd enabled, end the compression stream;
+ *   3. append the string table (varint length + bytes per string);
+ *   4. append the frame table (filename index, funcname index, line number,
+ *      each as a varint);
+ *   5. append the 32-byte footer: string count (4), frame count (4), total
+ *      file size (8), 16 zero bytes reserved for a checksum;
+ *   6. seek back to offset 0 and write the 52-byte header: magic (4),
+ *      version (4), start time (8), sample interval (8), sample count (4),
+ *      thread count (4), string table offset (8), frame table offset (8),
+ *      compression type (4);
+ *   7. close the file.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.  On failure the
+ * file stays open unless the failing step was fclose() itself. */
+int
+binary_writer_finalize(BinaryWriter *writer)
+{
+    for (size_t t = 0; t < writer->thread_count; t++) {
+        ThreadEntry *thread = &writer->thread_entries[t];
+        if (thread->has_pending_rle && flush_pending_rle(writer, thread) < 0) {
+            return -1;
+        }
+    }
+
+    if (writer_flush_buffer(writer) < 0) {
+        return -1;
+    }
+
+#ifdef HAVE_ZSTD
+    /* Finalize compression stream: keep calling with ZSTD_e_end and no new
+     * input until the compressor reports nothing left to emit. */
+    if (writer->compression_type == COMPRESSION_ZSTD && writer->zstd.cctx) {
+        ZSTD_inBuffer no_input = { NULL, 0, 0 };
+
+        for (;;) {
+            ZSTD_outBuffer chunk = {
+                writer->zstd.compressed_buffer,
+                writer->zstd.compressed_buffer_size,
+                0
+            };
+
+            size_t pending = ZSTD_compressStream2(writer->zstd.cctx, &chunk, &no_input, ZSTD_e_end);
+            if (ZSTD_isError(pending)) {
+                PyErr_Format(PyExc_IOError, "zstd finalization error: %s",
+                             ZSTD_getErrorName(pending));
+                return -1;
+            }
+
+            if (chunk.pos > 0 &&
+                fwrite_checked_allow_threads(writer->zstd.compressed_buffer, chunk.pos, writer->fp) < 0) {
+                return -1;
+            }
+
+            if (pending == 0) {
+                break;
+            }
+        }
+    }
+#endif
+
+    /* Use 64-bit file position for >2GB files */
+    file_offset_t string_table_offset = FTELL64(writer->fp);
+    if (string_table_offset < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        return -1;
+    }
+
+    /* String table.  Each write goes through the GIL-releasing helper. */
+    for (size_t s = 0; s < writer->string_count; s++) {
+        uint8_t len_buf[10];
+        size_t str_len = writer->string_lengths[s];
+        size_t prefix_len = encode_varint_u32(len_buf, (uint32_t)str_len);
+
+        if (fwrite_checked_allow_threads(len_buf, prefix_len, writer->fp) < 0) {
+            return -1;
+        }
+        if (fwrite_checked_allow_threads(writer->strings[s], str_len, writer->fp) < 0) {
+            return -1;
+        }
+    }
+
+    file_offset_t frame_table_offset = FTELL64(writer->fp);
+    if (frame_table_offset < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        return -1;
+    }
+
+    /* Frame table: three varints per frame. */
+    for (size_t f = 0; f < writer->frame_count; f++) {
+        FrameEntry *frame = &writer->frame_entries[f];
+        uint8_t buf[30];
+        size_t used = 0;
+
+        used += encode_varint_u32(buf + used, frame->filename_idx);
+        used += encode_varint_u32(buf + used, frame->funcname_idx);
+        used += encode_varint_i32(buf + used, frame->lineno);
+        if (fwrite_checked_allow_threads(buf, used, writer->fp) < 0) {
+            return -1;
+        }
+    }
+
+    /* Footer: string_count(4) + frame_count(4) + file_size(8) + checksum(16) */
+    file_offset_t footer_offset = FTELL64(writer->fp);
+    if (footer_offset < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        return -1;
+    }
+    uint8_t footer[32] = {0};
+    /* The recorded size includes the footer that is about to be written. */
+    uint64_t file_size = (uint64_t)footer_offset + sizeof(footer);
+    memcpy(footer + 0, &writer->string_count, 4);
+    memcpy(footer + 4, &writer->frame_count, 4);
+    memcpy(footer + 8, &file_size, 8);
+    /* bytes 16-31: checksum placeholder (zeros) */
+    if (fwrite_checked_allow_threads(footer, sizeof(footer), writer->fp) < 0) {
+        return -1;
+    }
+
+    /* Go back to the start of the file to write the header. */
+    if (FSEEK64(writer->fp, 0, SEEK_SET) < 0) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        return -1;
+    }
+
+    /* Convert file offsets to uint64_t for portable header format */
+    uint64_t strings_at = (uint64_t)string_table_offset;
+    uint64_t frames_at = (uint64_t)frame_table_offset;
+    uint32_t magic = BINARY_FORMAT_MAGIC;
+    uint32_t version = BINARY_FORMAT_VERSION;
+
+    uint8_t header[52] = {0};
+    memcpy(header + 0, &magic, 4);
+    memcpy(header + 4, &version, 4);
+    memcpy(header + 8, &writer->start_time_us, 8);
+    memcpy(header + 16, &writer->sample_interval_us, 8);
+    memcpy(header + 24, &writer->total_samples, 4);
+    memcpy(header + 28, &writer->thread_count, 4);
+    memcpy(header + 32, &strings_at, 8);
+    memcpy(header + 40, &frames_at, 8);
+    memcpy(header + 48, &writer->compression_type, 4);
+    if (fwrite_checked_allow_threads(header, sizeof(header), writer->fp) < 0) {
+        return -1;
+    }
+
+    /* The handle is dropped whether or not fclose() succeeds, so that it is
+     * never closed a second time. */
+    int close_failed = (fclose(writer->fp) != 0);
+    writer->fp = NULL;
+    if (close_failed) {
+        PyErr_SetFromErrno(PyExc_IOError);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Release a writer and everything it owns.  A NULL writer is ignored.
+ *
+ * A file that is still open is closed as it is: no tables, footer or header
+ * are written here, and the fclose() result is ignored. */
+void
+binary_writer_destroy(BinaryWriter *writer)
+{
+    if (writer == NULL) {
+        return;
+    }
+
+    if (writer->fp != NULL) {
+        fclose(writer->fp);
+    }
+
+    PyMem_Free(writer->filename);
+    PyMem_Free(writer->write_buffer);
+
+#ifdef HAVE_ZSTD
+    if (writer->zstd.cctx != NULL) {
+        ZSTD_freeCCtx(writer->zstd.cctx);
+    }
+    PyMem_Free(writer->zstd.compressed_buffer);
+#endif
+
+    /* String table: the hashtable, then each stored string, then the arrays. */
+    if (writer->string_hash != NULL) {
+        _Py_hashtable_destroy(writer->string_hash);
+    }
+    if (writer->strings != NULL) {
+        size_t remaining = writer->string_count;
+        size_t idx = 0;
+        while (idx < remaining) {
+            PyMem_Free(writer->strings[idx]);
+            idx++;
+        }
+        PyMem_Free(writer->strings);
+    }
+    PyMem_Free(writer->string_lengths);
+
+    /* Frame table. */
+    if (writer->frame_hash != NULL) {
+        _Py_hashtable_destroy(writer->frame_hash);
+    }
+    PyMem_Free(writer->frame_entries);
+
+    /* Per-thread state: each entry owns two separately allocated buffers. */
+    if (writer->thread_entries != NULL) {
+        size_t idx = 0;
+        while (idx < writer->thread_count) {
+            ThreadEntry *thread = &writer->thread_entries[idx];
+            PyMem_Free(thread->prev_stack);
+            PyMem_Free(thread->pending_rle);
+            idx++;
+        }
+        PyMem_Free(writer->thread_entries);
+    }
+
+    PyMem_Free(writer);
+}
+
diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h
index 5cbf64517af608..263dfd685657da 100644
--- a/Modules/_remote_debugging/clinic/module.c.h
+++ b/Modules/_remote_debugging/clinic/module.c.h
@@ -7,6 +7,7 @@ preserve
 #  include "pycore_runtime.h"     // _Py_ID()
 #endif
 #include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION()
+#include "pycore_long.h"          // _PyLong_UnsignedLongLong_Converter()
 #include "pycore_modsupport.h"    // _PyArg_UnpackKeywords()
 
 PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
@@ -434,6 +435,659 @@ _remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(i
     return return_value;
 }
 
+PyDoc_STRVAR(_remote_debugging_BinaryWriter___init____doc__,
+"BinaryWriter(filename, sample_interval_us, start_time_us, *,\n"
+"             compression=0)\n"
+"--\n"
+"\n"
+"High-performance binary writer for profiling data.\n"
+"\n"
+"Arguments:\n"
+"    filename: Path to output file\n"
+"    sample_interval_us: Sampling interval in microseconds\n"
+"    start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)\n"
+"    compression: 0=none, 1=zstd (default: 0)\n"
+"\n"
+"Use as a context manager or call finalize() when done.");
+
+static int
+_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self,
+                                             const char *filename,
+                                             unsigned long long sample_interval_us,
+                                             unsigned long long start_time_us,
+                                             int compression);
+
+static int
+_remote_debugging_BinaryWriter___init__(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+    int return_value = -1;  // stays -1 (failure) unless the _impl call below is reached
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 4
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(filename), &_Py_ID(sample_interval_us), &_Py_ID(start_time_us), &_Py_ID(compression), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"filename", "sample_interval_us", "start_time_us", "compression", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "BinaryWriter",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[4];  // one slot per entry in _keywords
+    PyObject * const *fastargs;
+    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+    Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 3;  // arguments supplied beyond the 3 required ones
+    const char *filename;
+    unsigned long long sample_interval_us;
+    unsigned long long start_time_us;
+    int compression = 0;  // value used when the keyword is omitted
+
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
+            /*minpos*/ 3, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!fastargs) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(fastargs[0])) {  // filename must be a str; other types are rejected here
+        _PyArg_BadArgument("BinaryWriter", "argument 'filename'", "str", fastargs[0]);
+        goto exit;
+    }
+    Py_ssize_t filename_length;
+    filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length);
+    if (filename == NULL) {
+        goto exit;
+    }
+    if (strlen(filename) != (size_t)filename_length) {  // a shorter C length means the str holds an embedded NUL
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+    if (!_PyLong_UnsignedLongLong_Converter(fastargs[1], &sample_interval_us)) {
+        goto exit;
+    }
+    if (!_PyLong_UnsignedLongLong_Converter(fastargs[2], &start_time_us)) {
+        goto exit;
+    }
+    if (!noptargs) {  // no optional argument given: keep the default compression
+        goto skip_optional_kwonly;
+    }
+    compression = PyLong_AsInt(fastargs[3]);
+    if (compression == -1 && PyErr_Occurred()) {  // -1 is only an error when an exception is also set
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = _remote_debugging_BinaryWriter___init___impl((BinaryWriterObject *)self, filename, sample_interval_us, start_time_us, compression);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter_write_sample__doc__,
+"write_sample($self, /, stack_frames, timestamp_us)\n"
+"--\n"
+"\n"
+"Write a sample to the binary file.\n"
+"\n"
+"Arguments:\n"
+"    stack_frames: List of InterpreterInfo objects\n"
+"    timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF    \
+    {"write_sample", _PyCFunction_CAST(_remote_debugging_BinaryWriter_write_sample), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter_write_sample__doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self,
+                                                 PyObject *stack_frames,
+                                                 unsigned long long timestamp_us);
+
+static PyObject *
+_remote_debugging_BinaryWriter_write_sample(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;  // stays NULL (error) unless the _impl call below is reached
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 2
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(stack_frames), &_Py_ID(timestamp_us), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"stack_frames", "timestamp_us", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "write_sample",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];  // one slot per entry in _keywords
+    PyObject *stack_frames;
+    unsigned long long timestamp_us;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 2, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    stack_frames = args[0];  // handed to the _impl unchanged; no type check happens in this wrapper
+    if (!_PyLong_UnsignedLongLong_Converter(args[1], &timestamp_us)) {
+        goto exit;
+    }
+    return_value = _remote_debugging_BinaryWriter_write_sample_impl((BinaryWriterObject *)self, stack_frames, timestamp_us);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter_finalize__doc__,
+"finalize($self, /)\n"
+"--\n"
+"\n"
+"Finalize and close the binary file.\n"
+"\n"
+"Writes string/frame tables, footer, and updates header.");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF    \
+    {"finalize", (PyCFunction)_remote_debugging_BinaryWriter_finalize, METH_NOARGS, _remote_debugging_BinaryWriter_finalize__doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self);
+
+static PyObject *
+_remote_debugging_BinaryWriter_finalize(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryWriter_finalize_impl((BinaryWriterObject *)self);  // METH_NOARGS: nothing to parse, only cast self and delegate
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter_close__doc__,
+"close($self, /)\n"
+"--\n"
+"\n"
+"Close the writer without finalizing (discards data).");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF    \
+    {"close", (PyCFunction)_remote_debugging_BinaryWriter_close, METH_NOARGS, _remote_debugging_BinaryWriter_close__doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self);
+
+static PyObject *
+_remote_debugging_BinaryWriter_close(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryWriter_close_impl((BinaryWriterObject *)self);  // METH_NOARGS: nothing to parse, only cast self and delegate
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter___enter____doc__,
+"__enter__($self, /)\n"
+"--\n"
+"\n"
+"Enter context manager.");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF    \
+    {"__enter__", (PyCFunction)_remote_debugging_BinaryWriter___enter__, METH_NOARGS, _remote_debugging_BinaryWriter___enter____doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self);
+
+static PyObject *
+_remote_debugging_BinaryWriter___enter__(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryWriter___enter___impl((BinaryWriterObject *)self);  // METH_NOARGS: nothing to parse, only cast self and delegate
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter___exit____doc__,
+"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n"
+"--\n"
+"\n"
+"Exit context manager, finalizing the file.");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF    \
+    {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryWriter___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter___exit____doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self,
+                                             PyObject *exc_type,
+                                             PyObject *exc_val,
+                                             PyObject *exc_tb);
+
+static PyObject *
+_remote_debugging_BinaryWriter___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;  // stays NULL (error) unless the _impl call below is reached
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 3
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "__exit__",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[3];  // one slot per entry in _keywords
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;  // all three parameters are optional, so every supplied argument counts
+    PyObject *exc_type = Py_None;  // the three defaults mirror the =None in the signature string
+    PyObject *exc_val = Py_None;
+    PyObject *exc_tb = Py_None;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!noptargs) {  // called with no arguments at all: keep every default
+        goto skip_optional_pos;
+    }
+    if (args[0]) {  // presumably NULL when this argument was not supplied -- confirm against _PyArg_UnpackKeywords
+        exc_type = args[0];
+        if (!--noptargs) {  // that was the last supplied argument
+            goto skip_optional_pos;
+        }
+    }
+    if (args[1]) {
+        exc_val = args[1];
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    exc_tb = args[2];  // reached only while noptargs is still non-zero
+skip_optional_pos:
+    return_value = _remote_debugging_BinaryWriter___exit___impl((BinaryWriterObject *)self, exc_type, exc_val, exc_tb);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryWriter_get_stats__doc__,
+"get_stats($self, /)\n"
+"--\n"
+"\n"
+"Get encoding statistics for the writer.\n"
+"\n"
+"Returns a dict with encoding statistics including repeat/full/suffix/pop-push\n"
+"record counts, frames written/saved, and compression ratio.");
+
+#define _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF    \
+    {"get_stats", (PyCFunction)_remote_debugging_BinaryWriter_get_stats, METH_NOARGS, _remote_debugging_BinaryWriter_get_stats__doc__},
+
+static PyObject *
+_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self);
+
+static PyObject *
+_remote_debugging_BinaryWriter_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryWriter_get_stats_impl((BinaryWriterObject *)self);  // METH_NOARGS: nothing to parse, only cast self and delegate
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader___init____doc__,
+"BinaryReader(filename)\n"
+"--\n"
+"\n"
+"High-performance binary reader for profiling data.\n"
+"\n"
+"Arguments:\n"
+"    filename: Path to input file\n"
+"\n"
+"Use as a context manager or call close() when done.");
+
+static int
+_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self,
+                                             const char *filename);
+
+static int
+_remote_debugging_BinaryReader___init__(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+    int return_value = -1;  // stays -1 (failure) unless the _impl call below is reached
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(filename), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"filename", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "BinaryReader",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];  // one slot per entry in _keywords
+    PyObject * const *fastargs;
+    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+    const char *filename;
+
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!fastargs) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(fastargs[0])) {  // filename must be a str; other types are rejected here
+        _PyArg_BadArgument("BinaryReader", "argument 'filename'", "str", fastargs[0]);
+        goto exit;
+    }
+    Py_ssize_t filename_length;
+    filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length);
+    if (filename == NULL) {
+        goto exit;
+    }
+    if (strlen(filename) != (size_t)filename_length) {  // a shorter C length means the str holds an embedded NUL
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+    return_value = _remote_debugging_BinaryReader___init___impl((BinaryReaderObject *)self, filename);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader_replay__doc__,
+"replay($self, /, collector, progress_callback=None)\n"
+"--\n"
+"\n"
+"Replay samples through a collector.\n"
+"\n"
+"Arguments:\n"
+"    collector: Collector object with collect() method\n"
+"    progress_callback: Optional callable(current, total)\n"
+"\n"
+"Returns:\n"
+"    Number of samples replayed");
+
+#define _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF    \
+    {"replay", _PyCFunction_CAST(_remote_debugging_BinaryReader_replay), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader_replay__doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self,
+                                           PyObject *collector,
+                                           PyObject *progress_callback);
+
+static PyObject *
+_remote_debugging_BinaryReader_replay(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;  // stays NULL (error) unless the _impl call below is reached
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 2
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(collector), &_Py_ID(progress_callback), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"collector", "progress_callback", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "replay",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];  // one slot per entry in _keywords
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;  // arguments supplied beyond the required collector
+    PyObject *collector;
+    PyObject *progress_callback = Py_None;  // default mirrors the =None in the signature string
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    collector = args[0];  // handed to the _impl unchanged; no type check happens in this wrapper
+    if (!noptargs) {  // progress_callback omitted: keep the None default
+        goto skip_optional_pos;
+    }
+    progress_callback = args[1];
+skip_optional_pos:
+    return_value = _remote_debugging_BinaryReader_replay_impl((BinaryReaderObject *)self, collector, progress_callback);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader_get_info__doc__,
+"get_info($self, /)\n"
+"--\n"
+"\n"
+"Get metadata about the binary file.\n"
+"\n"
+"Returns:\n"
+"    Dict with file metadata");
+
+#define _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF    \
+    {"get_info", (PyCFunction)_remote_debugging_BinaryReader_get_info, METH_NOARGS, _remote_debugging_BinaryReader_get_info__doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self);
+
+static PyObject *
+_remote_debugging_BinaryReader_get_info(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryReader_get_info_impl((BinaryReaderObject *)self);
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader_get_stats__doc__,
+"get_stats($self, /)\n"
+"--\n"
+"\n"
+"Get reconstruction statistics from replay.\n"
+"\n"
+"Returns a dict with statistics about record types decoded and samples\n"
+"reconstructed during replay.");
+
+#define _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF    \
+    {"get_stats", (PyCFunction)_remote_debugging_BinaryReader_get_stats, METH_NOARGS, _remote_debugging_BinaryReader_get_stats__doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self);
+
+static PyObject *
+_remote_debugging_BinaryReader_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryReader_get_stats_impl((BinaryReaderObject *)self);
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader_close__doc__,
+"close($self, /)\n"
+"--\n"
+"\n"
+"Close the reader and free resources.");
+
+#define _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF    \
+    {"close", (PyCFunction)_remote_debugging_BinaryReader_close, METH_NOARGS, _remote_debugging_BinaryReader_close__doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self);
+
+static PyObject *
+_remote_debugging_BinaryReader_close(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryReader_close_impl((BinaryReaderObject *)self);
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader___enter____doc__,
+"__enter__($self, /)\n"
+"--\n"
+"\n"
+"Enter context manager.");
+
+#define _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF    \
+    {"__enter__", (PyCFunction)_remote_debugging_BinaryReader___enter__, METH_NOARGS, _remote_debugging_BinaryReader___enter____doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self);
+
+static PyObject *
+_remote_debugging_BinaryReader___enter__(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_BinaryReader___enter___impl((BinaryReaderObject *)self);
+}
+
+PyDoc_STRVAR(_remote_debugging_BinaryReader___exit____doc__,
+"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n"
+"--\n"
+"\n"
+"Exit context manager, closing the file.");
+
+#define _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF    \
+    {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryReader___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader___exit____doc__},
+
+static PyObject *
+_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self,
+                                             PyObject *exc_type,
+                                             PyObject *exc_val,
+                                             PyObject *exc_tb);
+
+static PyObject *
+_remote_debugging_BinaryReader___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 3
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "__exit__",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[3];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
+    PyObject *exc_type = Py_None;
+    PyObject *exc_val = Py_None;
+    PyObject *exc_tb = Py_None;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!noptargs) {
+        goto skip_optional_pos;
+    }
+    if (args[0]) {
+        exc_type = args[0];
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    if (args[1]) {
+        exc_val = args[1];
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    exc_tb = args[2];
+skip_optional_pos:
+    return_value = _remote_debugging_BinaryReader___exit___impl((BinaryReaderObject *)self, exc_type, exc_val, exc_tb);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_remote_debugging_zstd_available__doc__,
+"zstd_available($module, /)\n"
+"--\n"
+"\n"
+"Check if zstd compression is available.\n"
+"\n"
+"Returns:\n"
+"    True if zstd available, False otherwise");
+
+#define _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF    \
+    {"zstd_available", (PyCFunction)_remote_debugging_zstd_available, METH_NOARGS, _remote_debugging_zstd_available__doc__},
+
+static PyObject *
+_remote_debugging_zstd_available_impl(PyObject *module);
+
+static PyObject *
+_remote_debugging_zstd_available(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return _remote_debugging_zstd_available_impl(module);
+}
+
 PyDoc_STRVAR(_remote_debugging_get_child_pids__doc__,
 "get_child_pids($module, /, pid, *, recursive=True)\n"
 "--\n"
@@ -582,4 +1236,4 @@ _remote_debugging_is_python_process(PyObject *module, PyObject *const *args, Py_
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=dc0550ad3d6a409c input=a9049054013a1b77]*/
+/*[clinic end generated code: output=036de0b06d0e34cc input=a9049054013a1b77]*/
diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c
index fc58e2428b2009..c27b0471c0d20f 100644
--- a/Modules/_remote_debugging/module.c
+++ b/Modules/_remote_debugging/module.c
@@ -6,6 +6,20 @@
  ******************************************************************************/
 
 #include "_remote_debugging.h"
+#include "binary_io.h"
+
+/* Object structs: defined before clinic/module.c.h, whose generated code casts to them */
+typedef struct {
+    PyObject_HEAD
+    BinaryWriter *writer;  /* NULL once finalize()/close()/__exit__() released it */
+    uint32_t cached_total_samples;  /* Preserved after finalize */
+} BinaryWriterObject;
+
+typedef struct {
+    PyObject_HEAD
+    BinaryReader *reader;  /* NULL once close()/__exit__() released it */
+} BinaryReaderObject;
+
 #include "clinic/module.c.h"
 
 /* ============================================================================
@@ -970,6 +984,10 @@ static PyType_Spec RemoteUnwinder_spec = {
     .slots = RemoteUnwinder_slots,
 };
 
+/* Forward declarations for type specs defined later */
+static PyType_Spec BinaryWriter_spec;
+static PyType_Spec BinaryReader_spec;
+
 /* ============================================================================
  * MODULE INITIALIZATION
  * ============================================================================ */
@@ -1048,6 +1066,18 @@ _remote_debugging_exec(PyObject *m)
     if (PyModule_AddType(m, st->AwaitedInfo_Type) < 0) {
         return -1;
     }
+
+    // Create BinaryWriter and BinaryReader types
+    CREATE_TYPE(m, st->BinaryWriter_Type, &BinaryWriter_spec);
+    if (PyModule_AddType(m, st->BinaryWriter_Type) < 0) {
+        return -1;
+    }
+
+    CREATE_TYPE(m, st->BinaryReader_Type, &BinaryReader_spec);
+    if (PyModule_AddType(m, st->BinaryReader_Type) < 0) {
+        return -1;
+    }
+
 #ifdef Py_GIL_DISABLED
     PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
 #endif
@@ -1091,6 +1121,8 @@ remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg)
     Py_VISIT(state->ThreadInfo_Type);
     Py_VISIT(state->InterpreterInfo_Type);
     Py_VISIT(state->AwaitedInfo_Type);
+    Py_VISIT(state->BinaryWriter_Type);
+    Py_VISIT(state->BinaryReader_Type);
     return 0;
 }
 
@@ -1106,6 +1138,8 @@ remote_debugging_clear(PyObject *mod)
     Py_CLEAR(state->ThreadInfo_Type);
     Py_CLEAR(state->InterpreterInfo_Type);
     Py_CLEAR(state->AwaitedInfo_Type);
+    Py_CLEAR(state->BinaryWriter_Type);
+    Py_CLEAR(state->BinaryReader_Type);
     return 0;
 }
 
@@ -1115,13 +1149,504 @@ remote_debugging_free(void *mod)
     (void)remote_debugging_clear((PyObject *)mod);
 }
 
-static PyModuleDef_Slot remote_debugging_slots[] = {
-    {Py_mod_exec, _remote_debugging_exec},
-    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
-    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
-    {0, NULL},
+/* ============================================================================
+ * BINARY WRITER CLASS
+ * ============================================================================ */
+
+#define BinaryWriter_CAST(op) ((BinaryWriterObject *)(op))
+
+/*[clinic input]
+class _remote_debugging.BinaryWriter "BinaryWriterObject *" "&PyBinaryWriter_Type"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e948838b90a2003c]*/
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.__init__
+    filename: str
+    sample_interval_us: unsigned_long_long
+    start_time_us: unsigned_long_long
+    *
+    compression: int = 0
+
+High-performance binary writer for profiling data.
+
+Arguments:
+    filename: Path to output file
+    sample_interval_us: Sampling interval in microseconds
+    start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)
+    compression: 0=none, 1=zstd (default: 0)
+
+Use as a context manager or call finalize() when done.
+[clinic start generated code]*/
+
+static int
+_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self,
+                                             const char *filename,
+                                             unsigned long long sample_interval_us,
+                                             unsigned long long start_time_us,
+                                             int compression)
+/*[clinic end generated code: output=014c0306f1bacf4b input=57497fe3cb9214a6]*/
+{
+    /* If __init__ runs again on a live object, release the previous writer first. */
+    if (self->writer != NULL) {
+        binary_writer_destroy(self->writer);
+    }
+    BinaryWriter *created = binary_writer_create(
+        filename, sample_interval_us, compression, start_time_us);
+    self->writer = created;
+
+    /* tp_init convention: 0 on success, -1 on failure. */
+    return (created != NULL) ? 0 : -1;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.write_sample
+    stack_frames: object
+    timestamp_us: unsigned_long_long
+
+Write a sample to the binary file.
+
+Arguments:
+    stack_frames: List of InterpreterInfo objects
+    timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self,
+                                                 PyObject *stack_frames,
+                                                 unsigned long long timestamp_us)
+/*[clinic end generated code: output=24d5b86679b4128f input=dce3148417482624]*/
+{
+    BinaryWriter *writer = self->writer;
+    if (writer == NULL) {  /* released by finalize()/close()/__exit__() */
+        PyErr_SetString(PyExc_ValueError, "Writer is closed");
+        return NULL;
+    }
+    int status = binary_writer_write_sample(writer, stack_frames, timestamp_us);
+    if (status < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.finalize
+
+Finalize and close the binary file.
+
+Writes string/frame tables, footer, and updates header.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self)
+/*[clinic end generated code: output=3534b88c6628de88 input=c02191750682f6a2]*/
+{
+    BinaryWriter *writer = self->writer;
+    if (writer == NULL) {
+        PyErr_SetString(PyExc_ValueError, "Writer is already closed");
+        return NULL;
+    }
+
+    /* Snapshot the counter so total_samples stays readable once the writer is destroyed. */
+    self->cached_total_samples = writer->total_samples;
+
+    int status = binary_writer_finalize(writer);
+    if (status < 0) {
+        return NULL;
+    }
+    binary_writer_destroy(writer);
+    self->writer = NULL;
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.close
+
+Close the writer without finalizing (discards data).
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self)
+/*[clinic end generated code: output=9571bb2256fd1fd2 input=6e0da206e60daf16]*/
+{
+    if (self->writer) {  /* already finalized or closed: nothing to do */
+        binary_writer_destroy(self->writer);  /* binary_writer_finalize() is not called on this path */
+        self->writer = NULL;  /* later write_sample()/finalize() raise ValueError */
+    }
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.__enter__
+
+Enter context manager.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self)
+/*[clinic end generated code: output=8eb95f61daf2d120 input=8ef14ee18da561d2]*/
+{
+    /* The with-statement target is the writer itself, returned as a new strong reference. */
+    return Py_NewRef((PyObject *)self);
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.__exit__
+    exc_type: object = None
+    exc_val: object = None
+    exc_tb: object = None
+
+Exit context manager, finalizing the file.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self,
+                                             PyObject *exc_type,
+                                             PyObject *exc_val,
+                                             PyObject *exc_tb)
+/*[clinic end generated code: output=61831f47c72a53c6 input=12334ce1009af37f]*/
+{
+    if (self->writer) {  /* exc_type/exc_val/exc_tb are not inspected */
+        /* Snapshot as finalize() does, so total_samples survives the with-block. */
+        self->cached_total_samples = self->writer->total_samples;
+        int status = binary_writer_finalize(self->writer);
+        binary_writer_destroy(self->writer);  /* released even if finalizing failed */
+        self->writer = NULL;
+        if (status < 0) {
+            return NULL;
+        }
+    }
+    Py_RETURN_FALSE;  /* never suppress an exception raised inside the with-block */
+}
+
+/*[clinic input]
+_remote_debugging.BinaryWriter.get_stats
+
+Get encoding statistics for the writer.
+
+Returns a dict with encoding statistics including repeat/full/suffix/pop-push
+record counts, frames written/saved, and compression ratio.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self)
+/*[clinic end generated code: output=06522cd52544df89 input=82968491b53ad277]*/
+{
+    if (self->writer != NULL) {
+        return binary_writer_get_stats(self->writer);
+    }
+    PyErr_SetString(PyExc_ValueError, "Writer is closed");  /* no writer left to query */
+    return NULL;
+}
+
+static PyObject *
+BinaryWriter_get_total_samples(BinaryWriterObject *self, void *closure)
+{
+    unsigned long count = self->cached_total_samples;  /* snapshot used once the writer is gone */
+    if (self->writer) {
+        count = self->writer->total_samples;  /* live writer: report its running counter */
+    }
+    return PyLong_FromUnsignedLong(count);
+}
+
+static PyGetSetDef BinaryWriter_getset[] = {
+    {"total_samples", (getter)BinaryWriter_get_total_samples, NULL, "Total samples written", NULL},
+    {NULL}
+};
+
+static PyMethodDef BinaryWriter_methods[] = {
+    _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF
+    _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF
+    _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF
+    _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF
+    _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF
+    _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF
+    {NULL, NULL, 0, NULL}
+};
+
+static void
+BinaryWriter_dealloc(PyObject *op)
+{
+    BinaryWriterObject *writer_obj = BinaryWriter_CAST(op);
+    PyTypeObject *type = Py_TYPE(op);  /* fetched before tp_free() releases the object */
+    if (writer_obj->writer != NULL) {
+        binary_writer_destroy(writer_obj->writer);  /* never finalized or closed explicitly */
+    }
+    type->tp_free(op);
+    Py_DECREF(type);  /* the type is created from a PyType_Spec; drop the instance's reference to it */
+}
+
+static PyType_Slot BinaryWriter_slots[] = {
+    {Py_tp_getset, BinaryWriter_getset},
+    {Py_tp_methods, BinaryWriter_methods},
+    {Py_tp_init, _remote_debugging_BinaryWriter___init__},
+    {Py_tp_dealloc, BinaryWriter_dealloc},
+    {0, NULL}
+};
+
+static PyType_Spec BinaryWriter_spec = {
+    .name = "_remote_debugging.BinaryWriter",
+    .basicsize = sizeof(BinaryWriterObject),
+    .flags = (
+        Py_TPFLAGS_DEFAULT
+        | Py_TPFLAGS_IMMUTABLETYPE
+    ),
+    .slots = BinaryWriter_slots,
 };
 
+/* ============================================================================
+ * BINARY READER CLASS
+ * ============================================================================ */
+
+#define BinaryReader_CAST(op) ((BinaryReaderObject *)(op))
+
+/*[clinic input]
+class _remote_debugging.BinaryReader "BinaryReaderObject *" "&PyBinaryReader_Type"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=36400aaf6f53216d]*/
+
+/*[clinic input]
+_remote_debugging.BinaryReader.__init__
+    filename: str
+
+High-performance binary reader for profiling data.
+
+Arguments:
+    filename: Path to input file
+
+Use as a context manager or call close() when done.
+[clinic start generated code]*/
+
+static int
+_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self,
+                                             const char *filename)
+/*[clinic end generated code: output=9699226f7ae052bb input=4201f9cc500ef2f6]*/
+{
+    /* If __init__ runs again on an open reader, close the previous one first. */
+    if (self->reader != NULL) {
+        binary_reader_close(self->reader);
+    }
+
+    BinaryReader *opened = binary_reader_open(filename);
+    self->reader = opened;
+
+    /* tp_init convention: 0 on success, -1 on failure. */
+    return (opened != NULL) ? 0 : -1;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.replay
+    collector: object
+    progress_callback: object = None
+
+Replay samples through a collector.
+
+Arguments:
+    collector: Collector object with collect() method
+    progress_callback: Optional callable(current, total)
+
+Returns:
+    Number of samples replayed
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self,
+                                           PyObject *collector,
+                                           PyObject *progress_callback)
+/*[clinic end generated code: output=442345562574b61c input=ebb687aed3e0f4f1]*/
+{
+    BinaryReader *reader = self->reader;
+    if (reader == NULL) {  /* released by close()/__exit__() */
+        PyErr_SetString(PyExc_ValueError, "Reader is closed");
+        return NULL;
+    }
+    /* A negative count is treated as failure and propagated as NULL. */
+    Py_ssize_t count = binary_reader_replay(reader, collector, progress_callback);
+    if (count < 0) {
+        return NULL;
+    }
+    return PyLong_FromSsize_t(count);
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.get_info
+
+Get metadata about the binary file.
+
+Returns:
+    Dict with file metadata
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self)
+/*[clinic end generated code: output=7f641fbd39147391 input=02e75e39c8a6cd1f]*/
+{
+    if (self->reader != NULL) {
+        return binary_reader_get_info(self->reader);
+    }
+    /* close()/__exit__() released the reader, so there is nothing to query. */
+    PyErr_SetString(PyExc_ValueError, "Reader is closed");
+    return NULL;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.get_stats
+
+Get reconstruction statistics from replay.
+
+Returns a dict with statistics about record types decoded and samples
+reconstructed during replay.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self)
+/*[clinic end generated code: output=628b9ab5e4c4fd36 input=d8dd6654abd6c3c0]*/
+{
+    if (self->reader != NULL) {
+        return binary_reader_get_stats(self->reader);
+    }
+    PyErr_SetString(PyExc_ValueError, "Reader is closed");  /* no reader left to query */
+    return NULL;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.close
+
+Close the reader and free resources.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self)
+/*[clinic end generated code: output=ad0238cf5240b4f8 input=b919a66c737712d5]*/
+{
+    if (self->reader) {  /* already closed: calling close() again is a no-op */
+        binary_reader_close(self->reader);
+        self->reader = NULL;  /* later replay()/get_info()/get_stats() raise ValueError */
+    }
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.__enter__
+
+Enter context manager.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self)
+/*[clinic end generated code: output=fade133538e93817 input=4794844c9efdc4f6]*/
+{
+    /* The with-statement target is the reader itself, returned as a new strong reference. */
+    return Py_NewRef((PyObject *)self);
+}
+
+/*[clinic input]
+_remote_debugging.BinaryReader.__exit__
+    exc_type: object = None
+    exc_val: object = None
+    exc_tb: object = None
+
+Exit context manager, closing the file.
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self,
+                                             PyObject *exc_type,
+                                             PyObject *exc_val,
+                                             PyObject *exc_tb)
+/*[clinic end generated code: output=2acdd36cfdc14e4a input=87284243d7935835]*/
+{
+    if (self->reader) {  /* already closed: nothing to do; exc_* are not inspected */
+        binary_reader_close(self->reader);
+        self->reader = NULL;  /* later method calls raise "Reader is closed" */
+    }
+    Py_RETURN_FALSE;  /* never suppress an exception raised inside the with-block */
+}
+
+static PyObject *
+BinaryReader_get_sample_count(BinaryReaderObject *self, void *closure)
+{
+    if (self->reader != NULL) {
+        return PyLong_FromUnsignedLong(self->reader->sample_count);
+    }
+    return PyLong_FromLong(0);  /* a closed reader reports zero samples */
+}
+
+static PyObject *
+BinaryReader_get_sample_interval_us(BinaryReaderObject *self, void *closure)
+{
+    if (self->reader != NULL) {
+        return PyLong_FromUnsignedLongLong(self->reader->sample_interval_us);
+    }
+    return PyLong_FromLong(0);  /* a closed reader reports an interval of zero */
+}
+
+static PyGetSetDef BinaryReader_getset[] = {
+    {"sample_count", (getter)BinaryReader_get_sample_count, NULL, "Number of samples in file", NULL},
+    {"sample_interval_us", (getter)BinaryReader_get_sample_interval_us, NULL, "Sample interval in microseconds", NULL},
+    {NULL}
+};
+
+static PyMethodDef BinaryReader_methods[] = {
+    _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF
+    _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF
+    _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF
+    _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF
+    _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF
+    _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF
+    {NULL, NULL, 0, NULL}
+};
+
+static void
+BinaryReader_dealloc(PyObject *op)
+{
+    BinaryReaderObject *reader_obj = BinaryReader_CAST(op);
+    PyTypeObject *type = Py_TYPE(op);  /* fetched before tp_free() releases the object */
+    if (reader_obj->reader != NULL) {
+        binary_reader_close(reader_obj->reader);  /* never closed explicitly */
+    }
+    type->tp_free(op);
+    Py_DECREF(type);  /* the type is created from a PyType_Spec; drop the instance's reference to it */
+}
+
+static PyType_Slot BinaryReader_slots[] = {
+    {Py_tp_getset, BinaryReader_getset},
+    {Py_tp_methods, BinaryReader_methods},
+    {Py_tp_init, _remote_debugging_BinaryReader___init__},
+    {Py_tp_dealloc, BinaryReader_dealloc},
+    {0, NULL}
+};
+
+static PyType_Spec BinaryReader_spec = {
+    .name = "_remote_debugging.BinaryReader",
+    .basicsize = sizeof(BinaryReaderObject),
+    .flags = (
+        Py_TPFLAGS_DEFAULT
+        | Py_TPFLAGS_IMMUTABLETYPE
+    ),
+    .slots = BinaryReader_slots,
+};
+
+/* ============================================================================
+ * MODULE METHODS
+ * ============================================================================ */
+
+/*[clinic input]
+_remote_debugging.zstd_available
+
+Check if zstd compression is available.
+
+Returns:
+    True if zstd available, False otherwise
+[clinic start generated code]*/
+
+static PyObject *
+_remote_debugging_zstd_available_impl(PyObject *module)
+/*[clinic end generated code: output=55e35a70ef280cdd input=a1b4d41bc09c7cf9]*/
+{
+    return PyBool_FromLong(binary_io_zstd_available());  /* nonzero -> True, zero -> False */
+}
+
 /* ============================================================================
  * MODULE-LEVEL FUNCTIONS
  * ============================================================================ */
@@ -1188,11 +1713,19 @@ _remote_debugging_is_python_process_impl(PyObject *module, int pid)
 }
 
 static PyMethodDef remote_debugging_methods[] = {
+    _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF
     _REMOTE_DEBUGGING_GET_CHILD_PIDS_METHODDEF
     _REMOTE_DEBUGGING_IS_PYTHON_PROCESS_METHODDEF
     {NULL, NULL, 0, NULL},
 };
 
+static PyModuleDef_Slot remote_debugging_slots[] = {
+    {Py_mod_exec, _remote_debugging_exec},
+    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
+    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
+    {0, NULL},
+};
+
 static struct PyModuleDef remote_debugging_module = {
     PyModuleDef_HEAD_INIT,
     .m_name = "_remote_debugging",
diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj
index 830b7b8744862c..0e86ce9f4c918c 100644
--- a/PCbuild/_remote_debugging.vcxproj
+++ b/PCbuild/_remote_debugging.vcxproj
@@ -105,10 +105,13 @@
     <ClCompile Include="..\Modules\_remote_debugging\frame_cache.c" />
     <ClCompile Include="..\Modules\_remote_debugging\threads.c" />
     <ClCompile Include="..\Modules\_remote_debugging\asyncio.c" />
+    <ClCompile Include="..\Modules\_remote_debugging\binary_io_writer.c" />
+    <ClCompile Include="..\Modules\_remote_debugging\binary_io_reader.c" />
     <ClCompile Include="..\Modules\_remote_debugging\subprocess.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\Modules\_remote_debugging\_remote_debugging.h" />
+    <ClInclude Include="..\Modules\_remote_debugging\binary_io.h" />
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\PC\python_nt.rc" />
diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters
index 793a3256c52d58..59d4d5c5c335fb 100644
--- a/PCbuild/_remote_debugging.vcxproj.filters
+++ b/PCbuild/_remote_debugging.vcxproj.filters
@@ -33,6 +33,12 @@
     <ClCompile Include="..\Modules\_remote_debugging\asyncio.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Modules\_remote_debugging\binary_io_writer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Modules\_remote_debugging\binary_io_reader.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\Modules\_remote_debugging\subprocess.c">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -41,6 +47,9 @@
     <ClInclude Include="..\Modules\_remote_debugging\_remote_debugging.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\Modules\_remote_debugging\binary_io.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\PC\python_nt.rc">
diff --git a/configure b/configure
index a1bc7991aa8dc2..b1faeaf806a9c6 100755
--- a/configure
+++ b/configure
@@ -858,6 +858,8 @@ HAVE_GETHOSTBYNAME_R_3_ARG
 HAVE_GETHOSTBYNAME_R_5_ARG
 HAVE_GETHOSTBYNAME_R_6_ARG
 LIBOBJS
+REMOTE_DEBUGGING_LIBS
+REMOTE_DEBUGGING_CFLAGS
 LIBZSTD_LIBS
 LIBZSTD_CFLAGS
 LIBLZMA_LIBS
@@ -23023,6 +23025,22 @@ printf "%s\n" "yes" >&6; }
         have_libzstd=yes
 fi
 
+if test "x$have_libzstd" = xyes
+then :
+
+  REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS"
+  REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS"
+
+else case e in #(
+  e)
+  REMOTE_DEBUGGING_CFLAGS=""
+  REMOTE_DEBUGGING_LIBS=""
+ ;;
+esac
+fi
+
+
+
 
 
 
@@ -31644,8 +31662,8 @@ fi
   if test "x$py_cv_module__remote_debugging" = xyes
 then :
 
-
-
+    as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_CFLAGS=$REMOTE_DEBUGGING_CFLAGS$as_nl"
+    as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_LDFLAGS=$REMOTE_DEBUGGING_LIBS$as_nl"
 
 fi
 
diff --git a/configure.ac b/configure.ac
index a284a118f0296f..043ec957f40894 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5529,6 +5529,18 @@ PKG_CHECK_MODULES([LIBZSTD], [libzstd >= 1.4.5], [have_libzstd=yes], [
   ])
 ])
 
+dnl _remote_debugging module: optional zstd compression support
+dnl The module always builds, but zstd compression is only available when libzstd is found
+AS_VAR_IF([have_libzstd], [yes], [
+  REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS"
+  REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS"
+], [
+  REMOTE_DEBUGGING_CFLAGS=""
+  REMOTE_DEBUGGING_LIBS=""
+])
+AC_SUBST([REMOTE_DEBUGGING_CFLAGS])
+AC_SUBST([REMOTE_DEBUGGING_LIBS])
+
 dnl PY_CHECK_NETDB_FUNC(FUNCTION)
 AC_DEFUN([PY_CHECK_NETDB_FUNC], [PY_CHECK_FUNC([$1], [@%:@include <netdb.h>])])
 
@@ -7911,7 +7923,7 @@ PY_STDLIB_MOD_SIMPLE([_pickle])
 PY_STDLIB_MOD_SIMPLE([_posixsubprocess])
 PY_STDLIB_MOD_SIMPLE([_queue])
 PY_STDLIB_MOD_SIMPLE([_random])
-PY_STDLIB_MOD_SIMPLE([_remote_debugging])
+PY_STDLIB_MOD_SIMPLE([_remote_debugging], [$REMOTE_DEBUGGING_CFLAGS], [$REMOTE_DEBUGGING_LIBS])
 PY_STDLIB_MOD_SIMPLE([select])
 PY_STDLIB_MOD_SIMPLE([_struct])
 PY_STDLIB_MOD_SIMPLE([_types])