From 0f5243ef9ccecb10cd3acd7324b54838265787d8 Mon Sep 17 00:00:00 2001 From: LeSingh1 Date: Mon, 18 May 2026 00:04:23 -0700 Subject: [PATCH] fix(py): report element count in buffer.shape, not byte length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PEP 3118 (buffer protocol) requires `prod(shape) * itemsize == nbytes`. The previous implementation pointed `view.shape` at the same storage as `view.len` (the byte length), so `memoryview(buf).shape` was reported as the number of bytes rather than the number of tokens, and `memoryview(buf).tolist()` returned 4x as many entries as expected (one per byte of each u32 token) on consumers that respect `shape`. NumPy is unaffected because `np.frombuffer` computes the count from `len / itemsize` directly, which is why this slipped through `Encoding.encode_to_numpy` — but any CPython consumer that goes through the buffer's `shape` (e.g. `memoryview(...).tolist()`, `array.array.frombuffer` on Python >=3.13) reads the wrong count. Fix: store the element count in a stable `Box` on the `TiktokenBuffer` itself, and point `view.shape` at it. The box's memory lives as long as the Python object (which is kept alive via `view.obj`), so the pointer remains valid for the buffer view's lifetime. Fixes #405. --- src/py.rs | 21 ++++++++++++++++++--- tests/test_misc.py | 16 ++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/py.rs b/src/py.rs index 60b62452..59afabb3 100644 --- a/src/py.rs +++ b/src/py.rs @@ -65,7 +65,7 @@ impl CoreBPE { Err(e) => return Err(PyErr::new::(e.message)), }; - let buffer = TiktokenBuffer { tokens }; + let buffer = TiktokenBuffer::new(tokens); buffer.into_py_any(py) } @@ -186,6 +186,17 @@ impl CoreBPE { #[pyclass(frozen)] struct TiktokenBuffer { tokens: Vec, + // Per the buffer protocol, `view.shape[0]` must be the element count + // (so that `prod(shape) * itemsize == len`). Box it so the pointer we + // hand out in `__getbuffer__` is stable for the lifetime of `self`. + shape: Box, +} + +impl TiktokenBuffer { + fn new(tokens: Vec) -> Self { + let shape = Box::new(tokens.len() as pyo3::ffi::Py_ssize_t); + Self { tokens, shape } + } } #[pymethods] @@ -208,7 +219,8 @@ impl TiktokenBuffer { let view_ref = &mut *view; view_ref.obj = slf.clone().into_any().into_ptr(); - let data = &slf.borrow().tokens; + let borrowed = slf.borrow(); + let data = &borrowed.tokens; view_ref.buf = data.as_ptr() as *mut std::os::raw::c_void; view_ref.len = (data.len() * std::mem::size_of::()) as isize; view_ref.readonly = 1; @@ -221,7 +233,10 @@ impl TiktokenBuffer { }; view_ref.ndim = 1; view_ref.shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND { - &mut view_ref.len + // Element count, not byte length, per PEP 3118. + // The Box lives as long as `self` (kept alive via `view_ref.obj`), + // so this pointer is valid for the buffer view's lifetime. + &*borrowed.shape as *const pyo3::ffi::Py_ssize_t as *mut pyo3::ffi::Py_ssize_t } else { std::ptr::null_mut() }; diff --git a/tests/test_misc.py b/tests/test_misc.py index 0832c8ee..45746d2d 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -28,3 +28,19 @@ def test_optional_blobfile_dependency(): assert "blobfile" not in sys.modules """ subprocess.check_call([sys.executable, "-c", prog]) + + +def test_tiktoken_buffer_shape_is_element_count(): + # Regression test for the buffer protocol's `shape` reporting bytes + # instead of element count, which caused `memoryview(...).tolist()` + # to expand each token into 4 byte-sized integers. See issue #405. + import math + + enc = tiktoken.get_encoding("gpt2") + expected = enc.encode("hello world") + buf = enc._core_bpe.encode_to_tiktoken_buffer("hello world", frozenset()) + + mv = memoryview(buf) + # Per PEP 3118: prod(shape) * itemsize == nbytes + assert math.prod(mv.shape) * mv.itemsize == mv.nbytes + assert mv.tolist() == expected