diff --git a/tests/test_misc.py b/tests/test_misc.py index 0832c8ee..097effc1 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -19,6 +19,12 @@ def test_encoding_for_model(): assert enc.name == "o200k_base" enc = tiktoken.encoding_for_model("gpt-oss-120b") assert enc.name == "o200k_harmony" + # Regression for #464: a dotted minor like 'gpt-5.1' does not start + # with the 'gpt-5-' prefix, so it needs its own entry / prefix. + enc = tiktoken.encoding_for_model("gpt-5.1") + assert enc.name == "o200k_base" + enc = tiktoken.encoding_for_model("gpt-5.1-2025-11") + assert enc.name == "o200k_base" def test_optional_blobfile_dependency(): diff --git a/tiktoken/model.py b/tiktoken/model.py index 5c669af4..bfc4b248 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -9,6 +9,12 @@ "o3-": "o200k_base", "o4-mini-": "o200k_base", # chat + # NB: a dotted minor like 'gpt-5.1-2025-11' does not start with 'gpt-5-', so + # 'gpt-5.1-' needs its own prefix entry. Neither prefix is a prefix of the + # other, so a startswith() lookup gives the same result whichever is listed + # first; 'gpt-5.1-' precedes 'gpt-5-' only to keep the chat prefixes in + # newest-first order. + "gpt-5.1-": "o200k_base", "gpt-5-": "o200k_base", "gpt-4.5-": "o200k_base", "gpt-4.1-": "o200k_base", @@ -32,6 +38,7 @@ "o3": "o200k_base", "o4-mini": "o200k_base", # chat + "gpt-5.1": "o200k_base", "gpt-5": "o200k_base", "gpt-4.1": "o200k_base", "gpt-4o": "o200k_base",