From 256090546a9a9c8ea8a973dc32690e1f8d28b650 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@thetypicalset.com>
Date: Thu, 10 Oct 2024 12:06:52 +0200
Subject: [PATCH 1/2] Gather data in a single file

We currently copy-paste the model names, regex cases and JSON cases between
benchmarks, which is error-prone. This commit gathers all of them in a single
data module, and the benchmarks import them from that module instead.
---
 src/benchmark_lfe.py           | 92 ++--------------------------------
 src/benchmark_outlines.py      | 92 ++--------------------------------
 src/benchmark_outlines_core.py | 92 ++--------------------------------
 3 files changed, 15 insertions(+), 261 deletions(-)

diff --git a/src/benchmark_lfe.py b/src/benchmark_lfe.py
index 752479b..e9b18af 100644
--- a/src/benchmark_lfe.py
+++ b/src/benchmark_lfe.py
@@ -5,33 +5,11 @@
 )
 from transformers import AutoTokenizer
 
-models = [
-    "NousResearch/Nous-Hermes-llama-2-7b",  # 32,000 tokens vocabulary
-    "gpt2",  # 50,257 tokens vocabulary
-    "NousResearch/Hermes-3-Llama-3.1-8B",  # 128,256 tokens vocabulary
-    "unsloth/gemma-2-2b-it-bnb-4bit",  # 256,128 tokens vocabulary
-]
-
-regex_case = [
-    (r"\d{3}-\d{2}-\d{4}", "203-22-1234"),
-    (
-        r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
-        "https://github.com/outlines-dev/outlines",
-    ),
-    (
-        r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.",
-        "A: Some thoughts before answering. The answer is 42.",
-    ),
-    (
-        "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)",
-        "AVeryLongStringtoTest1234",
-    ),
-    (r"\+[1-9]\d{1,14}", "1234567891234"),
-]
+from .data import json_cases, models, regex_cases
 
 
 class LMFormatEnforcerRegex:
-    params = [models, regex_case]
+    params = [models, regex_cases]
     param_names = ["model", "regex"]
     timeout = 600
 
@@ -48,7 +26,7 @@ def setup(self, model, _):
         self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
 
     def time_lfe(self, _, regex):
-        regex_string, regex_example = regex
+        regex_string, regex_example = regex["regex"], regex["example"]
         regex_example_tokens = self.tokenizer.encode(regex_example)
 
         parser = RegexParser(regex_string)
@@ -58,68 +36,8 @@ def time_lfe(self, _, regex):
             _ = token_enforcer.get_allowed_tokens(regex_example_tokens[: i + 1])
 
 
-json_case = [
-    (
-        {
-            "$defs": {
-                "Armor": {
-                    "enum": ["leather", "chainmail", "plate"],
-                    "title": "Armor",
-                    "type": "string",
-                }
-            },
-            "properties": {
-                "name": {"maxLength": 10, "title": "Name", "type": "string"},
-                "age": {"title": "Age", "type": "integer"},
-                "armor": {"$ref": "#/$defs/Armor"},
-                "strength": {"title": "Strength", "type": "integer"},
-            },
-            "required": ["name", "age", "armor", "strength"],
-            "title": "Character",
-            "type": "object",
-        },
-        """{'name': 'Super Warrior', 'age': 26,  'armor': 'leather', 'armor': 10}""",
-    ),
-    (
-        {
-            "$schema": "http://json-schema.org/draft-04/schema#",
-            "title": "Schema for a recording",
-            "type": "object",
-            "definitions": {
-                "artist": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "functions": {"type": "array", "items": {"type": "string"}},
-                    },
-                    "required": ["id", "name", "functions"],
-                }
-            },
-            "properties": {
-                "id": {"type": "number"},
-                "work": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "composer": {"$ref": "#/definitions/artist"},
-                    },
-                },
-                "recording_artists": {
-                    "type": "array",
-                    "items": {"$ref": "#/definitions/artist"},
-                },
-            },
-            "required": ["id", "work", "recording_artists"],
-        },
-        """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""",
-    ),
-]
-
-
 class LMFormatEnforcerJsonSchema:
-    params = [models, json_case]
+    params = [models, json_cases]
     param_names = ["model", "json"]
     timeout = 600
 
@@ -136,7 +54,7 @@ def setup(self, model, _):
         self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
 
     def time_lfe(self, _, json):
-        json_string, json_example = json
+        json_string, json_example = json["schema"], json["example"]
         json_example_tokens = self.tokenizer.encode(json_example)
 
         parser = JsonSchemaParser(json_string)
diff --git a/src/benchmark_outlines.py b/src/benchmark_outlines.py
index 6cf06cb..67de236 100644
--- a/src/benchmark_outlines.py
+++ b/src/benchmark_outlines.py
@@ -7,33 +7,11 @@
 from outlines.models.transformers import TransformerTokenizer
 from transformers import AutoTokenizer
 
-models = [
-    "NousResearch/Nous-Hermes-llama-2-7b",  # 32,000 tokens vocabulary
-    "gpt2",  # 50,257 tokens vocabulary
-    "NousResearch/Hermes-3-Llama-3.1-8B",  # 128,256 tokens vocabulary
-    "unsloth/gemma-2-2b-it-bnb-4bit",  # 256,128 tokens vocabulary
-]
-
-regex_case = [
-    (r"\d{3}-\d{2}-\d{4}", "203-22-1234"),
-    (
-        r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
-        "https://github.com/outlines-dev/outlines",
-    ),
-    (
-        r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.",
-        "A: Some thoughts before answering. The answer is 42.",
-    ),
-    (
-        "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)",
-        "AVeryLongStringtoTest1234",
-    ),
-    (r"\+[1-9]\d{1,14}", "1234567891234"),
-]
+from .data import json_cases, models, regex_cases
 
 
 class OutlinesRegex:
-    params = [models, regex_case]
+    params = [models, regex_cases]
     param_names = ["model", "regex"]
     timeout = 1200
 
@@ -59,7 +37,7 @@ def time_outlines(self, _, regex):
         """
         caching.clear_cache()
 
-        regex_string, regex_example = regex
+        regex_string, regex_example = regex["regex"], regex["example"]
         regex_example_tokens = self.tokenizer.encode(regex_example)[0][0]
         guide = RegexGuide(regex_string, self.tokenizer)
 
@@ -69,68 +47,8 @@ def time_outlines(self, _, regex):
             state = guide.get_next_state(state, token)
 
 
-json_case = [
-    (
-        {
-            "$defs": {
-                "Armor": {
-                    "enum": ["leather", "chainmail", "plate"],
-                    "title": "Armor",
-                    "type": "string",
-                }
-            },
-            "properties": {
-                "name": {"maxLength": 10, "title": "Name", "type": "string"},
-                "age": {"title": "Age", "type": "integer"},
-                "armor": {"$ref": "#/$defs/Armor"},
-                "strength": {"title": "Strength", "type": "integer"},
-            },
-            "required": ["name", "age", "armor", "strength"],
-            "title": "Character",
-            "type": "object",
-        },
-        """{'name': 'Super Warrior', 'age': 26,  'armor': 'leather', 'armor': 10}""",
-    ),
-    (
-        {
-            "$schema": "http://json-schema.org/draft-04/schema#",
-            "title": "Schema for a recording",
-            "type": "object",
-            "definitions": {
-                "artist": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "functions": {"type": "array", "items": {"type": "string"}},
-                    },
-                    "required": ["id", "name", "functions"],
-                }
-            },
-            "properties": {
-                "id": {"type": "number"},
-                "work": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "composer": {"$ref": "#/definitions/artist"},
-                    },
-                },
-                "recording_artists": {
-                    "type": "array",
-                    "items": {"$ref": "#/definitions/artist"},
-                },
-            },
-            "required": ["id", "work", "recording_artists"],
-        },
-        """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""",
-    ),
-]
-
-
 class OutlinesJsonSchema:
-    params = [models, json_case]
+    params = [models, json_cases]
     param_names = ["model", "json"]
     timeout = 1200
 
@@ -154,7 +72,7 @@ def time_outlines(self, _, json_case):
         regular expression, and walking this index while generating tokens.
 
         """
-        json_string, json_example = json_case
+        json_string, json_example = json_case["schema"], json_case["example"]
         json_example_tokens = self.tokenizer.encode(json_example)[0][0]
 
         regex_string = build_regex_from_schema(json.dumps(json_string))
diff --git a/src/benchmark_outlines_core.py b/src/benchmark_outlines_core.py
index fd50657..02db4d0 100644
--- a/src/benchmark_outlines_core.py
+++ b/src/benchmark_outlines_core.py
@@ -5,33 +5,11 @@
 from outlines_core.models.transformers import TransformerTokenizer
 from transformers import AutoTokenizer
 
-models = [
-    "NousResearch/Nous-Hermes-llama-2-7b",  # 32,000 tokens vocabulary
-    "gpt2",  # 50,257 tokens vocabulary
-    "NousResearch/Hermes-3-Llama-3.1-8B",  # 128,256 tokens vocabulary
-    "unsloth/gemma-2-2b-it-bnb-4bit",  # 256,128 tokens vocabulary
-]
-
-regex_case = [
-    (r"\d{3}-\d{2}-\d{4}", "203-22-1234"),
-    (
-        r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
-        "https://github.com/outlines-dev/outlines",
-    ),
-    (
-        r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.",
-        "A: Some thoughts before answering. The answer is 42.",
-    ),
-    (
-        "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)",
-        "AVeryLongStringtoTest1234",
-    ),
-    (r"\+[1-9]\d{1,14}", "1234567891234"),
-]
+from .data import json_cases, models, regex_cases
 
 
 class OutlinesCoreRegex:
-    params = [models, regex_case]
+    params = [models, regex_cases]
     param_names = ["model", "regex"]
     timeout = 600
 
@@ -54,7 +32,7 @@ def time_outlines_core(self, _, regex):
         regular expression, and walking this index while generating tokens.
 
         """
-        regex_string, regex_example = regex
+        regex_string, regex_example = regex["regex"], regex["example"]
         regex_example_tokens = self.tokenizer.encode(regex_example)[0][0]
         guide = RegexGuide(regex_string, self.tokenizer)
 
@@ -64,68 +42,8 @@ def time_outlines_core(self, _, regex):
             state = guide.get_next_state(state, token)
 
 
-json_case = [
-    (
-        {
-            "$defs": {
-                "Armor": {
-                    "enum": ["leather", "chainmail", "plate"],
-                    "title": "Armor",
-                    "type": "string",
-                }
-            },
-            "properties": {
-                "name": {"maxLength": 10, "title": "Name", "type": "string"},
-                "age": {"title": "Age", "type": "integer"},
-                "armor": {"$ref": "#/$defs/Armor"},
-                "strength": {"title": "Strength", "type": "integer"},
-            },
-            "required": ["name", "age", "armor", "strength"],
-            "title": "Character",
-            "type": "object",
-        },
-        """{'name': 'Super Warrior', 'age': 26,  'armor': 'leather', 'armor': 10}""",
-    ),
-    (
-        {
-            "$schema": "http://json-schema.org/draft-04/schema#",
-            "title": "Schema for a recording",
-            "type": "object",
-            "definitions": {
-                "artist": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "functions": {"type": "array", "items": {"type": "string"}},
-                    },
-                    "required": ["id", "name", "functions"],
-                }
-            },
-            "properties": {
-                "id": {"type": "number"},
-                "work": {
-                    "type": "object",
-                    "properties": {
-                        "id": {"type": "number"},
-                        "name": {"type": "string"},
-                        "composer": {"$ref": "#/definitions/artist"},
-                    },
-                },
-                "recording_artists": {
-                    "type": "array",
-                    "items": {"$ref": "#/definitions/artist"},
-                },
-            },
-            "required": ["id", "work", "recording_artists"],
-        },
-        """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""",
-    ),
-]
-
-
 class OutlinesCoreJsonSchema:
-    params = [models, json_case]
+    params = [models, json_cases]
     param_names = ["model", "json"]
     timeout = 600
 
@@ -148,7 +66,7 @@ def time_outlines_core(self, _, json_case):
         regular expression, and walking this index while generating tokens.
 
         """
-        json_string, json_example = json_case
+        json_string, json_example = json_case["schema"], json_case["example"]
         json_example_tokens = self.tokenizer.encode(json_example)[0][0]
 
         regex_string = build_regex_from_schema(json.dumps(json_string))

From 3edfc7768fd0d3931946f7bad09703ae3249744d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@thetypicalset.com>
Date: Thu, 10 Oct 2024 15:35:13 +0200
Subject: [PATCH 2/2] Add data file

---
 src/data.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 src/data.py

diff --git a/src/data.py b/src/data.py
new file mode 100644
index 0000000..e4a76be
--- /dev/null
+++ b/src/data.py
@@ -0,0 +1,96 @@
+models = [
+    "NousResearch/Nous-Hermes-llama-2-7b",  # 32,000 tokens vocabulary
+    "gpt2",  # 50,257 tokens vocabulary
+    "NousResearch/Hermes-3-Llama-3.1-8B",  # 128,256 tokens vocabulary
+    "unsloth/gemma-2-2b-it-bnb-4bit",  # 256,128 tokens vocabulary
+]
+
+regex_cases = [
+    {
+        "name": "Phone Number",
+        "regex": r'\d{3}-\d{2}-\d{4}',
+        "example": '203-22-1234'
+    },
+    {
+        "name": "URL",
+        "regex": r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?',
+        "example": 'https://github.com/outlines-dev/outlines'
+    },
+    {
+        "name": "GSM8K",
+        "regex": r'A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.',
+        "example": 'A: Some thoughts before answering. The answer is 42.'
+    },
+    {
+        "name": "Complex string",
+        "regex": r'(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)',
+        "example": 'AVeryLongStringtoTest1234'
+    },
+    {
+        "name": "Long integer",
+        "regex": r'\+[1-9]\d{1,14}',
+        "example": '1234567891234'
+    }
+]
+
+json_cases = [
+    {
+        "name": "RPG character",
+        "schema":
+        {
+            "$defs": {
+                "Armor": {
+                    "enum": ["leather", "chainmail", "plate"],
+                    "title": "Armor",
+                    "type": "string",
+                }
+            },
+            "properties": {
+                "name": {"maxLength": 10, "title": "Name", "type": "string"},
+                "age": {"title": "Age", "type": "integer"},
+                "armor": {"$ref": "#/$defs/Armor"},
+                "strength": {"title": "Strength", "type": "integer"},
+            },
+            "required": ["name", "age", "armor", "strength"],
+            "title": "Character",
+            "type": "object",
+        },
+        "example": """{'name': 'Super Warrior', 'age': 26,  'armor': 'leather', 'armor': 10}""",
+    },
+    {
+        "name": "Simple nested schema",
+        "schema": {
+            "$schema": "http://json-schema.org/draft-04/schema#",
+            "title": "Schema for a recording",
+            "type": "object",
+            "definitions": {
+                "artist": {
+                    "type": "object",
+                    "properties": {
+                        "id": {"type": "number"},
+                        "name": {"type": "string"},
+                        "functions": {"type": "array", "items": {"type": "string"}},
+                    },
+                    "required": ["id", "name", "functions"],
+                }
+            },
+            "properties": {
+                "id": {"type": "number"},
+                "work": {
+                    "type": "object",
+                    "properties": {
+                        "id": {"type": "number"},
+                        "name": {"type": "string"},
+                        "composer": {"$ref": "#/definitions/artist"},
+                    },
+                },
+                "recording_artists": {
+                    "type": "array",
+                    "items": {"$ref": "#/definitions/artist"},
+                },
+            },
+            "required": ["id", "work", "recording_artists"],
+        },
+        "example": """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""",
+    },
+]