From 256090546a9a9c8ea8a973dc32690e1f8d28b650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 10 Oct 2024 12:06:52 +0200 Subject: [PATCH 1/2] Gather data in a single file We currently copy-paste the model names, regex and JSON cases between benchmarks, which is error-prone. In this commit I gather all of those in a single file and import them from this file instead. --- src/benchmark_lfe.py | 92 ++-------------------------------- src/benchmark_outlines.py | 92 ++-------------------------------- src/benchmark_outlines_core.py | 92 ++-------------------------------- 3 files changed, 15 insertions(+), 261 deletions(-) diff --git a/src/benchmark_lfe.py b/src/benchmark_lfe.py index 752479b..e9b18af 100644 --- a/src/benchmark_lfe.py +++ b/src/benchmark_lfe.py @@ -5,33 +5,11 @@ ) from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering. 
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class LMFormatEnforcerRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 600 @@ -48,7 +26,7 @@ def setup(self, model, _): self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer) def time_lfe(self, _, regex): - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example) parser = RegexParser(regex_string) @@ -58,68 +36,8 @@ def time_lfe(self, _, regex): _ = token_enforcer.get_allowed_tokens(regex_example_tokens[: i + 1]) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": 
"array", - "items": {"$ref": "#/definitions/artist"}, - }, - }, - "required": ["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class LMFormatEnforcerJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 600 @@ -136,7 +54,7 @@ def setup(self, model, _): self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer) def time_lfe(self, _, json): - json_string, json_example = json + json_string, json_example = json["schema"], json["example"] json_example_tokens = self.tokenizer.encode(json_example) parser = JsonSchemaParser(json_string) diff --git a/src/benchmark_outlines.py b/src/benchmark_outlines.py index 6cf06cb..67de236 100644 --- a/src/benchmark_outlines.py +++ b/src/benchmark_outlines.py @@ -7,33 +7,11 @@ from outlines.models.transformers import TransformerTokenizer from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering. 
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class OutlinesRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 1200 @@ -59,7 +37,7 @@ def time_outlines(self, _, regex): """ caching.clear_cache() - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example)[0][0] guide = RegexGuide(regex_string, self.tokenizer) @@ -69,68 +47,8 @@ def time_outlines(self, _, regex): state = guide.get_next_state(state, token) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": "array", - "items": {"$ref": "#/definitions/artist"}, - }, - }, - "required": 
["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class OutlinesJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 1200 @@ -154,7 +72,7 @@ def time_outlines(self, _, json_case): regular expression, and walking this index while generating tokens. """ - json_string, json_example = json_case + json_string, json_example = json_case["schema"], json_case["example"] json_example_tokens = self.tokenizer.encode(json_example)[0][0] regex_string = build_regex_from_schema(json.dumps(json_string)) diff --git a/src/benchmark_outlines_core.py b/src/benchmark_outlines_core.py index fd50657..02db4d0 100644 --- a/src/benchmark_outlines_core.py +++ b/src/benchmark_outlines_core.py @@ -5,33 +5,11 @@ from outlines_core.models.transformers import TransformerTokenizer from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering. 
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class OutlinesCoreRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 600 @@ -54,7 +32,7 @@ def time_outlines_core(self, _, regex): regular expression, and walking this index while generating tokens. """ - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example)[0][0] guide = RegexGuide(regex_string, self.tokenizer) @@ -64,68 +42,8 @@ def time_outlines_core(self, _, regex): state = guide.get_next_state(state, token) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": "array", - "items": 
{"$ref": "#/definitions/artist"}, - }, - }, - "required": ["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class OutlinesCoreJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 600 @@ -148,7 +66,7 @@ def time_outlines_core(self, _, json_case): regular expression, and walking this index while generating tokens. """ - json_string, json_example = json_case + json_string, json_example = json_case["schema"], json_case["example"] json_example_tokens = self.tokenizer.encode(json_example)[0][0] regex_string = build_regex_from_schema(json.dumps(json_string)) From 3edfc7768fd0d3931946f7bad09703ae3249744d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 10 Oct 2024 15:35:13 +0200 Subject: [PATCH 2/2] Add data file --- src/data.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/data.py diff --git a/src/data.py b/src/data.py new file mode 100644 index 0000000..e4a76be --- /dev/null +++ b/src/data.py @@ -0,0 +1,96 @@ +models = [ + "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary + "gpt2", # 50,257 tokens vocabulary + "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary + "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary +] + +regex_cases = [ + { + "name": "Phone Number", + "regex": r'\d{3}-\d{2}-\d{4}', + "example": '203-22-1234' + }, + { + "name": "URL", + "regex": r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?', + "example": 'https://github.com/outlines-dev/outlines' + }, + { + "name": "GSM8K", + "regex": r'A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.', + "example": 'A: Some thoughts before answering. The answer is 42.' 
+ }, + { + "name": "Complex string", + "regex": r'(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)', + "example": 'AVeryLongStringtoTest1234' + }, + { + "name": "Long integer", + "regex": r'\+[1-9]\d{1,14}', + "example": '1234567891234' + } +] + +json_cases = [ + { + "name": "RPG character", + "schema": + { + "$defs": { + "Armor": { + "enum": ["leather", "chainmail", "plate"], + "title": "Armor", + "type": "string", + } + }, + "properties": { + "name": {"maxLength": 10, "title": "Name", "type": "string"}, + "age": {"title": "Age", "type": "integer"}, + "armor": {"$ref": "#/$defs/Armor"}, + "strength": {"title": "Strength", "type": "integer"}, + }, + "required": ["name", "age", "armor", "strength"], + "title": "Character", + "type": "object", + }, + "example": """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", + }, + { + "name": "Simple nested schema", + "schema": { + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "Schema for a recording", + "type": "object", + "definitions": { + "artist": { + "type": "object", + "properties": { + "id": {"type": "number"}, + "name": {"type": "string"}, + "functions": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["id", "name", "functions"], + } + }, + "properties": { + "id": {"type": "number"}, + "work": { + "type": "object", + "properties": { + "id": {"type": "number"}, + "name": {"type": "string"}, + "composer": {"$ref": "#/definitions/artist"}, + }, + }, + "recording_artists": { + "type": "array", + "items": {"$ref": "#/definitions/artist"}, + }, + }, + "required": ["id", "work", "recording_artists"], + }, + "example": """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", + }, +]