From 256090546a9a9c8ea8a973dc32690e1f8d28b650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 10 Oct 2024 12:06:52 +0200 Subject: [PATCH] Gather data in a single file We currently copy-paste the model names, regex and json cases between benchmarks, which is error-prone. In this commit I gather all of those in a single file and import them from this file instead. --- src/benchmark_lfe.py | 92 ++-------------------------------- src/benchmark_outlines.py | 92 ++-------------------------------- src/benchmark_outlines_core.py | 92 ++-------------------------------- 3 files changed, 15 insertions(+), 261 deletions(-) diff --git a/src/benchmark_lfe.py b/src/benchmark_lfe.py index 752479b..e9b18af 100644 --- a/src/benchmark_lfe.py +++ b/src/benchmark_lfe.py @@ -5,33 +5,11 @@ ) from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering.
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class LMFormatEnforcerRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 600 @@ -48,7 +26,7 @@ def setup(self, model, _): self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer) def time_lfe(self, _, regex): - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example) parser = RegexParser(regex_string) @@ -58,68 +36,8 @@ def time_lfe(self, _, regex): _ = token_enforcer.get_allowed_tokens(regex_example_tokens[: i + 1]) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": 
"array", - "items": {"$ref": "#/definitions/artist"}, - }, - }, - "required": ["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class LMFormatEnforcerJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 600 @@ -136,7 +54,7 @@ def setup(self, model, _): self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer) def time_lfe(self, _, json): - json_string, json_example = json + json_string, json_example = json["schema"], json["example"] json_example_tokens = self.tokenizer.encode(json_example) parser = JsonSchemaParser(json_string) diff --git a/src/benchmark_outlines.py b/src/benchmark_outlines.py index 6cf06cb..67de236 100644 --- a/src/benchmark_outlines.py +++ b/src/benchmark_outlines.py @@ -7,33 +7,11 @@ from outlines.models.transformers import TransformerTokenizer from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering. 
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class OutlinesRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 1200 @@ -59,7 +37,7 @@ def time_outlines(self, _, regex): """ caching.clear_cache() - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example)[0][0] guide = RegexGuide(regex_string, self.tokenizer) @@ -69,68 +47,8 @@ def time_outlines(self, _, regex): state = guide.get_next_state(state, token) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": "array", - "items": {"$ref": "#/definitions/artist"}, - }, - }, - "required": 
["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class OutlinesJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 1200 @@ -154,7 +72,7 @@ def time_outlines(self, _, json_case): regular expression, and walking this index while generating tokens. """ - json_string, json_example = json_case + json_string, json_example = json_case["schema"], json_case["example"] json_example_tokens = self.tokenizer.encode(json_example)[0][0] regex_string = build_regex_from_schema(json.dumps(json_string)) diff --git a/src/benchmark_outlines_core.py b/src/benchmark_outlines_core.py index fd50657..02db4d0 100644 --- a/src/benchmark_outlines_core.py +++ b/src/benchmark_outlines_core.py @@ -5,33 +5,11 @@ from outlines_core.models.transformers import TransformerTokenizer from transformers import AutoTokenizer -models = [ - "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary - "gpt2", # 50,257 tokens vocabulary - "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary - "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary -] - -regex_case = [ - (r"\d{3}-\d{2}-\d{4}", "203-22-1234"), - ( - r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?", - "https://github.com/outlines-dev/outlines", - ), - ( - r"A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.", - "A: Some thoughts before answering. 
The answer is 42.", - ), - ( - "(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)", - "AVeryLongStringtoTest1234", - ), - (r"\+[1-9]\d{1,14}", "1234567891234"), -] +from .data import json_cases, models, regex_cases class OutlinesCoreRegex: - params = [models, regex_case] + params = [models, regex_cases] param_names = ["model", "regex"] timeout = 600 @@ -54,7 +32,7 @@ def time_outlines_core(self, _, regex): regular expression, and walking this index while generating tokens. """ - regex_string, regex_example = regex + regex_string, regex_example = regex["regex"], regex["example"] regex_example_tokens = self.tokenizer.encode(regex_example)[0][0] guide = RegexGuide(regex_string, self.tokenizer) @@ -64,68 +42,8 @@ def time_outlines_core(self, _, regex): state = guide.get_next_state(state, token) -json_case = [ - ( - { - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string", - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}, - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object", - }, - """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""", - ), - ( - { - "$schema": "http://json-schema.org/draft-04/schema#", - "title": "Schema for a recording", - "type": "object", - "definitions": { - "artist": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "functions": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "name", "functions"], - } - }, - "properties": { - "id": {"type": "number"}, - "work": { - "type": "object", - "properties": { - "id": {"type": "number"}, - "name": {"type": "string"}, - "composer": {"$ref": "#/definitions/artist"}, - }, - }, - "recording_artists": { - "type": "array", - "items": 
{"$ref": "#/definitions/artist"}, - }, - }, - "required": ["id", "work", "recording_artists"], - }, - """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""", - ), -] - - class OutlinesCoreJsonSchema: - params = [models, json_case] + params = [models, json_cases] param_names = ["model", "json"] timeout = 600 @@ -148,7 +66,7 @@ def time_outlines_core(self, _, json_case): regular expression, and walking this index while generating tokens. """ - json_string, json_example = json_case + json_string, json_example = json_case["schema"], json_case["example"] json_example_tokens = self.tokenizer.encode(json_example)[0][0] regex_string = build_regex_from_schema(json.dumps(json_string))