From 9aca8c709025a34ceec9937e8d08f1b98466409b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Mon, 2 Dec 2024 15:50:52 +0100 Subject: [PATCH] Use `outlines-core` to translate JSON Schemas into regexes --- benchmarks/bench_json_schema.py | 7 +- outlines/fsm/json_schema.py | 489 +----------------- outlines/generate/choice.py | 4 +- outlines/generate/json.py | 7 +- outlines/processors/structured.py | 3 +- tests/fsm/test_json_schema.py | 88 ++-- .../generate/test_integration_transformers.py | 1 + 7 files changed, 56 insertions(+), 543 deletions(-) diff --git a/benchmarks/bench_json_schema.py b/benchmarks/bench_json_schema.py index 62d9b3c1d..3a1f72cb6 100644 --- a/benchmarks/bench_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,6 +1,7 @@ +from outlines_core.fsm.json_schema import build_regex_from_schema + from outlines.caching import cache_disabled from outlines.fsm.guide import RegexGuide -from outlines.fsm.json_schema import build_regex_from_schema from .common import setup_tokenizer # noqa: E402 @@ -70,10 +71,6 @@ def setup(self, schema_name): self.tokenizer = setup_tokenizer() self.schema = schemas[schema_name] - @cache_disabled() - def time_json_schema_to_regex(self, schema_name): - build_regex_from_schema(self.schema) - @cache_disabled() def time_json_schema_to_fsm(self, schema_name): regex = build_regex_from_schema(self.schema) diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 0bab57923..bae0ad17a 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -1,90 +1,10 @@ import inspect import json -import re import warnings from enum import Enum -from typing import Callable, Optional, Tuple, Type, Union +from typing import Callable, Type, Union -from jsonschema.protocols import Validator from pydantic import BaseModel, create_model -from referencing import Registry, Resource -from referencing._core import Resolver -from referencing.jsonschema import DRAFT202012 - -# allow `\"`, `\\`, or any character which isn't a control sequence -STRING_INNER = r'([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])' -STRING = f'"{STRING_INNER}*"' - -INTEGER = r"(-)?(0|[1-9][0-9]*)" -NUMBER = rf"({INTEGER})(\.[0-9]+)?([eE][+-][0-9]+)?" -BOOLEAN = r"(true|false)" -NULL = r"null" -WHITESPACE = r"[ ]?" - -type_to_regex = { - "string": STRING, - "integer": INTEGER, - "number": NUMBER, - "boolean": BOOLEAN, - "null": NULL, -} - -DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"' -DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' -TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' -UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' - -format_to_regex = { - "uuid": UUID, - "date-time": DATE_TIME, - "date": DATE, - "time": TIME, -} - - -def build_regex_from_schema(schema: str, whitespace_pattern: Optional[str] = None): - """Turn a JSON schema into a regex that matches any JSON object that follows - this schema. - - JSON Schema is a declarative language that allows to annotate JSON documents - with types and descriptions. These schemas can be generated from any Python - datastructure that has type annotation: namedtuples, dataclasses, Pydantic - models. And by ensuring that the generation respects the schema we ensure - that the output can be parsed into these objects. - This function parses the provided schema and builds a generation schedule which - mixes deterministic generation (fixed strings), and sampling with constraints. - - Parameters - ---------- - schema - A string that represents a JSON Schema. - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` - - Returns - ------- - A generation schedule. A list of strings that represent the JSON - schema's structure and regular expression that define the structure of - the fields. - - References - ---------- - .. [0] JSON Schema. https://json-schema.org/ - - """ - - schema = json.loads(schema) - Validator.check_schema(schema) - - # Build reference resolver - schema = Resource(contents=schema, specification=DRAFT202012) - uri = schema.id() if schema.id() is not None else "" - registry = Registry().with_resource(uri=uri, resource=schema) - resolver = registry.resolver() - - content = schema.contents - return to_regex(resolver, content, whitespace_pattern) def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str: @@ -120,413 +40,6 @@ def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) - return schema_str -def _get_num_items_pattern(min_items, max_items, whitespace_pattern): - # Helper function for arrays and objects - min_items = int(min_items or 0) - if max_items is None: - return rf"{{{max(min_items - 1, 0)},}}" - else: - max_items = int(max_items) - if max_items < 1: - return None - return rf"{{{max(min_items - 1, 0)},{max_items - 1}}}" - - -def validate_quantifiers( - min_bound: Optional[str], max_bound: Optional[str], start_offset: int = 0 -) -> Tuple[str, str]: - """ - Ensures that the bounds of a number are valid. Bounds are used as quantifiers in the regex. - - Parameters - ---------- - min_bound - The minimum value that the number can take. - max_bound - The maximum value that the number can take. - start_offset - Number of elements that are already present in the regex but still need to be counted. - ex: if the regex is already "(-)?(0|[1-9][0-9])", we will always have at least 1 digit, so the start_offset is 1. - - Returns - ------- - min_bound - The minimum value that the number can take. - max_bound - The maximum value that the number can take. - - Raises - ------ - ValueError - If the minimum bound is greater than the maximum bound. - - TypeError or ValueError - If the minimum bound is not an integer or None. - or - If the maximum bound is not an integer or None. - """ - min_bound = "" if min_bound is None else str(int(min_bound) - start_offset) - max_bound = "" if max_bound is None else str(int(max_bound) - start_offset) - if min_bound and max_bound: - if int(max_bound) < int(min_bound): - raise ValueError("max bound must be greater than or equal to min bound") - return min_bound, max_bound - - -def to_regex( - resolver: Resolver, instance: dict, whitespace_pattern: Optional[str] = None -): - """Translate a JSON Schema instance into a regex that validates the schema. - - Note - ---- - Many features of JSON schema are missing: - - Handle `additionalProperties` keyword - - Handle types defined as a list - - Handle constraints on numbers - - Handle special patterns: `date`, `uri`, etc. - - This does not support recursive definitions. - - Parameters - ---------- - resolver - An object that resolves references to other instances within a schema - instance - The instance to translate - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` - """ - - # set whitespace pattern - if whitespace_pattern is None: - whitespace_pattern = WHITESPACE - - if instance == {}: - # JSON Schema Spec: Empty object means unconstrained, any json type is legal - types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - {"type": "array"}, - {"type": "object"}, - ] - regexes = [to_regex(resolver, t, whitespace_pattern) for t in types] - regexes = [rf"({r})" for r in regexes] - return rf"{'|'.join(regexes)}" - - elif "properties" in instance: - regex = "" - regex += r"\{" - properties = instance["properties"] - required_properties = instance.get("required", []) - is_required = [item in required_properties for item in properties] - # If at least one property is required, we include the one in the lastest position - # without any comma. - # For each property before it (optional or required), we add with a comma after the property. - # For each property after it (optional), we add with a comma before the property. - if any(is_required): - last_required_pos = max([i for i, value in enumerate(is_required) if value]) - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{re.escape(name)}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - if i < last_required_pos: - subregex = f"{subregex}{whitespace_pattern}," - elif i > last_required_pos: - subregex = f"{whitespace_pattern},{subregex}" - regex += subregex if is_required[i] else f"({subregex})?" - # If no property is required, we have to create a possible pattern for each property in which - # it's the last one necessarilly present. Then, we add the others as optional before and after - # following the same strategy as described above. - # The whole block is made optional to allow the case in which no property is returned. - else: - property_subregexes = [] - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{name}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - property_subregexes.append(subregex) - possible_patterns = [] - for i in range(len(property_subregexes)): - pattern = "" - for subregex in property_subregexes[:i]: - pattern += f"({subregex}{whitespace_pattern},)?" - pattern += property_subregexes[i] - for subregex in property_subregexes[i + 1 :]: - pattern += f"({whitespace_pattern},{subregex})?" - possible_patterns.append(pattern) - regex += f"({'|'.join(possible_patterns)})?" - - regex += f"{whitespace_pattern}" + r"\}" - - return regex - - # To validate against allOf, the given data must be valid against all of the - # given subschemas. - elif "allOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["allOf"] - ] - subregexes_str = [f"{subregex}" for subregex in subregexes] - return rf"({''.join(subregexes_str)})" - - # To validate against `anyOf`, the given data must be valid against - # any (one or more) of the given subschemas. - elif "anyOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["anyOf"] - ] - return rf"({'|'.join(subregexes)})" - - # To validate against oneOf, the given data must be valid against exactly - # one of the given subschemas. - elif "oneOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["oneOf"] - ] - - xor_patterns = [f"(?:{subregex})" for subregex in subregexes] - - return rf"({'|'.join(xor_patterns)})" - - # Create pattern for Tuples, per JSON Schema spec, `prefixItems` determines types at each idx - elif "prefixItems" in instance: - element_patterns = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["prefixItems"] - ] - comma_split_pattern = rf"{whitespace_pattern},{whitespace_pattern}" - tuple_inner = comma_split_pattern.join(element_patterns) - return rf"\[{whitespace_pattern}{tuple_inner}{whitespace_pattern}\]" - - # The enum keyword is used to restrict a value to a fixed set of values. It - # must be an array with at least one element, where each element is unique. - elif "enum" in instance: - choices = [] - for choice in instance["enum"]: - if type(choice) in [int, float, bool, type(None), str]: - choices.append(re.escape(json.dumps(choice))) - elif isinstance(choice, dict): - choices.append(to_regex(resolver, choice, whitespace_pattern)) - else: - raise TypeError(f"Unsupported data type in enum: {type(choice)}") - return f"({'|'.join(choices)})" - - elif "const" in instance: - const = instance["const"] - if type(const) in [int, float, bool, type(None), str]: - const = re.escape(json.dumps(const)) - else: - raise TypeError(f"Unsupported data type in const: {type(const)}") - return const - - elif "$ref" in instance: - path = f"{instance['$ref']}" - instance = resolver.lookup(path).contents - return to_regex(resolver, instance, whitespace_pattern) - - # The type keyword may either be a string or an array: - # - If it's a string, it is the name of one of the basic types. - # - If it is an array, it must be an array of strings, where each string is - # the name of one of the basic types, and each element is unique. In this - # case, the JSON snippet is valid if it matches any of the given types. - elif "type" in instance: - instance_type = instance["type"] - if instance_type == "string": - if "maxLength" in instance or "minLength" in instance: - max_items = instance.get("maxLength", "") - min_items = instance.get("minLength", "") - try: - if int(max_items) < int(min_items): - raise ValueError( - "maxLength must be greater than or equal to minLength" - ) # FIXME this raises an error but is caught right away by the except (meant for int("") I assume) - except ValueError: - pass - return f'"{STRING_INNER}{{{min_items},{max_items}}}"' - elif "pattern" in instance: - pattern = instance["pattern"] - if pattern[0] == "^" and pattern[-1] == "$": - return rf'("{pattern[1:-1]}")' - else: - return rf'("{pattern}")' - elif "format" in instance: - format = instance["format"] - if format == "date-time": - return format_to_regex["date-time"] - elif format == "uuid": - return format_to_regex["uuid"] - elif format == "date": - return format_to_regex["date"] - elif format == "time": - return format_to_regex["time"] - else: - raise NotImplementedError( - f"Format {format} is not supported by Outlines" - ) - else: - return type_to_regex["string"] - - elif instance_type == "number": - bounds = { - "minDigitsInteger", - "maxDigitsInteger", - "minDigitsFraction", - "maxDigitsFraction", - "minDigitsExponent", - "maxDigitsExponent", - } - if bounds.intersection(set(instance.keys())): - min_digits_integer, max_digits_integer = validate_quantifiers( - instance.get("minDigitsInteger"), - instance.get("maxDigitsInteger"), - start_offset=1, - ) - min_digits_fraction, max_digits_fraction = validate_quantifiers( - instance.get("minDigitsFraction"), instance.get("maxDigitsFraction") - ) - min_digits_exponent, max_digits_exponent = validate_quantifiers( - instance.get("minDigitsExponent"), instance.get("maxDigitsExponent") - ) - integers_quantifier = ( - f"{{{min_digits_integer},{max_digits_integer}}}" - if min_digits_integer or max_digits_integer - else "*" - ) - fraction_quantifier = ( - f"{{{min_digits_fraction},{max_digits_fraction}}}" - if min_digits_fraction or max_digits_fraction - else "+" - ) - exponent_quantifier = ( - f"{{{min_digits_exponent},{max_digits_exponent}}}" - if min_digits_exponent or max_digits_exponent - else "+" - ) - return rf"((-)?(0|[1-9][0-9]{integers_quantifier}))(\.[0-9]{fraction_quantifier})?([eE][+-][0-9]{exponent_quantifier})?" - return type_to_regex["number"] - - elif instance_type == "integer": - if "minDigits" in instance or "maxDigits" in instance: - min_digits, max_digits = validate_quantifiers( - instance.get("minDigits"), instance.get("maxDigits"), start_offset=1 - ) - return rf"(-)?(0|[1-9][0-9]{{{min_digits},{max_digits}}})" - return type_to_regex["integer"] - - elif instance_type == "array": - num_repeats = _get_num_items_pattern( - instance.get("minItems"), instance.get("maxItems"), whitespace_pattern - ) - if num_repeats is None: - return rf"\[{whitespace_pattern}\]" - - allow_empty = "?" if int(instance.get("minItems", 0)) == 0 else "" - - if "items" in instance: - items_regex = to_regex(resolver, instance["items"], whitespace_pattern) - return rf"\[{whitespace_pattern}(({items_regex})(,{whitespace_pattern}({items_regex})){num_repeats}){allow_empty}{whitespace_pattern}\]" - else: - # Here we need to make the choice to exclude generating list of objects - # if the specification of the object is not given, even though a JSON - # object that contains an object here would be valid under the specification. - legal_types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - ] - depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - - regexes = [ - to_regex(resolver, t, whitespace_pattern) for t in legal_types - ] - return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]" - - elif instance_type == "object": - # pattern for json object with values defined by instance["additionalProperties"] - # enforces value type constraints recursively, "minProperties", and "maxProperties" - # doesn't enforce "required", "dependencies", "propertyNames" "any/all/on Of" - num_repeats = _get_num_items_pattern( - instance.get("minProperties"), - instance.get("maxProperties"), - whitespace_pattern, - ) - if num_repeats is None: - return rf"\{{{whitespace_pattern}\}}" - - allow_empty = "?" if int(instance.get("minProperties", 0)) == 0 else "" - - additional_properties = instance.get("additionalProperties") - - if additional_properties is None or additional_properties is True: - # JSON Schema behavior: If the additionalProperties of an object is - # unset or True, it is unconstrained object. - # We handle this by setting additionalProperties to anyOf: {all types} - - legal_types = [ - {"type": "string"}, - {"type": "number"}, - {"type": "boolean"}, - {"type": "null"}, - ] - - # We set the object depth to 2 to keep the expression finite, but the "depth" - # key is not a true component of the JSON Schema specification. - depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - additional_properties = {"anyOf": legal_types} - - value_pattern = to_regex( - resolver, additional_properties, whitespace_pattern - ) - key_value_pattern = ( - f"{STRING}{whitespace_pattern}:{whitespace_pattern}{value_pattern}" - ) - key_value_successor_pattern = ( - f"{whitespace_pattern},{whitespace_pattern}{key_value_pattern}" - ) - multiple_key_value_pattern = f"({key_value_pattern}({key_value_successor_pattern}){num_repeats}){allow_empty}" - - return ( - r"\{" - + whitespace_pattern - + multiple_key_value_pattern - + whitespace_pattern - + r"\}" - ) - - elif instance_type == "boolean": - return type_to_regex["boolean"] - - elif instance_type == "null": - return type_to_regex["null"] - - elif isinstance(instance_type, list): - # Here we need to make the choice to exclude generating an object - # if the specification of the object is not give, even though a JSON - # object that contains an object here would be valid under the specification. - regexes = [ - to_regex(resolver, {"type": t}, whitespace_pattern) - for t in instance_type - if t != "object" - ] - return rf"({'|'.join(regexes)})" - - raise NotImplementedError( - f"""Could not translate the instance {instance} to a - regular expression. Make sure it is valid to the JSON Schema specification. If - it is, please open an issue on the Outlines repository""" - ) - - def get_schema_from_signature(fn: Callable) -> dict: """Turn a function signature into a JSON schema. diff --git a/outlines/generate/choice.py b/outlines/generate/choice.py index 75fc71271..afb998f52 100644 --- a/outlines/generate/choice.py +++ b/outlines/generate/choice.py @@ -4,7 +4,9 @@ from functools import singledispatch from typing import Callable, List, Union -from outlines.fsm.json_schema import build_regex_from_schema, get_schema_from_enum +from outlines_core.fsm.json_schema import build_regex_from_schema + +from outlines.fsm.json_schema import get_schema_from_enum from outlines.generate.api import SequenceGeneratorAdapter from outlines.models import OpenAI from outlines.samplers import Sampler, multinomial diff --git a/outlines/generate/json.py b/outlines/generate/json.py index 703447958..d098d920d 100644 --- a/outlines/generate/json.py +++ b/outlines/generate/json.py @@ -3,13 +3,10 @@ from functools import singledispatch from typing import Callable, Optional, Union +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel -from outlines.fsm.json_schema import ( - build_regex_from_schema, - get_schema_from_enum, - get_schema_from_signature, -) +from outlines.fsm.json_schema import get_schema_from_enum, get_schema_from_signature from outlines.generate.api import SequenceGeneratorAdapter from outlines.models import OpenAI from outlines.samplers import Sampler, multinomial diff --git a/outlines/processors/structured.py b/outlines/processors/structured.py index d2bc15f77..64892b73f 100644 --- a/outlines/processors/structured.py +++ b/outlines/processors/structured.py @@ -27,10 +27,11 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union import torch +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from outlines.fsm.guide import CFGGuide, Guide, RegexGuide -from outlines.fsm.json_schema import build_regex_from_schema, convert_json_schema_to_str +from outlines.fsm.json_schema import convert_json_schema_to_str from .base_logits_processor import OutlinesLogitsProcessor diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 6f0b59c50..9cfc110bb 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -7,9 +7,7 @@ import interegular import pytest -from pydantic import BaseModel, Field, constr - -from outlines.fsm.json_schema import ( +from outlines_core.fsm.json_schema import ( BOOLEAN, DATE, DATE_TIME, @@ -22,10 +20,11 @@ UUID, WHITESPACE, build_regex_from_schema, - get_schema_from_enum, - get_schema_from_signature, to_regex, ) +from pydantic import BaseModel, Field, constr + +from outlines.fsm.json_schema import get_schema_from_enum, get_schema_from_signature def test_function_basic(): @@ -75,7 +74,7 @@ class User(BaseModel): ) def test_match_integer(pattern, does_match): step = {"title": "Foo", "type": "integer"} - regex = to_regex(None, step) + regex = to_regex(step) assert regex == INTEGER value = pattern["integer"] @@ -102,7 +101,7 @@ def test_match_integer(pattern, does_match): ) def test_match_number(pattern, does_match): step = {"title": "Foo", "type": "number"} - regex = to_regex(None, step) + regex = to_regex(step) assert regex == NUMBER value = pattern["number"] @@ -137,7 +136,7 @@ def test_match_number(pattern, does_match): # String with maximum length ( {"title": "Foo", "type": "string", "maxLength": 3}, - f'"{STRING_INNER}{{,3}}"', + f'"{STRING_INNER}{{0,3}}"', [('"ab"', True), ('"a""', False), ('"abcd"', False)], ), # String with minimum length @@ -240,40 +239,43 @@ def test_match_number(pattern, does_match): [("0", True), ("1", True), ("a", False)], ), # Enum mix of types - ( - { - "title": "Foo", - "enum": [ - 6, - 5.3, - "potato", - True, - None, - { - "properties": { - "a": {"title": "A", "type": "number"}, - "b": {"title": "B", "type": "number"}, - }, - "required": ["a", "b"], - "title": "add", - "type": "object", - }, - ], - }, - r'(6|5\.3|"potato"|true|null|\{[ ]?"a"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?,[ ]?"b"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\})', - [ - ("6", True), - ("5.3", True), - ('"potato"', True), - ("true", True), - ("null", True), - ("523", False), - ("True", False), - ("None", False), - ('{"a": -1.0, "b": 1.1}', True), - ('{"a": "a", "b": 1.1}', False), - ], - ), + # + # Enums of objects are not supported by outlines-core yet, + # see https://github.com/dottxt-ai/outlines-core/issues/100 + # ( + # { + # "title": "Foo", + # "enum": [ + # 6, + # 5.3, + # "potato", + # True, + # None, + # { + # "properties": { + # "a": {"title": "A", "type": "number"}, + # "b": {"title": "B", "type": "number"}, + # }, + # "required": ["a", "b"], + # "title": "add", + # "type": "object", + # }, + # ], + # }, + # r'(6|5\.3|"potato"|true|null|\{[ ]?"a"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?,[ ]?"b"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\})', + # [ + # ("6", True), + # ("5.3", True), + # ('"potato"', True), + # ("true", True), + # ("null", True), + # ("523", False), + # ("True", False), + # ("None", False), + # ('{"a": -1.0, "b": 1.1}', True), + # ('{"a": "a", "b": 1.1}', False), + # ], + # ), # integer ( { @@ -308,7 +310,7 @@ def test_match_number(pattern, does_match): }, "required": ["count"], }, - '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]{,2})[ ]?\\}', + '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]{0,2})[ ]?\\}', [('{ "count": 100 }', True), ('{ "count": 1000 }', False)], ), # integer with minimum and maximum digits diff --git a/tests/generate/test_integration_transformers.py b/tests/generate/test_integration_transformers.py index 92c5d789c..7b7973d23 100644 --- a/tests/generate/test_integration_transformers.py +++ b/tests/generate/test_integration_transformers.py @@ -363,6 +363,7 @@ def mul(c: float, d: float) -> float: return c * d +@pytest.mark.xfail(reason="Enum of objects are not supported in outlines-core") def test_transformers_json_function_enum(model): prompt = "Output some JSON "