diff --git a/CHANGELOG.md b/CHANGELOG.md index 3457774..3f3225d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## v0.1.1 (2023-03-25) + +#### Changed + - Fixes to the README regarding validation utility functions. + - Renamed the ill-named `validate_entity` function to `resolve_entity` and added an explicit test. + + ## v0.1.0 (2023-03-25) The project's typing system was validated using mypy and refactored to follow @@ -5,7 +12,7 @@ Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). #### Added - FuzzValidator annotation type created to simplify design - - validate_python, validate_json, and validate_entity functions added + - validate_python and validate_json functions added - Added Language, LanguageName, and LanguageCode usable types - fuzztypes.logger and fuzztypes.utils module for downloading iso codes diff --git a/README.md b/README.md index ccebb57..8466879 100644 --- a/README.md +++ b/README.md @@ -146,28 +146,6 @@ assert obj.model_dump() == { } ``` -Types can also be used outside of Pydantic models to validate and normalize data: - -```python -from fuzztypes import Date, Fuzzmoji - -# access value via "call" (parenthesis) -assert Date("1 JAN 2023").isoformat() == "2023-01-01" -assert Fuzzmoji("tada") == '🎉' - -# access entity via "key lookup" (square brackets) -assert Fuzzmoji["movie cam"].value == "🎥" -assert Fuzzmoji["movie cam"].aliases == [':movie_camera:', 'movie camera'] -assert Fuzzmoji["movie cam"].model_dump() == { - 'value': '🎥', - 'label': None, - 'meta': None, - 'priority': None, - 'aliases': [':movie_camera:', 'movie camera'] -} -``` - - ## Installation Available on [PyPI](https://pypi.org/project/FuzzTypes/): @@ -212,19 +190,22 @@ specific data validation and normalization requirements. Usable types are pre-built annotation types in FuzzTypes that can be directly used in Pydantic models. They provide convenient and ready-to-use functionality for common data types and scenarios. 
-| Type | Description | -|-------------|-------------------------------------------------------------------------------------------| -| `ASCII` | Converts Unicode strings to ASCII equivalents using either `anyascii` or `unidecode`. | -| `Date` | Converts date strings to `date` objects using `dateparser`. | -| `Email` | Extracts email addresses from strings using a regular expression. | -| `Emoji` | Matches emojis based on Unicode Consortium aliases using the `emoji` library. | -| `Fuzzmoji` | Matches emojis using fuzzy string matching against aliases. | -| `Integer` | Converts numeric strings or words to integers using `number-parser`. | -| `Person` | Parses person names into subfields (e.g., first, last, suffix) using `python-nameparser`. | -| `SSN` | Extracts U.S. Social Security Numbers from strings using a regular expression. | -| `Time` | Converts datetime strings to `datetime` objects using `dateparser`. | -| `Vibemoji` | Matches emojis using semantic similarity against aliases. | -| `Zipcode` | Extracts U.S. ZIP codes (5 or 9 digits) from strings using a regular expression. | +| Type | Description | +|----------------|-------------------------------------------------------------------------------------------| +| `ASCII` | Converts Unicode strings to ASCII equivalents using either `anyascii` or `unidecode`. | +| `Date` | Converts date strings to `date` objects using `dateparser`. | +| `Email` | Extracts email addresses from strings using a regular expression. | +| `Emoji` | Matches emojis based on Unicode Consortium aliases using the `emoji` library. | +| `Fuzzmoji` | Matches emojis using fuzzy string matching against aliases. | +| `Integer` | Converts numeric strings or words to integers using `number-parser`. | +| `LanguageCode` | Resolves language to ISO language codes (e.g., "en"). | +| `LanguageName` | Resolves language to ISO language names (e.g., "English"). 
| +| `Language` | Resolves language to ISO language object (name, alpha_2, alpha_3, scope, type, etc.). | +| `Person` | Parses person names into subfields (e.g., first, last, suffix) using `python-nameparser`. | +| `SSN` | Extracts U.S. Social Security Numbers from strings using a regular expression. | +| `Time` | Converts datetime strings to `datetime` objects using `dateparser`. | +| `Vibemoji` | Matches emojis using semantic similarity against aliases. | +| `Zipcode` | Extracts U.S. ZIP codes (5 or 9 digits) from strings using a regular expression. | These usable types provide a wide range of commonly needed data validations and transformations, making it easier to work with various data formats and perform tasks like parsing, extraction, and matching. @@ -298,20 +279,6 @@ data for use in precision oncology clinical decision support systems. Contact me offerings. -## Roadmap - -Additional capabilities will soon be added: - -- Complete OnDiskValidator [fuzzy string matching](https://github.com/quickwit-oss/tantivy-py/issues/20). -- Reranking models -- Hybrid search (linear and reciprocal rank fusion using fuzzy and semantic) -- Trie-based autocomplete and aho-corasick search -- `Humanize` intword and ordinals -- `Pint` quantities -- `Country` and `Currency` codes/names - -The following usable types are planned for future implementation in FuzzTypes: - | Type | Description | |----------------|-------------------------------------------------------------------------------------------| | `AirportCode` | Represents airport codes (e.g., "ORD"). | @@ -319,8 +286,6 @@ The following usable types are planned for future implementation in FuzzTypes: | `CountryCode` | Represents ISO country codes (e.g., "US"). | | `Country` | Represents country names (e.g., "United States"). | | `Currency` | Represents currency codes (e.g., "USD"). | -| `LanguageCode` | Represents ISO language codes (e.g., "en"). | -| `Language` | Represents language names (e.g., "English"). 
| | `Quantity` | Converts strings to `Quantity` objects with value and unit using `pint`. | | `URL` | Represents normalized URLs with tracking parameters removed using `url-normalize`. | | `USStateCode` | Represents U.S. state codes (e.g., "CA"). | @@ -495,7 +460,6 @@ assert model.name == "JOHN" ``` - ### Regex The `Regex` base type allows matching values using a regular @@ -568,4 +532,51 @@ assert obj.model_dump(exclude_defaults=True, mode="json") == { "language_code": "en", "language_name": "English", } +``` + +### Validate Python and JSON functions + +Functional approaches to validating Python and JSON are available. +Below are examples for the `validate_python` and `validate_json` functions: + +```python +from pydantic import BaseModel +from fuzztypes import validate_python, validate_json, Integer, Date + +# validate python +assert validate_python(Integer, "two hundred") == 200 + +# validate json +class MyModel(BaseModel): + date: Date + +json = '{"date": "July 4th 2021"}' +obj = validate_json(MyModel, json) +assert obj.date.isoformat() == "2021-07-04" +``` + +### Resolve Entities from FuzzValidator or Annotation + +Entities can be resolved from `FuzzValidator` validators (such as `InMemoryValidator` +or `OnDiskValidator`) or from the defined `Annotation` type using the `resolve_entity` function: + +```python +from typing import Annotated +from fuzztypes import resolve_entity, InMemoryValidator + +elements = ["earth", "fire", "water", "air"] +ElementValidator = InMemoryValidator(elements) +Element = Annotated[str, ElementValidator] + +assert resolve_entity(ElementValidator, "EARTH").model_dump() == { + "aliases": [], + "label": None, + "meta": None, + "priority": None, + "value": "earth", +} + +assert resolve_entity(Element, "Air").model_dump( + exclude_defaults=True +) == {"value": "air"} ``` \ No newline at end of file diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index 05f5004..8b6fa54 100644 --- a/src/fuzztypes/__init__.py +++ 
b/src/fuzztypes/__init__.py @@ -21,7 +21,7 @@ # Validation from .validation import ( FuzzValidator, - validate_entity, + resolve_entity, validate_python, validate_json, get_type_adapter, @@ -89,7 +89,7 @@ "lazy", "logger", "utils", - "validate_entity", + "resolve_entity", "validate_json", "validate_python", ) diff --git a/src/fuzztypes/validation.py b/src/fuzztypes/validation.py index 26216f9..65a0e50 100644 --- a/src/fuzztypes/validation.py +++ b/src/fuzztypes/validation.py @@ -55,7 +55,7 @@ def validate_python(cls: Any, value: Any) -> Any: return ta.validate_python(value) -def validate_entity(cls: Any, value: Any) -> Optional[Entity]: +def resolve_entity(cls: Any, value: Any) -> Optional[Entity]: """ Returns entity from metadata if cls is a FuzzValidator. diff --git a/tests/in_memory/test_in_memory_tags_example.py b/tests/in_memory/test_in_memory_tags_example.py index 9721494..ef0923a 100644 --- a/tests/in_memory/test_in_memory_tags_example.py +++ b/tests/in_memory/test_in_memory_tags_example.py @@ -19,7 +19,7 @@ EntitySource, InMemoryValidator, flags, - validate_entity, + resolve_entity, validate_python, Entity, ) @@ -52,11 +52,11 @@ def Tag(TagSource): def test_get_entity_from_annotation(Tag): - entity = validate_entity(Tag, "2d") + entity = resolve_entity(Tag, "2d") assert isinstance(entity, Entity) assert entity.priority == 3 - entity = validate_entity(Tag, "3d") + entity = resolve_entity(Tag, "3d") assert isinstance(entity, Entity) assert entity.priority == 14 diff --git a/tests/on_disk/test_on_disk_alias.py b/tests/on_disk/test_on_disk_alias.py index 1349060..c30cdcb 100644 --- a/tests/on_disk/test_on_disk_alias.py +++ b/tests/on_disk/test_on_disk_alias.py @@ -75,9 +75,8 @@ def test_duplicate_records(): assert False, "Didn't raise exception!" 
except KeyError as e: assert ( - str(e) - == '\'Key Error: b ' - '["b" could not be resolved, did you mean "c", "a", or "d"?]\'' + str(e) == "'Key Error: b " + '["b" could not be resolved, did you mean "c", "a", or "d"?]\'' ) A = OnDiskValidator("DupeRec", source, tiebreaker_mode="lesser") diff --git a/tests/on_disk/test_on_disk_name.py b/tests/on_disk/test_on_disk_name.py index 7578e3f..e97fe66 100644 --- a/tests/on_disk/test_on_disk_name.py +++ b/tests/on_disk/test_on_disk_name.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, OnDiskValidator, flags, validate_entity +from fuzztypes import NamedEntity, OnDiskValidator, flags, resolve_entity names = ["George Washington", "John Adams", "Thomas Jefferson"] President = OnDiskValidator( @@ -34,7 +34,7 @@ def test_namestr_getitem(): entity = NamedEntity(value="Thomas Jefferson") assert President["Thomas Jefferson"] == entity assert President["THOMAS JEFFERSON"] == entity - assert validate_entity(President, "Thomas Jefferson") == entity + assert resolve_entity(President, "Thomas Jefferson") == entity assert CasedPrez["Thomas Jefferson"] == entity try: diff --git a/tests/test_readme.py b/tests/test_readme.py index 445dd77..5b8f8a6 100644 --- a/tests/test_readme.py +++ b/tests/test_readme.py @@ -279,3 +279,42 @@ class MyModel(BaseModel): model = MyModel(ip_address="My internet IP address is 192.168.127.12") assert model.ip_address == "192.168.127.12" + + +def test_validate_functions(): + from fuzztypes import validate_python, validate_json, resolve_entity, Date + + # validate python + assert validate_python(Integer, "two hundred") == 200 + + # validate json + class MyModel(BaseModel): + date: Date + + json = '{"date": "July 4th 2021"}' + obj = validate_json(MyModel, json) + assert obj.date.isoformat() == "2021-07-04" + + +def test_resolve_entity(): + from fuzztypes import resolve_entity, InMemoryValidator + + elements = ["earth", "fire", "water", "air"] + 
ElementValidator = InMemoryValidator(elements) + Element = Annotated[str, ElementValidator] + + # resolve using validator + entity = resolve_entity(ElementValidator, "EARTH") + assert entity is not None + assert entity.model_dump() == { + "aliases": [], + "label": None, + "meta": None, + "priority": None, + "value": "earth", + } + + # resolve using annotation type + entity = resolve_entity(Element, "Air") + assert entity is not None + assert entity.model_dump(exclude_defaults=True) == {"value": "air"}