diff --git a/.gitignore b/.gitignore index c4f6c53..5da7fbb 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,5 @@ wheels/ model_cache/ .DS_Store /training/ -profile.dat \ No newline at end of file +profile.dat +notebooks \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3457774 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,22 @@ +## v0.1.0 (2024-03-25) + +The project's typing system was validated using mypy and refactored to follow +Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). + +#### Added + - FuzzValidator annotation type created to simplify design + - validate_python, validate_json, and validate_entity functions added + - Added Language, LanguageName, and LanguageCode usable types + - fuzztypes.logger and fuzztypes.utils module for downloading iso codes + +#### Changed + - Renamed OnDisk to OnDiskValidator + - Renamed InMemory to InMemoryValidator + - Refactored InMemoryValidator and OnDiskValidator to use FuzzValidator + - Refactored Person to use FuzzValidator + - Renamed Regex to RegexValidator + - Changed error message to more common "did you mean" message format + +#### Removed + - abstract.py module and AbstractType class, simplified by FuzzValidator + - function.py module and Function annotation type, replaced by FuzzValidator \ No newline at end of file diff --git a/README.md b/README.md index 7eeacd9..ccebb57 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,6 @@ Designed for simplicity, it provides powerful normalization capabilities (e.g. named entity linking) to ensure structured data is composed of "smart things" not "dumb strings". -*Note: FuzzTypes is currently experimental and there could be breaking -changes to its API over the next few weeks.* ## Getting Started @@ -36,16 +34,20 @@ FuzzTypes expands on the standard data conversions handled by Pydantic and provides a variety of autocorrecting annotation types.
```python +from datetime import datetime +from typing import Annotated + from pydantic import BaseModel + from fuzztypes import ( ASCII, Datetime, Email, Fuzzmoji, - InMemory, + InMemoryValidator, Integer, Person, - Regex, + RegexValidator, ZipCode, flags, ) @@ -53,13 +55,17 @@ from fuzztypes import ( # define a source, see EntitySource for using TSV, CSV, JSONL inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] -# define a named entity type in memory. use OnDisk for larger data sets. -Inventor = InMemory(inventors, search_flag=flags.FuzzSearch) +# define an in-memory validator with fuzz search enabled. +Inventor = Annotated[ + str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) +] # custom Regex type for finding twitter handles. -Handle = Regex(r'@\w{1,15}', examples=["@genomoncology"]) +Handle = Annotated[ + str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) +] -# define a Pydantic class with 9 fuzzy type attriubutes +# define a Pydantic class with 9 fuzzy type attributes class Fuzzy(BaseModel): ascii: ASCII email: Email @@ -105,7 +111,7 @@ assert obj.integer == 55 assert obj.inventor == "Ada Lovelace" # human name parser (title, first, middle, last, suffix, nickname) -assert str(obj.person) == 'Mr. Arthur Herbert Fonzarelli (fonzie)' +assert str(obj.person) == "Mr. Arthur H. 
Fonzarelli (fonzie)" assert obj.person.short_name == "Arthur Fonzarelli" assert obj.person.nickname == "fonzie" assert obj.person.last == "Fonzarelli" @@ -117,7 +123,27 @@ assert obj.time.isoformat() == "2025-01-01T05:00:00" assert obj.zipcode == "12345-6789" # print JSON on success -print(obj.model_dump_json(indent=4)) +assert obj.model_dump() == { + "ascii": "anthropos", + "email": "jdoe@example.com", + "emoji": "💭", + "handle": "@imaurer", + "integer": 55, + "inventor": "Ada Lovelace", + "person": { + "first": "Arthur", + "init_format": "{first} {middle} {last}", + "last": "Fonzarelli", + "middle": "H.", + "name_format": "{title} {first} {middle} {last} {suffix} " + "({nickname})", + "nickname": "fonzie", + "suffix": "", + "title": "Mr.", + }, + "time": datetime(2025, 1, 1, 5), + "zipcode": "12345-6789", +} ``` Types can also be used outside of Pydantic models to validate and normalize data: @@ -164,19 +190,18 @@ There is a read-only notebook that you can copy and edit to try out FuzzTypes: [https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing](https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing) -## Base Types +## Base Validators -Base types are the fundamental building blocks in FuzzTypes. They provide the core functionality and can be used to -create custom annotation types tailored to specific use cases. +Base validators are the building blocks of FuzzTypes that can be used for creating custom "usable types". -| Type | Description | -|------------|--------------------------------------------------------------------------------------------| -| `DateType` | Base type for fuzzy parsing date objects. | -| `Function` | Allows using any function that accepts one value and returns one value for transformation. | -| `InMemory` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. 
| -| `OnDisk` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | -| `Regex` | Allows matching values using a regular expression pattern. | -| `TimeType` | Base type for fuzzy parsing datetime objects (e.g., "tomorrow at 5am"). | +| Type | Description | +|---------------------|---------------------------------------------------------------------------------------------| +| `DateValidator` | Base date type, pass in arguments such as `date_order`, `strict` and `relative_base`. | +| `FuzzValidator` | Validator class that calls a provided function and handles core and json schema config. | +| `InMemoryValidator` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. | +| `OnDiskValidator` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | +| `RegexValidator` | Regular expression pattern matching base validator. | +| `DatetimeValidator` | Base datetime type, pass in arguments such as `date_order`, `timezone` and `relative_base`. | These base types offer flexibility and extensibility, enabling you to create custom annotation types that suit your specific data validation and normalization requirements. @@ -205,10 +230,9 @@ These usable types provide a wide range of commonly needed data validations and easier to work with various data formats and perform tasks like parsing, extraction, and matching. -## Configuring FuzzTypes +## InMemoryValidator and OnDiskValidator Configuration -FuzzTypes provides a set of configuration options that allow you to customize the behavior of the annotation types. -These options can be passed as arguments when creating an instance of a FuzzType. +The `InMemoryValidator` and `OnDiskValidator` objects work with lists of entities.
The following table describes the available configuration options: @@ -224,11 +248,6 @@ The following table describes the available configuration options: | `notfound_mode` | `Literal["raise", "none", "allow"]` | `"raise"` | The action to take when a matching entity is not found. Available options are "raise" (raises an exception), "none" (returns `None`), and "allow" (returns the input key as the value). | | `search_flag` | `flags.SearchFlag` | `flags.DefaultSearch` | The search strategy to use for finding matches. It is a combination of flags that determine which fields of the `NamedEntity` are considered for matching and whether fuzzy or semantic search is enabled. Available options are defined in the `flags` module. | | `tiebreaker_mode` | `Literal["raise", "lesser", "greater"]` | `"raise"` | The strategy to use for resolving ties when multiple matches have the same similarity score. Available options are "raise" (raises an exception), "lesser" (returns the match with the lower value), and "greater" (returns the match with the greater value). | -| `validator_mode` | `Literal["before"]` | `"before"` | The validation mode to use for Pydantic. Currently, only the "before" mode is fully tested and supported, which resolves the value before validation. | - -These configuration options provide flexibility in tailoring the behavior of FuzzTypes to suit your specific use case. -By adjusting these options, you can control aspects such as case sensitivity, device selection, encoding mechanism, -search strategy, similarity thresholds, and more. 
## Lazy Dependencies @@ -252,22 +271,22 @@ pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz ``` -| Fuzz Type | Library | License | Purpose | -|------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| -| ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | -| ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | -| Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | -| Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | -| Fuzz | [rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | -| InMemory | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | -| InMemory | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | -| InMemory | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | -| Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | -| OnDisk | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | -| OnDisk | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | -| OnDisk | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | -| OnDisk | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text search (FTS) for LanceDB. 
| -| Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | +| Fuzz Type | Library | License | Purpose | +|-------------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| +| ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | +| ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | +| Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | +| Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | +| Fuzz | [rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | +| InMemoryValidator | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | +| InMemoryValidator | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | +| InMemoryValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | +| Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | +| OnDiskValidator | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | +| OnDiskValidator | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | +| OnDiskValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | +| OnDiskValidator | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text search (FTS) for LanceDB. 
| +| Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | ## Maintainer @@ -283,7 +302,7 @@ offerings. Additional capabilities will soon be added: -- Complete OnDisk [fuzzy string matching](https://github.com/quickwit-oss/tantivy-py/issues/20). +- Complete OnDiskValidator [fuzzy string matching](https://github.com/quickwit-oss/tantivy-py/issues/20). - Reranking models - Hybrid search (linear and reciprocal rank fusion using fuzzy and semantic) - Trie-based autocomplete and aho-corasick search @@ -355,6 +374,7 @@ loading entities from a callable function. Example: ```python +from pathlib import Path from fuzztypes import EntitySource, NamedEntity # Load entities from a CSV file @@ -370,41 +390,24 @@ def load_animals(): animal_source = EntitySource(load_animals) ``` -### Function Base Type - -The `Function` base type allows you to use any function that accepts -one value and returns one value for transformation. It is useful -for creating simple annotation types that perform custom data -transformations. - -Example: -```python -from fuzztypes import Function - -# Create a custom annotation type that converts a value to uppercase -UpperCase = Function(str.upper) +### InMemoryValidator Base Type -class MyModel(BaseModel): - name: UpperCase - -model = MyModel(name="john") -assert model.name == "JOHN" -``` - -### InMemory Base Type - -The `InMemory` base type enables matching entities in memory using +The `InMemoryValidator` base type enables matching entities in memory using exact, alias, fuzzy, or semantic search. It is suitable for small to medium-sized datasets that can fit in memory and provides fast matching capabilities. 
Example: ```python -from fuzztypes import InMemory, flags +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import InMemoryValidator, flags # Create a custom annotation type for matching fruits fruits = ["Apple", "Banana", "Orange"] -Fruit = InMemory(fruits, search_flag=flags.FuzzSearch) +Fruit = Annotated[ + str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) +] class MyModel(BaseModel): fruit: Fruit @@ -413,53 +416,86 @@ model = MyModel(fruit="appel") assert model.fruit == "Apple" ``` -### OnDisk Base Type +### OnDiskValidator Base Type -The `OnDisk` base type performs matching entities stored on disk +The `OnDiskValidator` base type performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. It leverages the LanceDB library for efficient storage and retrieval of entities. -`OnDisk` is recommended for large datasets that cannot fit in memory. +`OnDiskValidator` is recommended for large datasets that cannot fit in memory. 
Example: ```python -from fuzztypes import OnDisk, flags +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import OnDiskValidator -# Create a custom annotation type for matching countries -countries = ["United States", "United Kingdom", "Canada"] -Country = OnDisk("Country", countries, search_flag=flags.FuzzSearch) +# Create a custom annotation type for matching countries stored on disk +countries = [ + ("United States", "US"), + ("United Kingdom", "UK"), + ("Canada", "CA"), +] +Country = Annotated[str, OnDiskValidator("Country", countries)] class MyModel(BaseModel): country: Country -model = MyModel(country="USA") -assert model.country == "United States" +assert MyModel(country="Canada").country == "Canada" +assert MyModel(country="US").country == "United States" ``` ### DateType and TimeType -The `DateType` and `TimeType` base types provide fuzzy parsing +The `DateValidator` and `DatetimeValidator` base types provide fuzzy parsing capabilities for date and datetime objects, respectively. They allow you to define flexible date and time formats and perform parsing based on specified settings such as date order, timezone, and relative base. +Example: + +```python +from datetime import date, datetime +from pydantic import BaseModel +from typing import Annotated +from fuzztypes import DateValidator, DatetimeValidator + +MyDate = Annotated[date, DateValidator(date_order="MDY")] +MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] + +class MyModel(BaseModel): + date: MyDate + time: MyTime + +model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") +assert model.date.isoformat() == "2023-01-01" +assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" +``` + + +### FuzzValidator + +The `FuzzValidator` is the base of the fuzztypes typing system. +It can be used directly to wrap any python function. 
+ Example: ```python -from fuzztypes import DateType, DatetimeType +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import FuzzValidator -# Create custom annotation types for parsing dates and times -Date = DateType(date_order="MDY") -Time = DatetimeType(timezone="UTC") +# Create a custom annotation type that converts a value to uppercase +UpperCase = Annotated[str, FuzzValidator(str.upper)] class MyModel(BaseModel): - date: Date - time: Time + name: UpperCase -model = MyModel(date="4/20/2023", time="10:30 PM") -print(model.date) # Output: datetime.date(2023, 4, 20) -print(model.time) # Output: datetime.datetime(2023, 4, 20, 22, 30, tzinfo=) +model = MyModel(name="john") +assert model.name == "JOHN" ``` + + ### Regex The `Regex` base type allows matching values using a regular @@ -468,15 +504,68 @@ validate and extract specific patterns from input values. Example: ```python -from fuzztypes import Regex +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import RegexValidator # Create a custom annotation type for matching email addresses -Email = Regex(r"[\w\.-]+@[\w\.-]+\.\w+") +IPAddress = Annotated[ + str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") +] class MyModel(BaseModel): - email: Email + ip_address: IPAddress -model = MyModel(email="john.doe@example.com") -assert model.email == "john.doe@example.com" +model = MyModel(ip_address="My internet IP address is 192.168.127.12") +assert model.ip_address == "192.168.127.12" ``` +### Languages + +Languages are loaded from the [Debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes/) project. + +Languages are resolved using their preferred, common, inverted, bibliographic name, or 2 or 3 letter alpha code. + +Languages can be included as a string name (LanguageName), string code (LanguageCode) or full language object. + +The preferred code is the 2 letter version and will be used if available. Otherwise, the 3 letter alpha code is used. 
+ +Example: + +```python +from pydantic import BaseModel +from fuzztypes import ( + Language, + LanguageName, + LanguageCode, + LanguageScope, + LanguageType, + LanguageNamedEntity, + validate_python, +) +class Model(BaseModel): + language_code: LanguageCode + language_name: LanguageName + language: Language + +# Test that Language resolves to the complete language object +data = dict(language_code="en", language="English", language_name="ENG") +obj = validate_python(Model, data) +assert obj.language_code == "en" +assert obj.language_name == "English" +assert obj.language.scope == LanguageScope.INDIVIDUAL +assert obj.language.type == LanguageType.LIVING +assert isinstance(obj.language, LanguageNamedEntity) +assert obj.model_dump(exclude_defaults=True, mode="json") == { + "language": { + "aliases": ["en", "eng"], + "alpha_2": "en", + "alpha_3": "eng", + "scope": "I", + "type": "L", + "value": "English", + }, + "language_code": "en", + "language_name": "English", +} +``` \ No newline at end of file diff --git a/notebooks/00_readme_examples.ipynb b/notebooks/00_readme_examples.ipynb deleted file mode 100644 index babf61d..0000000 --- a/notebooks/00_readme_examples.ipynb +++ /dev/null @@ -1,200 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## Readme Code Examples\n", - "These are the code examples from the Readme document." 
- ], - "metadata": { - "collapsed": false - }, - "id": "3542b4dbc383efb7" - }, - { - "cell_type": "code", - "execution_count": 14, - "outputs": [], - "source": [ - "from pydantic import BaseModel\n", - "\n", - "class Normal(BaseModel):\n", - " boolean: bool\n", - " float: float\n", - " integer: int\n", - " \n", - "obj = Normal(\n", - " boolean='yes',\n", - " float='2',\n", - " integer='3',\n", - ")\n", - "assert obj.boolean is True\n", - "assert obj.float == 2.0\n", - "assert obj.integer == 3" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-19T16:35:29.171400Z", - "start_time": "2024-03-19T16:35:29.169287Z" - } - }, - "id": "55243aba75d3e44d" - }, - { - "cell_type": "code", - "execution_count": 15, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"ascii\": \"anthropos\",\n", - " \"email\": \"jdoe@example.com\",\n", - " \"emoji\": \"💭\",\n", - " \"handle\": \"@imaurer\",\n", - " \"integer\": 55,\n", - " \"inventor\": \"Ada Lovelace\",\n", - " \"person\": {\n", - " \"name_format\": \"{title} {first} {middle} {last} {suffix} ({nickname})\",\n", - " \"init_format\": \"{first} {middle} {last}\",\n", - " \"title\": \"Mr.\",\n", - " \"first\": \"Arthur\",\n", - " \"middle\": \"Herbert\",\n", - " \"last\": \"Fonzarelli\",\n", - " \"suffix\": \"\",\n", - " \"nickname\": \"fonzie\"\n", - " },\n", - " \"time\": \"2025-01-01T05:00:00\",\n", - " \"zipcode\": \"12345-6789\"\n", - "}\n" - ] - } - ], - "source": [ - "from pydantic import BaseModel\n", - "from fuzztypes import (\n", - " ASCII,\n", - " Datetime,\n", - " Email,\n", - " Fuzzmoji,\n", - " InMemory,\n", - " Integer,\n", - " Person,\n", - " Regex,\n", - " ZipCode,\n", - " flags,\n", - ")\n", - "\n", - "# define a source, see EntitySource for using TSV, CSV, JSONL\n", - "inventors = [\"Ada Lovelace\", \"Alan Turing\", \"Claude Shannon\"]\n", - "\n", - "# define a named entity type in memory. 
use OnDisk for larger data sets.\n", - "Inventor = InMemory(inventors, search_flag=flags.FuzzSearch)\n", - "\n", - "# custom Regex type for finding twitter handles.\n", - "Handle = Regex(r'@\\w{1,15}', examples=[\"@genomoncology\"])\n", - "\n", - "# define a Pydantic class with 9 fuzzy type attriubutes\n", - "class Fuzzy(BaseModel):\n", - " ascii: ASCII\n", - " email: Email\n", - " emoji: Fuzzmoji\n", - " handle: Handle\n", - " integer: Integer\n", - " inventor: Inventor\n", - " person: Person\n", - " time: Datetime\n", - " zipcode: ZipCode\n", - "\n", - "# create an instance of class Fuzzy\n", - "obj = Fuzzy(\n", - " ascii=\"άνθρωπος\",\n", - " email=\"John Doe \",\n", - " emoji='thought bubble',\n", - " handle='Ian Maurer (@imaurer)',\n", - " integer='fifty-five',\n", - " inventor='ada luvlace',\n", - " person='mr. arthur herbert fonzarelli (fonzie)',\n", - " time='5am on Jan 1, 2025',\n", - " zipcode=\"(Zipcode: 12345-6789)\",\n", - ")\n", - "\n", - "# test the autocorrecting performed\n", - "\n", - "# greek for man: https://en.wiktionary.org/wiki/άνθρωπος\n", - "assert obj.ascii == \"anthropos\"\n", - "\n", - "# extract email via regular expression\n", - "assert obj.email == \"jdoe@example.com\"\n", - "\n", - "# fuzzy match \"thought bubble\" to \"thought balloon\" emoji\n", - "assert obj.emoji == \"💭\"\n", - "\n", - "# simple, inline regex example (see above Handle type)\n", - "assert obj.handle == \"@imaurer\"\n", - "\n", - "# convert integer word phrase to integer value\n", - "assert obj.integer == 55\n", - "\n", - "# case-insensitive fuzzy match on lowercase, misspelled name\n", - "assert obj.inventor == \"Ada Lovelace\"\n", - "\n", - "# human name parser (title, first, middle, last, suffix, nickname)\n", - "assert str(obj.person) == 'Mr. 
Arthur Herbert Fonzarelli (fonzie)'\n", - "assert obj.person.short_name == \"Arthur Fonzarelli\"\n", - "assert obj.person.nickname == \"fonzie\"\n", - "assert obj.person.last == \"Fonzarelli\"\n", - "\n", - "# convert time phrase to datetime object\n", - "assert obj.time.isoformat() == \"2025-01-01T05:00:00\"\n", - "\n", - "# extract zip5 or zip9 formats using regular expressions\n", - "assert obj.zipcode == \"12345-6789\"\n", - "\n", - "# print JSON on success\n", - "print(obj.model_dump_json(indent=4))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-19T16:35:51.056350Z", - "start_time": "2024-03-19T16:35:50.994941Z" - } - }, - "id": "6c30a7cafa50364e" - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "b3b7c2fac600ccd8" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/pyproject.toml b/pyproject.toml index adbe1e8..60e16af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,12 +25,14 @@ dynamic = ["version"] [project.optional-dependencies] test = [ "pytest", + "pytest-mock", "coverage[toml]", ] local = [ "build", "jupyter", "ipython", + "mypy", "pip", "setuptools", "twine", @@ -59,11 +61,6 @@ ext = [ path = "src/fuzztypes/__init__.py" [tool.mypy] -strict = true - -[[tool.mypy.overrides]] -module = "gpt.tests.*" -ignore_missing_imports = true check_untyped_defs = true [tool.pytest.ini_options] diff --git a/requirements-dev.txt b/requirements-dev.txt index ec60578..2c481d7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -217,6 +217,9 @@ 
more-itertools==10.2.0 # via jaraco-classes mpmath==1.3.0 # via sympy +mypy==1.9.0 +mypy-extensions==1.0.0 + # via mypy nameparser==1.1.3 nbclient==0.10.0 # via nbconvert @@ -320,6 +323,8 @@ pylance==0.10.2 pyproject-hooks==1.0.0 # via build pytest==8.0.1 + # via pytest-mock +pytest-mock==3.12.0 python-dateutil==2.9.0.post0 # via # arrow @@ -434,6 +439,7 @@ tomli==2.0.1 # build # coverage # jupyterlab + # mypy # pyproject-hooks # pytest torch==2.2.1 @@ -480,6 +486,7 @@ typing-extensions==4.9.0 # async-lru # huggingface-hub # ipython + # mypy # pydantic # pydantic-core # torch diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index b52020a..05f5004 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -1,32 +1,56 @@ +__version__ = "0.1.0" + +# logging +import logging + +logger = logging.getLogger("fuzztypes") +logger.setLevel(logging.WARNING) + +# flags and constants from . import flags from . import const + +# utilities +from . import utils from . import lazy # Schema from .entity import Entity, NamedEntity, EntitySource -from .match import Match, MatchList, Record +from .match import Match, MatchResult, Record -# Hidden Abstract Types -from . import abstract +# Validation +from .validation import ( + FuzzValidator, + validate_entity, + validate_python, + validate_json, + get_type_adapter, +) -# Base Entity Types -from .in_memory import InMemory -from .on_disk import OnDisk +# Named Entity Storage +from . 
import storage +from .in_memory import InMemoryValidator +from .on_disk import OnDiskValidator # Base Non-Entity Types -from .function import Function -from .regex import Regex +from .regex import RegexValidator # Usable Types from .ascii import ASCII -from .date import Date, DateType, Datetime, DatetimeType +from .date import Date, DateValidator, Datetime, DatetimeValidator from .emojis import Emoji, Fuzzmoji, Vibemoji from .integer import Integer +from .language import ( + Language, + LanguageCode, + LanguageName, + LanguageNamedEntity, + LanguageScope, + LanguageType, +) from .person import Person from .regex import Email, SSN, ZipCode -__version__ = "0.0.2" - __all__ = ( "ASCII", @@ -35,25 +59,37 @@ "Emoji", "Entity", "EntitySource", - "Function", "Fuzzmoji", - "InMemory", + "FuzzValidator", + "InMemoryValidator", "Integer", + "Language", + "LanguageCode", + "LanguageName", + "LanguageNamedEntity", + "LanguageScope", + "LanguageType", "Match", - "MatchList", + "MatchResult", "NamedEntity", - "OnDisk", + "OnDiskValidator", "Person", "Record", - "Regex", + "RegexValidator", "SSN", "Date", - "DateType", + "DateValidator", "Datetime", - "DatetimeType", + "DatetimeValidator", "Vibemoji", "ZipCode", "const", "flags", + "get_type_adapter", "lazy", + "logger", + "utils", + "validate_entity", + "validate_json", + "validate_python", ) diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py deleted file mode 100644 index b879257..0000000 --- a/src/fuzztypes/abstract.py +++ /dev/null @@ -1,259 +0,0 @@ -from datetime import date, datetime -from typing import Any, Callable, Type, Union, Optional, Iterable, List - -from pydantic import ( - BaseModel, - GetCoreSchemaHandler, - GetJsonSchemaHandler, - json_schema, -) -from pydantic_core import CoreSchema, PydanticCustomError, core_schema - -from fuzztypes import NamedEntity, Entity, MatchList, const, flags, lazy - -SupportedType = Union[str, float, int, dict, list, date, datetime, BaseModel] - - -def AbstractType( - 
lookup_function: Callable[[str], MatchList], - *, - EntityType: Type = Entity, - examples: list = None, - input_type: Type[SupportedType] = str, - notfound_mode: const.NotFoundMode = "raise", - output_type: Type[SupportedType] = None, - validator_mode: const.ValidatorMode = "before", -): - """ - Factory function to create a specialized AbstractType, which is a Pydantic - based type with added fuzzy matching capabilities. - - :param lookup_function: Function to perform the lookup. - :param EntityType: Type of Entity (e.g. NamedEntity) to return. - :param examples: Example values used in schema generation. - :param input_type: The underlying Python data type. - :param notfound_mode: 'raise' an error, set 'none', or 'allow' unknown key. - :param output_type: Specify only if different from input_type. - :param validator_mode: Validation mode ('before', 'after', 'plain', 'wrap') - - :return: A specialized AbstractType based on the provided specifications. - """ - - output_type = output_type or input_type - - # noinspection PyClassHasNoInit - class _AbstractType(output_type): - @classmethod - def __get_pydantic_core_schema__( - cls, - source_type: type, - handler: GetCoreSchemaHandler, - ) -> CoreSchema: - """ - Generate the Pydantic core schema for the AbstractType. - - This method is used internally by Pydantic to generate the schema - based on the provided validation mode and input/output types. - """ - validation_function_map = { - "before": core_schema.with_info_before_validator_function, - "after": core_schema.with_info_before_validator_function, - "plain": core_schema.with_info_plain_validator_function, - "wrap": core_schema.with_info_wrap_validator_function, - } - - validation_function = validation_function_map[validator_mode] - in_schema = handler(input_type) - - if output_type and output_type != input_type: - # used for Person where name (str) or Person (BaseModel) used. 
- out_schema = handler(output_type) - in_schema = core_schema.union_schema([in_schema, out_schema]) - - if notfound_mode == "none": - in_schema = core_schema.nullable_schema(in_schema) - - return validation_function(cls, in_schema) - - @classmethod - def __get_pydantic_json_schema__( - cls, - schema: CoreSchema, - handler: GetJsonSchemaHandler, - ) -> json_schema.JsonSchemaValue: - """ - Generate the JSON schema for the AbstractType. - - This method is used internally by Pydantic to generate the JSON - schema representation of the AbstractType, including any examples. - """ - schema = handler(schema) - if examples is not None: - schema["examples"] = examples - return schema - - def __new__(cls, key: str, _: Any = None) -> Optional[Any]: - """ - Doesn't create an AbstractType, it's actually a class-level - __call__ function. - - Pydantic core schema logic will pass an additional argument - that can be ignored. - - It retrieves the entity associated with the provided key. - If an entity is found, it returns the value of the entity. - If no entity is found, it returns None. - If an exception is raised, it is will not be caught. - """ - entity = cls.lookup(key) - if entity: - return entity.value - - @classmethod - def __class_getitem__(cls, key) -> EntityType: - """ - Get the entity associated with the given key using dictionary-like - access. - - This method allows retrieving the entity using dictionary-like - syntax (e.g., AbstractType[key]). - - If entity found, it is returned. - If entity not found, raise a KeyError based on PydanticCustomError. - """ - try: - return cls.lookup(key) - except PydanticCustomError as err: - raise KeyError(f"Key Error: {key} [{err}]") from err - - @classmethod - def lookup(cls, key: str) -> Optional[EntityType]: - """ - Lookup the entity for the given key. - - This method attempts to find the entity associated with the - provided key. - - If a match is found, it returns the corresponding entity. 
- - If no match is found, takes action based on the notfound_mode: - "none": returns None (if notfound_mode is "none") - "allow": returns an entity with the key as value - "raise": raises a PydanticCustomError - """ - match_list: MatchList = lookup_function(key) - - if match_list.success: - return match_list.entity - - if notfound_mode == "allow": - return EntityType(value=key) - - if notfound_mode == "none": - return - - msg = "key ({key}) could not be resolved" - ctx = dict(key=key) - if match_list: - ctx["near"] = [str(m) for m in match_list] - msg += f", closest non-matches = {match_list}" - raise PydanticCustomError("key_not_found", msg, ctx) - - return _AbstractType - - -class AbstractStorage: - def __init__( - self, - source: Iterable[NamedEntity], - *, - case_sensitive: bool = False, - encoder: Union[Callable, str, object] = None, - device: const.DeviceList = "cpu", - fuzz_scorer: str = "token_sort_ratio", - limit: int = 10, - min_similarity: float = 80.0, - search_flag: flags.SearchFlag = flags.DefaultSearch, - tiebreaker_mode: const.TiebreakerMode = "raise", - ): - assert not search_flag.is_hybrid, "Hybrid search not yet supported!" 
- - self.source = source - - # options - self.case_sensitive = case_sensitive - self.device = device - self.limit = limit - self.min_similarity = min_similarity - self.prepped = False - self.search_flag = search_flag - self.tiebreaker_mode = tiebreaker_mode - - # store string for lazy loading - self._fuzz_scorer = fuzz_scorer - self._encoder = encoder - self._vect_dimensions = None - - def __call__(self, key: str) -> MatchList: - if not self.prepped: - self.prepped = True - self.prepare() - - match_list = self.get(key) - match_list.choose(self.min_similarity, self.tiebreaker_mode) - return match_list - - def prepare(self): - raise NotImplementedError - - def get(self, key: str) -> MatchList: - raise NotImplementedError - - def normalize(self, key: str): - if key: - key = key.strip() - if self.case_sensitive: - return key - else: - return key.lower() - - # - # encoding - # - - @property - def encoder(self): - return lazy.create_encoder(self._encoder, device=self.device) - - @property - def vect_dimensions(self): - if self._vect_dimensions is None: - dummy_encoded = self.encode([""]) - self._vect_dimensions = dummy_encoded.shape[1] - return self._vect_dimensions - - def encode(self, values: List[str]): - return self.encoder( - values, - ) - - # - # fuzzy matching - # - - @property - def rapidfuzz(self): - return lazy.lazy_import("rapidfuzz") - - @property - def fuzz_scorer(self): - return getattr( - self.rapidfuzz.fuzz, - self._fuzz_scorer, - self.rapidfuzz.fuzz.token_sort_ratio, - ) - - def fuzz_clean(self, term: str) -> str: - # no really, it's a string - # noinspection PyTypeChecker - return self.rapidfuzz.utils.default_process(term) diff --git a/src/fuzztypes/ascii.py b/src/fuzztypes/ascii.py index 32c710e..726ab29 100644 --- a/src/fuzztypes/ascii.py +++ b/src/fuzztypes/ascii.py @@ -1,6 +1,6 @@ -from typing import Callable +from typing import Annotated, Any, Callable -from fuzztypes import Function, lazy +from fuzztypes import FuzzValidator, lazy _tx = None @@ 
-27,9 +27,9 @@ def get_tx() -> Callable: # pragma: no cover return _tx -def to_ascii(key: str) -> str: +def to_ascii(key: Any) -> str: f = _tx or get_tx() - return f(key) + return f(str(key)) -ASCII = Function(to_ascii) +ASCII = Annotated[str, FuzzValidator(to_ascii)] diff --git a/src/fuzztypes/const.py b/src/fuzztypes/const.py index 755dbbe..2728454 100644 --- a/src/fuzztypes/const.py +++ b/src/fuzztypes/const.py @@ -4,7 +4,8 @@ # Home directory of fuzztypes library. FuzzHome = "~/.local/fuzztypes/" FuzzHome = os.path.expanduser(os.environ.get("FUZZTYPES_HOME", FuzzHome)) -OnDiskPath = os.path.join(FuzzHome, "on_disk") +StoredValidatorPath = os.path.join(FuzzHome, "on_disk") +DownloadsPath = os.path.join(FuzzHome, "downloads") # Default encoder to use when generating semantic embeddings. # Override with environment variable `FUZZTYPES_DEFAULT_ENCODER`. diff --git a/src/fuzztypes/date.py b/src/fuzztypes/date.py index e014b0c..0f1247f 100644 --- a/src/fuzztypes/date.py +++ b/src/fuzztypes/date.py @@ -1,22 +1,19 @@ import datetime -from typing import Optional, Union, Type +from typing import Annotated, Optional, Union -from . import Entity, MatchList, abstract, const, lazy +from . 
import FuzzValidator, const, lazy -date_or_datetime = Union[datetime.date, datetime.datetime] +DateOrDatetime = Union[datetime.date, datetime.datetime] -def DateType( - date_order: const.DateOrder = None, - examples: Optional[list] = None, +def DateValidator( + date_order: Optional[const.DateOrder] = None, + is_date: bool = True, languages: Optional[list[str]] = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[date_or_datetime] = datetime.date, timezone: Optional[str] = None, - validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, - relative_base: Optional[date_or_datetime] = None, + relative_base: Optional[DateOrDatetime] = None, ): DateDataParser = lazy.lazy_import("dateparser.date", "DateDataParser") languages = languages or ["en"] @@ -35,50 +32,32 @@ def DateType( parser = DateDataParser(languages=languages, settings=settings) - def parse(key: str) -> MatchList: - match_list = MatchList() + def parse(key: str) -> DateOrDatetime: value = parser.get_date_data(key).date_obj - if value is not None: - if input_type is datetime.date: - value = value.date() - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list + value = value.date() if (value and is_date) else value + return value - return abstract.AbstractType( - parse, - examples=examples, - input_type=input_type, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) + return FuzzValidator(parse) -def DatetimeType( - date_order: const.DateOrder = None, - examples: Optional[list] = None, +def DatetimeValidator( + date_order: Optional[const.DateOrder] = None, languages: Optional[list[str]] = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[date_or_datetime] = datetime.datetime, timezone: Optional[str] = None, - validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, - relative_base: Optional[date_or_datetime] = None, + 
relative_base: Optional[DateOrDatetime] = None, ): - return DateType( - date_order, - examples, - languages, - notfound_mode, - input_type, - timezone, - validator_mode, - strict, - prefer_future_dates, - relative_base, + return DateValidator( + date_order=date_order, + is_date=False, + languages=languages, + timezone=timezone, + strict=strict, + prefer_future_dates=prefer_future_dates, + relative_base=relative_base, ) -Date = DateType() -Datetime = DatetimeType() +Date = Annotated[datetime.date, DateValidator()] +Datetime = Annotated[datetime.datetime, DatetimeValidator()] diff --git a/src/fuzztypes/emojis.py b/src/fuzztypes/emojis.py index 2b763b5..3e2c09a 100644 --- a/src/fuzztypes/emojis.py +++ b/src/fuzztypes/emojis.py @@ -1,8 +1,8 @@ from collections import defaultdict -from typing import List +from typing import Annotated, List from pydantic import TypeAdapter -from fuzztypes import NamedEntity, EntitySource, OnDisk, flags, lazy +from fuzztypes import NamedEntity, EntitySource, OnDiskValidator, flags, lazy def load_emoji_entities() -> List[NamedEntity]: @@ -21,27 +21,36 @@ def load_emoji_entities() -> List[NamedEntity]: EmojiSource = EntitySource(load_emoji_entities) -Emoji = OnDisk( - "Emoji", - EmojiSource, - search_flag=flags.AliasSearch, - tiebreaker_mode="lesser", -) - -Fuzzmoji = OnDisk( - "Fuzzmoji", - EmojiSource, - search_flag=flags.FuzzSearch, - tiebreaker_mode="lesser", - min_similarity=10.0, - device="cpu", -) - -Vibemoji = OnDisk( - "Vibemoji", - EmojiSource, - search_flag=flags.SemanticSearch, - tiebreaker_mode="lesser", - min_similarity=10.0, - device="cpu", -) +Emoji = Annotated[ + str, + OnDiskValidator( + "Emoji", + EmojiSource, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +Fuzzmoji = Annotated[ + str, + OnDiskValidator( + "Fuzzmoji", + EmojiSource, + search_flag=flags.FuzzSearch, + tiebreaker_mode="lesser", + min_similarity=10.0, + device="cpu", + ), +] + +Vibemoji = Annotated[ + str, + OnDiskValidator( + 
"Vibemoji", + EmojiSource, + search_flag=flags.SemanticSearch, + tiebreaker_mode="lesser", + min_similarity=10.0, + device="cpu", + ), +] diff --git a/src/fuzztypes/entity.py b/src/fuzztypes/entity.py index 04c59bf..564f8fd 100644 --- a/src/fuzztypes/entity.py +++ b/src/fuzztypes/entity.py @@ -1,13 +1,25 @@ import csv import json from pathlib import Path -from typing import List, Union, Type, Any, Optional, Tuple, Callable +from typing import ( + List, + Union, + Type, + Any, + Optional, + Tuple, + Callable, + Generic, + TypeVar, +) from pydantic import BaseModel, Field, TypeAdapter +T = TypeVar("T") -class Entity(BaseModel): - value: Any = Field( + +class Entity(BaseModel, Generic[T]): + value: T = Field( ..., description="Value stored by Entity.", ) @@ -16,11 +28,11 @@ class Entity(BaseModel): description="Entity concept type such as PERSON, ORG, or GPE.", ) meta: Optional[dict] = Field( - None, + default=None, description="Additional attributes accessible through dot-notation.", ) priority: Optional[int] = Field( - None, + default=None, description="Tiebreaker rank (higher wins, None=0, negative allowed)", ) @@ -28,6 +40,9 @@ def __eq__(self, other: Any): other = getattr(other, "value", other) return self.value == other + def resolve(self) -> T: + return self.value + @property def rank(self) -> int: """Normalized by converting None to 0 and making lower better.""" @@ -51,10 +66,7 @@ def __setattr__(self, key: str, value: Any): if key in self.model_fields: super().__setattr__(key, value) else: - # Initialize meta if it's None - if self.__dict__.get("meta") is None: - super().__setattr__("meta", {}) - # Add or update the attribute in the meta dictionary + self.meta = self.meta or {} self.meta[key] = value @@ -74,16 +86,18 @@ def convert(cls, item: Union[str, dict, list, tuple, "NamedEntity"]): if isinstance(item, cls): return item + data = {} if item and isinstance(item, (list, tuple)): value, aliases = item[0], item[1:] if len(aliases) == 1 and 
isinstance(aliases[0], (tuple, list)): aliases = aliases[0] - item = dict(value=value, aliases=aliases) - - elif isinstance(item, str): - item = dict(value=item) + data = dict(value=value, aliases=aliases) + elif isinstance(item, dict): + data = item + else: + data = dict(value=item) - return NamedEntity(**item) + return cls(**data) NamedEntityAdapter = TypeAdapter(NamedEntity) @@ -104,7 +118,7 @@ def __len__(self): def __getitem__( self, key: Union[int, slice, str] - ) -> Union[NamedEntity, "EntitySource"]: + ) -> Union[NamedEntity, list[NamedEntity], "EntitySource"]: if isinstance(key, str): # return another shell, let loading occur on demand. return EntitySource(source=(self, key)) @@ -119,14 +133,14 @@ def __iter__(self): def _load_if_necessary(self): if not self.loaded: self.loaded = True - if isinstance(self.source, Tuple): + if isinstance(self.source, tuple): parent, label = self.source self.entities = [e for e in parent if e.label == label] - elif isinstance(self.source, Callable): + elif callable(self.source): self.entities = self.source() - elif self.source: + elif isinstance(self.source, Path): dialects = { "csv": self.from_csv, "tsv": self.from_tsv, @@ -135,6 +149,7 @@ def _load_if_necessary(self): } _, ext = self.source.name.lower().rsplit(".", maxsplit=1) f = dialects.get(ext) + assert f is not None, f"No reader found for: {ext}" # noinspection PyArgumentList self.entities = f(self.source) diff --git a/src/fuzztypes/function.py b/src/fuzztypes/function.py deleted file mode 100644 index 783bfbd..0000000 --- a/src/fuzztypes/function.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Callable, Type - -from . 
import Entity, MatchList, const, abstract - - -def Function( - source: Callable[[abstract.SupportedType], abstract.SupportedType], - examples: list = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[abstract.SupportedType] = str, - validator_mode: const.ValidatorMode = "before", -): - def do_lookup(key: str) -> MatchList: - value = source(key) - match_list = MatchList() - if value is not None: - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list - - return abstract.AbstractType( - do_lookup, - examples=examples, - input_type=input_type, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index ada8eb8..63acb3e 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -1,28 +1,29 @@ from collections import defaultdict -from typing import Callable, Iterable, Union, List, Dict +from typing import Callable, Iterable, Union, Type, Optional from pydantic import PositiveInt from fuzztypes import ( + FuzzValidator, Match, - MatchList, + MatchResult, NamedEntity, Record, - abstract, const, flags, lazy, + storage, ) -class InMemoryStorage(abstract.AbstractStorage): +class InMemoryValidatorStorage(storage.AbstractStorage): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._mapping: Dict[str, List[Record]] = defaultdict(list) - self._terms: list[str] = [] - self._is_alias: list[bool] = [] - self._entities: list[NamedEntity] = [] + self._mapping = defaultdict(list) + self._terms = [] + self._is_alias = [] + self._entities = [] self._embeddings = None # @@ -31,7 +32,7 @@ def __init__(self, *args, **kwargs): def prepare(self): for item in self.source: - entity = NamedEntity.convert(item) + entity = self.entity_type.convert(item) self.add(entity) def add(self, entity: NamedEntity) -> None: @@ -46,13 +47,19 @@ def add(self, entity: NamedEntity) -> None: def add_by_name(self, entity: 
NamedEntity) -> None: term = entity.value - record = Record(entity=entity, term=term, is_alias=False) - self._mapping[self.normalize(term)].append(record) + norm_term = self.normalize(term) + record = Record( + entity=entity, term=term, norm_term=norm_term, is_alias=False + ) + self._mapping[norm_term].append(record) def add_by_alias(self, entity: NamedEntity) -> None: for term in entity.aliases: - record = Record(entity=entity, term=term, is_alias=True) - self._mapping[self.normalize(term)].append(record) + norm_term = self.normalize(term) + record = Record( + entity=entity, term=term, norm_term=norm_term, is_alias=True + ) + self._mapping[norm_term].append(record) def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: clean_name: str = self.fuzz_clean(entity.value) @@ -70,25 +77,28 @@ def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: # Getters # - def get(self, key: str) -> MatchList: + def get(self, key: str) -> MatchResult: records = self._mapping.get(self.normalize(key), []) - match_list = Record.from_list(records, key=key) + match_list = Record.from_list( + records, key=key, entity_type=self.entity_type + ) - if not match_list: + results = MatchResult(matches=match_list) + + if not results: if self.search_flag.is_fuzz_ok: - match_list = self.get_by_fuzz(key) + results = self.get_by_fuzz(key) if self.search_flag.is_semantic_ok: - match_list = self.get_by_semantic(key) + results = self.get_by_semantic(key) - matches = MatchList(matches=match_list) - return matches + return results # # Fuzzy Matching # - def get_by_fuzz(self, term) -> MatchList: + def get_by_fuzz(self, term) -> MatchResult: query = self.fuzz_clean(term) matches = self.fuzz_match(query) return matches @@ -96,7 +106,7 @@ def get_by_fuzz(self, term) -> MatchList: def fuzz_match( self, query: str, - ) -> MatchList: + ) -> MatchResult: # https://rapidfuzz.github.io/RapidFuzz/Usage/process.html#extract extract = self.rapidfuzz.process.extract( query=query, @@ -105,24 +115,24 @@ def 
fuzz_match( limit=self.limit, ) - match_list = MatchList() + results = MatchResult() for key, score, index in extract: entity = self._entities[index] is_alias = self._is_alias[index] m = Match(key=key, entity=entity, is_alias=is_alias, score=score) - match_list.append(m) - return match_list + results.append(m) + return results # # Vector Similarity Search # - def get_by_semantic(self, key) -> List[Match]: + def get_by_semantic(self, key) -> MatchResult: # find closest match using knn indices, scores = self.find_knn(key) - # create a MatchList from the results - matches = [] + # create a MatchResult from the results + results = MatchResult() for index, score in zip(indices, scores): entity = self._entities[index] term = self._terms[index] @@ -134,9 +144,9 @@ def get_by_semantic(self, key) -> List[Match]: is_alias=is_alias, term=term, ) - matches.append(match) + results.append(match) - return matches + return results @property def embeddings(self): @@ -172,36 +182,31 @@ def find_knn(self, key: str) -> tuple: return k_nearest_indices, top_k_scores -def InMemory( +def InMemoryValidator( source: Iterable, *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, - examples: list = None, + entity_type: Type[NamedEntity] = NamedEntity, + examples: Optional[list] = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", limit: PositiveInt = 10, min_similarity: float = 80.0, notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", - validator_mode: const.ValidatorMode = "before", ): - storage = InMemoryStorage( + in_memory = InMemoryValidatorStorage( source, case_sensitive=case_sensitive, encoder=encoder, + entity_type=entity_type, fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, + notfound_mode=notfound_mode, search_flag=search_flag, tiebreaker_mode=tiebreaker_mode, ) - return abstract.AbstractType( - storage, - EntityType=NamedEntity, - 
examples=examples, - input_type=str, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) + return FuzzValidator(in_memory, examples=examples) diff --git a/src/fuzztypes/integer.py b/src/fuzztypes/integer.py index 2f070ea..0f348a2 100644 --- a/src/fuzztypes/integer.py +++ b/src/fuzztypes/integer.py @@ -1,6 +1,6 @@ -from typing import Callable, Union +from typing import Annotated, Callable, Union -from fuzztypes import Function, lazy +from fuzztypes import FuzzValidator, lazy _tx = None @@ -23,4 +23,4 @@ def to_int(key: Union[int, str]) -> int: return val -Integer = Function(to_int, input_type=int) +Integer = Annotated[int, FuzzValidator(to_int)] diff --git a/src/fuzztypes/language.py b/src/fuzztypes/language.py new file mode 100644 index 0000000..937bec5 --- /dev/null +++ b/src/fuzztypes/language.py @@ -0,0 +1,116 @@ +import json +from enum import Enum +from typing import Annotated, Optional, List, Iterable, Type + +from pydantic import TypeAdapter + +from fuzztypes import EntitySource, NamedEntity, OnDiskValidator, flags, utils + + +class LanguageScope(Enum): + INDIVIDUAL = "I" + MACROLANGUAGE = "M" + SPECIAL = "S" + + +class LanguageType(Enum): + ANCIENT = "A" + CONSTRUCTED = "C" + EXTINCT = "E" + HISTORICAL = "H" + LIVING = "L" + SPECIAL = "S" + + +class LanguageNamedEntity(NamedEntity): + """Resolves to language full name.""" + + alpha_2: Optional[str] = None + alpha_3: str + scope: Optional[LanguageScope] = None + type: Optional[LanguageType] = None + common_name: Optional[str] = None + inverted_name: Optional[str] = None + bibliographic: Optional[str] = None + + @property + def code(self): + return self.alpha_2 or self.alpha_3 + + +class LanguageModelNamedEntity(LanguageNamedEntity): + """Resolves to self as a full child object.""" + + def resolve(self): + return self + + +class LanguageCodeNameEntity(LanguageNamedEntity): + """Resolves to code name.""" + + def resolve(self): + return self.code + + +LanguageNamedEntityType = 
Type[LanguageNamedEntity] + + +def load_languages( + entity_cls: Type[LanguageNamedEntity] = LanguageNamedEntity, +): + def do_load() -> Iterable[NamedEntity]: + repo = "https://salsa.debian.org/iso-codes-team/iso-codes/" + remote = f"{repo}-/raw/main/data/iso_639-3.json" + local = utils.get_file(remote) + assert local, f"Could not download: {remote}" + data = json.load(open(local))["639-3"] + alias_fields = { + "alpha_2", + "alpha_3", + "common_name", + "inverted_name", + "bibliographic", + } + entities = [] + for item in data: + item["value"] = item.pop("name") + aliases = [v for k, v in item.items() if k in alias_fields] + item["aliases"] = aliases + entities.append(item) + return TypeAdapter(List[LanguageNamedEntity]).validate_python(data) + + return do_load + + +LanguageName = Annotated[ + str, + OnDiskValidator( + "Language", + EntitySource(load_languages(LanguageNamedEntity)), + entity_type=LanguageNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +LanguageCode = Annotated[ + str, + OnDiskValidator( + "Language", + EntitySource(load_languages(LanguageCodeNameEntity)), + entity_type=LanguageCodeNameEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +Language = Annotated[ + LanguageNamedEntity, + OnDiskValidator( + "Language", + EntitySource(load_languages(LanguageModelNamedEntity)), + entity_type=LanguageModelNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] diff --git a/src/fuzztypes/lazy.py b/src/fuzztypes/lazy.py index 8743db9..a8d1516 100644 --- a/src/fuzztypes/lazy.py +++ b/src/fuzztypes/lazy.py @@ -1,7 +1,7 @@ import functools import importlib import os -from typing import Any, List, TypedDict, Callable +from typing import Any, List, TypedDict, Callable, Optional from fuzztypes import const @@ -9,7 +9,7 @@ @functools.lru_cache(maxsize=None) def lazy_import( library_name: str, - attr_name: str = None, + attr_name: Optional[str] = None, return_none_on_error: bool 
= False, ) -> Any: """ @@ -46,10 +46,8 @@ def lazy_import( except ImportError as e: version_info = f"(version {version})" if version else "" install = f"`pip install {install_name}{version_info}`" - details = list(filter(None, [purpose, url, license_type])) - if details: - details = ", ".join(details) - details = f" ({details})" + details = ", ".join(list(filter(None, [purpose, url, license_type]))) + details = f" ({details})" if details else "" msg = f"Import Failed: {install}{details}" if not info: diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index e943feb..9312019 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional, Iterator, Any, Union +from typing import List, Tuple, Optional, Any, Union, Type from pydantic import BaseModel, Field @@ -23,14 +23,8 @@ def rank_value(self) -> Tuple[Tuple[float, int], Any]: def __lt__(self, other: "Match"): return self.rank_value < other.rank_value - def __str__(self): - if self.is_alias: - return f"{self.key} => {self.entity.value} [{self.score:.1f}]" - else: - return f"{self.entity.value} [{self.score:.1f}]" - -class MatchList(BaseModel): +class MatchResult(BaseModel): matches: List[Match] = Field(default_factory=list) choice: Optional[Match] = None @@ -40,34 +34,12 @@ def __bool__(self): def __len__(self): return len(self.matches) - def __iter__(self) -> Iterator[Match]: - return iter(self.matches) - def __getitem__(self, item): return self.matches[item] - def __str__(self): - return ", ".join(map(str, self.matches)) - - @property - def success(self): - return self.choice is not None - @property def entity(self): - return self.success and self.choice.entity - - def set( - self, - key: str, - entity: Entity, - is_alias: bool = False, - term: str = None, - ): - """If match is a known winner, just set it and forget it.""" - match = Match(key=key, entity=entity, is_alias=is_alias, term=term) - self.choice = match - self.matches.append(match) + 
return self.choice is not None and self.choice.entity def append(self, match: Match): """Add a match to the list of potential matches.""" @@ -99,22 +71,34 @@ def choose(self, min_score: float, tiebreaker_mode: const.TiebreakerMode): class Record(BaseModel): entity: Union[NamedEntity, str] term: str + norm_term: Optional[str] = None is_alias: bool vector: Any = None - def deserialize(self): - if isinstance(self.entity, str): - self.entity = NamedEntity.model_validate_json(self.entity) - @classmethod - def from_list(cls, recs: list, key, score: float = 100.0) -> List[Match]: - return [record.to_match(key, score) for record in recs] + def from_list( + cls, + recs: list, + key, + score: float = 100.0, + entity_type: Type[NamedEntity] = NamedEntity, + ) -> List[Match]: + return [record.to_match(key, score, entity_type) for record in recs] + + def to_match( + self, + key, + score: float = 100.0, + entity_type: Type[NamedEntity] = NamedEntity, + ) -> Match: + if isinstance(self.entity, str): + match_entity = entity_type.model_validate_json(self.entity) + else: + match_entity = self.entity - def to_match(self, key, score: float = 100.0) -> Match: - self.deserialize() return Match( key=key, - entity=self.entity, + entity=match_entity, is_alias=self.is_alias, score=score, term=self.term, diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index 3adcc5c..46ee3a4 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -1,39 +1,49 @@ -from typing import Callable, Iterable, Union, List +from typing import Callable, Iterable, Union, List, Type, Optional, Any from pydantic import PositiveInt from fuzztypes import ( + FuzzValidator, Match, - MatchList, + MatchResult, NamedEntity, Record, - abstract, const, flags, lazy, + storage, ) accelerators = {"cuda", "mps"} -class OnDiskStorage(abstract.AbstractStorage): +class StoredValidatorStorage(storage.AbstractStorage): def __init__( self, name: str, - source: Iterable[NamedEntity], + source: Iterable, 
**kwargs, ): super().__init__(source, **kwargs) self.name = name - self.conn = None - self.table = None + self._conn = None + self._table = None + + @property + def conn(self) -> Any: + if self._conn is None: + lancedb = lazy.lazy_import("lancedb") + self._conn = lancedb.connect(const.StoredValidatorPath) + return self._conn + + @property + def table(self) -> Any: + if self._table is None: + self._table = self.conn.open_table(self.name) + return self._table def prepare(self, force_drop_table: bool = False): - lancedb = lazy.lazy_import("lancedb") - - self.conn = lancedb.connect(const.OnDiskPath) - table_names = set(self.conn.table_names(limit=999_999_999)) if force_drop_table and self.name in table_names: @@ -43,20 +53,19 @@ def prepare(self, force_drop_table: bool = False): if self.name not in table_names: try: self.create_table() - except Exception as e: + except Exception as e: # pragma: no cover # if any issue occurs, drop the table and re-raise error # in the future, handle errors better self.conn.drop_table(self.name) raise e - self.table = self.conn.open_table(self.name) - def create_table(self): pa = lazy.lazy_import("pyarrow") schema = pa.schema( [ pa.field("term", pa.string()), + pa.field("norm_term", pa.string()), pa.field("entity", pa.string()), pa.field("is_alias", pa.string()), pa.field( @@ -65,9 +74,7 @@ def create_table(self): ), ] ) - self.table = self.conn.create_table( - self.name, schema=schema, exist_ok=True - ) + table = self.conn.create_table(self.name, schema=schema, exist_ok=True) # create records from source records = self.create_records() @@ -80,7 +87,7 @@ def create_table(self): record.vector = vector # add records in a batch to table - self.table.add([record.model_dump() for record in records]) + table.add([record.model_dump() for record in records]) # adjust num_partitions and num_sub_vectors based on dataset size num_records = len(records) @@ -88,7 +95,7 @@ def create_table(self): should_index = num_records > 256 and 
self.search_flag.is_semantic_ok if self.search_flag.is_fuzz_ok: # pragma: no cover - self.table.create_fts_index("term") + table.create_fts_index("term") if should_index: # pragma: no cover num_partitions = min(num_records, 256) @@ -96,7 +103,7 @@ def create_table(self): index_cache_size = min(num_records, 256) accelerator = self.device if self.device in accelerators else None - self.table.create_index( + table.create_index( metric="cosine", num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, @@ -110,7 +117,7 @@ def create_records(self): records = [] empty = [0.0] * self.vect_dimensions for item in self.source: - entity = NamedEntity.convert(item) + entity = self.entity_type.convert(item) json = entity.model_dump_json(exclude_defaults=True) terms = [] @@ -125,13 +132,14 @@ def create_records(self): for term in terms: # normalize for case sensitivity - term = self.normalize(term) + norm_term = self.normalize(term) # construct and add record if term: record = Record( entity=json, term=term, + norm_term=norm_term, is_alias=is_alias, vector=empty, ) @@ -146,10 +154,14 @@ def create_records(self): # Getters # - def get(self, key: str) -> MatchList: - where = f'term = "{self.normalize(key)}"' + def get(self, key: str) -> MatchResult: + where = f'term = "{key}"' match_list = self.run_query(key, where=where) + if not match_list: + where = f'norm_term = "{self.normalize(key)}"' + match_list = self.run_query(key, where=where) + if not match_list: if self.search_flag.is_fuzz_ok: match_list = self.get_by_fuzz(key) @@ -157,7 +169,7 @@ def get(self, key: str) -> MatchList: if self.search_flag.is_semantic_ok: match_list = self.get_by_semantic(key) - matches = MatchList(matches=match_list) + matches = MatchResult(matches=match_list) return matches def get_by_fuzz(self, key: str) -> List[Match]: @@ -184,7 +196,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: if vector is not None and self.search_flag.is_semantic_ok: qb = qb.metric("cosine") - qb = 
qb.select(["entity", "term", "is_alias"]) + qb = qb.select(["entity", "term", "norm_term", "is_alias"]) if where is not None: qb = qb.where(where, prefilter=True) @@ -204,45 +216,43 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: score = 100.0 # Exact match record = Record.model_validate(item) - match_list.append(record.to_match(key=key, score=score)) + match = record.to_match( + key=key, score=score, entity_type=self.entity_type + ) + match_list.append(match) return match_list -def OnDisk( +def OnDiskValidator( identity: str, source: Iterable, *, case_sensitive: bool = False, - device: str = None, + device: Optional[const.DeviceList] = None, encoder: Union[Callable, str, object] = None, - examples: list = None, + entity_type: Type[NamedEntity] = NamedEntity, + examples: Optional[list] = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", limit: PositiveInt = 10, min_similarity: float = 80.0, notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", - validator_mode: const.ValidatorMode = "before", -) -> abstract.AbstractType: - storage = OnDiskStorage( +): + on_disk = StoredValidatorStorage( identity, source, case_sensitive=case_sensitive, device=device, + entity_type=entity_type, fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, + notfound_mode=notfound_mode, search_flag=search_flag, encoder=encoder, tiebreaker_mode=tiebreaker_mode, ) - return abstract.AbstractType( - storage, - EntityType=NamedEntity, - examples=examples, - input_type=str, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) + return FuzzValidator(on_disk, examples=examples) diff --git a/src/fuzztypes/person.py b/src/fuzztypes/person.py index 98355a5..0c225b9 100644 --- a/src/fuzztypes/person.py +++ b/src/fuzztypes/person.py @@ -1,7 +1,8 @@ -from typing import Type, Union +from typing import Annotated, Optional + from pydantic import BaseModel 
-from fuzztypes import Entity, MatchList, abstract, const, lazy +from fuzztypes import FuzzValidator, lazy FULL_NAME = "{title} {first} {middle} {last} {suffix} ({nickname})" SHORT_NAME = "{first} {last}" @@ -83,46 +84,31 @@ def human_name(self, name_format=None, init_format=None): ) -def PersonModelType( +def PersonValidator( name_format: str = FULL_NAME, init_format: str = FULL_INIT, capitalize: bool = True, - examples: list = None, - notfound_mode: const.NotFoundMode = "raise", - validator_mode: const.ValidatorMode = "before", -) -> Type[PersonModel]: - def do_lookup(key: Union[str, PersonModel]) -> MatchList: +): + def to_person(key) -> Optional[PersonModel]: if isinstance(key, str): human_name = parse(full_name=key) if capitalize: human_name.capitalize(force=True) data = human_name.as_dict() - value = PersonModel( + person = PersonModel( name_format=name_format, init_format=init_format, **data ) elif isinstance(key, PersonModel): - value = key + person = key elif isinstance(key, dict): - value = PersonModel(**key) - elif key is None: - value = None + person = PersonModel(**key) else: raise ValueError(f"Unexpected key type {type(key)} for {key}.") - match_list = MatchList() - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list + return person - return abstract.AbstractType( - do_lookup, - examples=examples, - input_type=PersonModel, - notfound_mode=notfound_mode, - output_type=str, - validator_mode=validator_mode, - ) + return FuzzValidator(to_person) # default annotation -Person = PersonModelType() +Person = Annotated[PersonModel, PersonValidator()] diff --git a/src/fuzztypes/py.typed b/src/fuzztypes/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/fuzztypes/regex.py b/src/fuzztypes/regex.py index d284ec8..a4821fa 100644 --- a/src/fuzztypes/regex.py +++ b/src/fuzztypes/regex.py @@ -1,50 +1,51 @@ import re +from typing import Annotated, Optional -from . 
import Entity, Match, MatchList, abstract, const +from . import FuzzValidator -def Regex( +def RegexValidator( pattern: str, - examples: list = None, - notfound_mode: const.NotFoundMode = "raise", - validator_mode: const.ValidatorMode = "before", - tiebreaker_mode: const.TiebreakerMode = "raise", + examples: Optional[list] = None, ): regex = re.compile(pattern) - def do_lookup(key: str) -> MatchList: + def do_regex(key: str) -> str: matches = regex.findall(key) - match_list = MatchList() - - for match in matches: - # Create and append Entity for each match found - entity = Entity(value=match) - match_list.append(Match(key=match, entity=entity, is_alias=False)) - - # Leave tiebreaker and error handling to MatchList.choose - match_list.choose(min_score=0, tiebreaker_mode=tiebreaker_mode) - - return match_list - - return abstract.AbstractType( - do_lookup, - examples=examples, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) - - -Email = Regex( - r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", - examples=["user@example.com"], -) - -SSN = Regex( - r"\b\d{3}-\d{2}-\d{4}\b", - examples=["000-00-0000"], -) - -ZipCode = Regex( - r"\b\d{5}(?:-\d{4})?\b", - examples=["12345", "12345-6789"], -) + if len(matches) == 1: + return matches[0] + elif len(matches) > 1: + raise ValueError( + f"Multiple matches found for pattern '{pattern}' in '{key}'" + ) + else: + raise ValueError( + f"No matches found for pattern '{pattern}' in '{key}'" + ) + + return FuzzValidator(do_regex, examples=examples) + + +Email = Annotated[ + str, + RegexValidator( + r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", + examples=["user@example.com"], + ), +] + +SSN = Annotated[ + str, + RegexValidator( + r"\b\d{3}-\d{2}-\d{4}\b", + examples=["000-00-0000"], + ), +] + +ZipCode = Annotated[ + str, + RegexValidator( + r"\b\d{5}(?:-\d{4})?\b", + examples=["12345", "12345-6789"], + ), +] diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py new file mode 100644 index 
0000000..b9f3e8b --- /dev/null +++ b/src/fuzztypes/storage.py @@ -0,0 +1,125 @@ +from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union + +from pydantic_core import PydanticCustomError + +from fuzztypes import NamedEntity, MatchResult, const, flags, lazy + + +class AbstractStorage: + def __init__( + self, + source: Iterable, + *, + case_sensitive: bool = False, + device: const.DeviceList = "cpu", + encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, + fuzz_scorer: str = "token_sort_ratio", + limit: int = 10, + min_similarity: float = 80.0, + notfound_mode: const.NotFoundMode = "raise", + search_flag: flags.SearchFlag = flags.DefaultSearch, + tiebreaker_mode: const.TiebreakerMode = "raise", + ): + assert not search_flag.is_hybrid, "Hybrid search not yet supported!" + + self.source = source + + # options + self.case_sensitive = case_sensitive + self.device = device + self.entity_type = entity_type + self.limit = limit + self.min_similarity = min_similarity + self.notfound_mode = notfound_mode + self.prepped = False + self.search_flag = search_flag + self.tiebreaker_mode = tiebreaker_mode + + # store string for lazy loading + self._fuzz_scorer = fuzz_scorer + self._encoder = encoder + self._vect_dimensions = None + + def __call__(self, key: str) -> Optional[Any]: + entity = self[key] + return entity.resolve() if entity else None + + def __getitem__(self, key: str) -> Optional[NamedEntity]: + if not self.prepped: + self.prepped = True + self.prepare() + + match_list = self.get(key) + match_list.choose(self.min_similarity, self.tiebreaker_mode) + + if match_list.choice is not None: + return match_list.entity + + if self.notfound_mode == "allow": + return self.entity_type(value=key) + + if self.notfound_mode == "none": + return None + + msg = '"{key}" could not be resolved' + ctx: Dict[str, Any] = dict(key=key) + if match_list: + near = [f'"{match.entity.value}"' for match in match_list.matches] + if 
len(near) > 1: + near[-1] = "or " + near[-1] + msg += f", did you mean {', '.join(near)}?" + raise PydanticCustomError("key_not_found", msg, ctx) + + def prepare(self): + raise NotImplementedError + + def get(self, key: str) -> MatchResult: + raise NotImplementedError + + def normalize(self, key: str): + if key: + key = key.strip() + if self.case_sensitive: + return key + else: + return key.lower() + + # + # encoding + # + + @property + def encoder(self): + return lazy.create_encoder(self._encoder, device=self.device) + + @property + def vect_dimensions(self): + if self._vect_dimensions is None: + dummy_encoded = self.encode([""]) + self._vect_dimensions = dummy_encoded.shape[1] + return self._vect_dimensions + + def encode(self, values: List[str]): + return self.encoder(values) + + # + # fuzzy matching + # + + @property + def rapidfuzz(self): + return lazy.lazy_import("rapidfuzz") + + @property + def fuzz_scorer(self): + return getattr( + self.rapidfuzz.fuzz, + self._fuzz_scorer, + self.rapidfuzz.fuzz.token_sort_ratio, + ) + + def fuzz_clean(self, term: str) -> str: + # no really, it's a string + # noinspection PyTypeChecker + return self.rapidfuzz.utils.default_process(term) diff --git a/src/fuzztypes/utils/__init__.py b/src/fuzztypes/utils/__init__.py new file mode 100644 index 0000000..1e84b9f --- /dev/null +++ b/src/fuzztypes/utils/__init__.py @@ -0,0 +1,6 @@ +from .download import download_file, get_file + +__all__ = ( + "download_file", + "get_file", +) diff --git a/src/fuzztypes/utils/download.py b/src/fuzztypes/utils/download.py new file mode 100644 index 0000000..503d635 --- /dev/null +++ b/src/fuzztypes/utils/download.py @@ -0,0 +1,59 @@ +import os +import urllib.request +from datetime import datetime +from typing import Optional +from urllib.error import URLError, HTTPError + +from fuzztypes import logger, const + + +def get_file_age_in_days(file_path: str) -> int: + age = datetime.now() - datetime.fromtimestamp(os.path.getmtime(file_path)) + return 
age.days + + +def get_file(url: str, expires_in_days: int = 30) -> Optional[str]: + """ + Tries to retrieve a file from the cache or download it if not available + or expired. + + :param url: The URL of the original file to be downloaded. + :param expires_in_days: Expiration period for the cached file. + :return: Path to the downloaded file, or None if fails. + """ + file_name = os.path.basename(url) + cache_file_path = os.path.join(const.DownloadsPath, file_name) + temp_download_path = f"{cache_file_path}.tmp" + + cache_ok = os.path.exists(cache_file_path) + if cache_ok: + file_age = get_file_age_in_days(cache_file_path) + cache_ok = file_age <= expires_in_days + + if not cache_ok: + download_success = download_file(url, temp_download_path) + if download_success: + os.replace(temp_download_path, cache_file_path) + cache_ok = os.path.exists(cache_file_path) + + if not cache_ok: + logger.error(f"Unable to download the file and no cached file: {url}") + + return cache_file_path if cache_ok else None + + +def download_file(url, download_path): + """ + Attempt to download a file directly to a specified path. + If the download fails, logs a warning and returns None. + + :param url: The URL of the file to be downloaded. + :param download_path: The full file path where the file should be saved. + :return: Boolean indicating success or failure of the download. 
+ """ + try: + urllib.request.urlretrieve(url, download_path) + return True + except (HTTPError, URLError, ValueError, OSError, Exception) as e: + logger.warning(f"Download (url={url}) failed: {e}", exc_info=True) + return False diff --git a/src/fuzztypes/validation.py b/src/fuzztypes/validation.py new file mode 100644 index 0000000..26216f9 --- /dev/null +++ b/src/fuzztypes/validation.py @@ -0,0 +1,113 @@ +import dataclasses +import sys +from functools import lru_cache +from itertools import chain +from typing import Any, Dict, Optional, Union, cast, get_args + +from pydantic import ( + GetCoreSchemaHandler, + GetJsonSchemaHandler, + TypeAdapter, + json_schema, +) +from pydantic_core import CoreSchema, PydanticCustomError, core_schema + +from fuzztypes import Entity + +dataclass_kwargs: Dict[str, Any] + +slots_true: Dict[str, bool] = {} +if sys.version_info >= (3, 10): + slots_true = {"slots": True} # pragma: no cover + + +@lru_cache(maxsize=None) +def get_type_adapter(cls: Any) -> TypeAdapter: + """ + Get a type adapter for the given class wrapped by a cache. + + :param cls: TypedDict, BaseModel, or Annotation. + :return: TypeAdapter wrapper of cls + """ + return TypeAdapter(cls) + + +def validate_json(cls: Any, value: Union[str, bytes]) -> Any: + """ + Validate a JSON string or bytes against the model. + + :param cls: TypedDict, BaseModel, or Annotation. + :param value: JSON string or bytes to validate. + :return: Validated Python object. + """ + return get_type_adapter(cls).validate_json(value) + + +def validate_python(cls: Any, value: Any) -> Any: + """ + Validate a Python object against the model. + + :param cls: TypedDict, BaseModel, or Annotation. + :param value: Python object to validate. + :return: Validated Python object. + """ + ta = get_type_adapter(cls) + return ta.validate_python(value) + + +def validate_entity(cls: Any, value: Any) -> Optional[Entity]: + """ + Returns entity from metadata if cls is a FuzzValidator. 
+ + :param cls: Any object + :param value: input value + :return: Entity if validator is an entity source + """ + metadata = get_args(cls) + entity = None + for item in chain([cls], metadata): + if isinstance(item, FuzzValidator): + entity = item[value] + return entity + + +@dataclasses.dataclass(frozen=True, **slots_true) +class FuzzValidator: + func: Any + examples: Optional[list] = None + + def __hash__(self): + attrs = (self.func, tuple(self.examples or ())) + return hash(attrs) + + def __getitem__(self, key): + try: + return self.func[key] + except PydanticCustomError as err: + raise KeyError(f"Key Error: {key} [{err}]") from err + + def __get_pydantic_core_schema__( + self, source_type: Any, handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + schema = handler(source_type) + func = cast(core_schema.NoInfoValidatorFunction, self.func) + + return core_schema.no_info_before_validator_function( + func, schema=schema + ) + + def __get_pydantic_json_schema__( + self, + schema: CoreSchema, + handler: GetJsonSchemaHandler, + ) -> json_schema.JsonSchemaValue: + """ + Generate the JSON schema for the AbstractType. + + This method is used internally by Pydantic to generate the JSON + schema representation of the AbstractType, including any examples. 
+ """ + schema = handler(schema) + if self.examples is not None: + schema["examples"] = self.examples + return schema diff --git a/tests/conftest.py b/tests/conftest.py index 0300586..8a8a488 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ from pytest import fixture -from fuzztypes import EntitySource +from fuzztypes import EntitySource, NamedEntity @fixture(scope="session") @@ -21,11 +21,14 @@ def EmojiSource(data_path): def FruitSource(data_path): # loading separately from AnimalSource to test lazy loading MixedSource = EntitySource(data_path / "mixed.jsonl") - FruitSource = MixedSource["fruit"] assert MixedSource.loaded is False + + FruitSource = MixedSource["fruit"] + assert isinstance(FruitSource, EntitySource) assert FruitSource.loaded is False # first access loads FruitSource -> MixedSource + assert isinstance(FruitSource[0], NamedEntity) assert FruitSource[0].value == "Apple" assert FruitSource.loaded is True assert MixedSource.loaded is True diff --git a/tests/in_memory/test_in_memory_alias.py b/tests/in_memory/test_in_memory_alias.py index 319e1e1..56f20b8 100644 --- a/tests/in_memory/test_in_memory_alias.py +++ b/tests/in_memory/test_in_memory_alias.py @@ -1,17 +1,18 @@ import pytest +from typing import Annotated from pydantic import BaseModel, ValidationError -from fuzztypes import InMemory, flags +from fuzztypes import InMemoryValidator, flags @pytest.fixture(scope="session") def MythicalFigure(MythSource): - return InMemory(MythSource, search_flag=flags.AliasSearch) + return InMemoryValidator(MythSource, search_flag=flags.AliasSearch) @pytest.fixture(scope="session") def CasedMythicalFigure(MythSource): - return InMemory( + return InMemoryValidator( MythSource, search_flag=flags.AliasSearch, case_sensitive=True, @@ -36,7 +37,7 @@ def test_alias_cased_getitem(CasedMythicalFigure): def test_uncased_alias_str(MythicalFigure): class Example(BaseModel): - value: MythicalFigure + value: Annotated[str, MythicalFigure] # Exact match 
assert Example(value="Zeus").value == "Zeus" @@ -48,7 +49,7 @@ class Example(BaseModel): def test_cased_alias_str(CasedMythicalFigure): class Example(BaseModel): - value: CasedMythicalFigure + value: Annotated[str, CasedMythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -62,21 +63,21 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], ["a", "b"], ["d", "b"]] - A = InMemory(source, tiebreaker_mode="raise") + A = InMemoryValidator(source) assert A["a"].value == "a" try: assert A["b"].value == "a" assert False, "Didn't raise exception!" except KeyError as e: - assert str(e) == ( - "'Key Error: b [key (b) could not be resolved, " - "closest non-matches = b => c [100.0], b => a [" - "100.0], b => d [100.0]]'" + msg = str(e.args[0]) + assert ( + msg == "Key Error: b " + '["b" could not be resolved, did you mean "c", "a", or "d"?]' ) - A = InMemory(source, tiebreaker_mode="lesser") + A = InMemoryValidator(source, tiebreaker_mode="lesser") assert A["b"].value == "a" - A = InMemory(source, tiebreaker_mode="greater") + A = InMemoryValidator(source, tiebreaker_mode="greater") assert A["b"].value == "d" diff --git a/tests/in_memory/test_in_memory_fuzz.py b/tests/in_memory/test_in_memory_fuzz.py index b6bdb93..bbbbced 100644 --- a/tests/in_memory/test_in_memory_fuzz.py +++ b/tests/in_memory/test_in_memory_fuzz.py @@ -1,31 +1,45 @@ +from typing import Annotated, Optional from pydantic import BaseModel, ValidationError -from fuzztypes import NamedEntity, InMemory, flags - -FruitStr = InMemory( - ["Apple", "Banana"], - search_flag=flags.FuzzSearch, -) -DirectionStr = InMemory( - [ - ("Left", "L"), - ("Right", "R"), - ("Middle", "M"), - ], - search_flag=flags.FuzzSearch, -) -LooseStr = InMemory( - ["A B C", "X Y Z"], - min_similarity=10.0, - limit=1, - search_flag=flags.FuzzSearch, -) -StrictStr = InMemory( - ["A B C", "X Y Z"], - min_similarity=95.0, - limit=1, - search_flag=flags.FuzzSearch, -) +from fuzztypes import 
NamedEntity, InMemoryValidator, flags, validate_python + +FruitStr = Annotated[ + Optional[str], + InMemoryValidator( + ["Apple", "Banana"], + search_flag=flags.FuzzSearch, + ), +] + +DirectionStr = Annotated[ + Optional[str], + InMemoryValidator( + [ + ("Left", "L"), + ("Right", "R"), + ("Middle", "M"), + ], + search_flag=flags.FuzzSearch, + ), +] +LooseStr = Annotated[ + Optional[str], + InMemoryValidator( + ["A B C", "X Y Z"], + min_similarity=10.0, + limit=1, + search_flag=flags.FuzzSearch, + ), +] +StrictStr = Annotated[ + str, + InMemoryValidator( + ["A B C", "X Y Z"], + min_similarity=95.0, + limit=1, + search_flag=flags.FuzzSearch, + ), +] class Model(BaseModel): @@ -60,12 +74,12 @@ def test_synonyms(): def test_get_item(): - assert DirectionStr["L"].value == "Left" + assert validate_python(DirectionStr, "L") == "Left" try: - assert DirectionStr["XYZ"] + assert validate_python(DirectionStr, "XYZ") raise AssertionError("Didn't throw KeyError") - except KeyError: + except ValidationError: pass @@ -79,11 +93,10 @@ def test_min_score(): except ValidationError as e: assert e.errors(include_url=False) == [ { - "ctx": {"key": "B K L", "near": ["A B C [40.0]"]}, + "ctx": {"key": "B K L"}, "input": "B K L", "loc": ("strict",), - "msg": "key (B K L) could not be resolved, " - "closest non-matches = A B C [40.0]", + "msg": '"B K L" could not be resolved, did you mean "A B C"?', "type": "key_not_found", } ] @@ -103,7 +116,7 @@ def test_with_priority(): assert sorted(entities)[1].value == "WP1" # validate that priority wins - WithPriority = InMemory( + WithPriority = InMemoryValidator( entities, min_similarity=65.0, search_flag=flags.FuzzSearch, @@ -113,7 +126,7 @@ def test_with_priority(): def test_without_tiebreaker(): entities = ["NT1", "NT2", "NT3"] - WithoutPriority = InMemory( + WithoutPriority = InMemoryValidator( entities, min_similarity=65.0, search_flag=flags.FuzzSearch, @@ -126,7 +139,7 @@ def test_without_tiebreaker(): def test_with_lesser_tiebreaker(): 
entities = ["NT1", "NT2", "NT3"] - LesserTiebreak = InMemory( + LesserTiebreak = InMemoryValidator( entities, min_similarity=65, tiebreaker_mode="lesser", @@ -137,7 +150,7 @@ def test_with_lesser_tiebreaker(): def test_with_greater_tiebreaker(): entities = ["NT1", "NT2", "NT3", "XX5"] - GreaterTiebreak = InMemory( + GreaterTiebreak = InMemoryValidator( entities, min_similarity=0, tiebreaker_mode="greater", diff --git a/tests/in_memory/test_in_memory_name.py b/tests/in_memory/test_in_memory_name.py index 78e118d..7a3daf8 100644 --- a/tests/in_memory/test_in_memory_name.py +++ b/tests/in_memory/test_in_memory_name.py @@ -1,14 +1,18 @@ -from typing import Optional +from typing import Annotated, Optional from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, InMemory, flags +from fuzztypes import NamedEntity, InMemoryValidator, flags names = ["George Washington", "John Adams", "Thomas Jefferson"] -President = InMemory(names, search_flag=flags.NameSearch) -CasedPrez = InMemory(names, case_sensitive=True, search_flag=flags.NameSearch) -NullPrez = InMemory(names, notfound_mode="none", search_flag=flags.NameSearch) -AllowPrez = InMemory( +President = InMemoryValidator(names, search_flag=flags.NameSearch) +CasedPrez = InMemoryValidator( + names, case_sensitive=True, search_flag=flags.NameSearch +) +NullPrez = InMemoryValidator( + names, notfound_mode="none", search_flag=flags.NameSearch +) +AllowPrez = InMemoryValidator( names, notfound_mode="allow", search_flag=flags.NameSearch ) @@ -31,7 +35,7 @@ def test_namestr_getitem(): def test_uncased_name_str(): class Example(BaseModel): - value: President + value: Annotated[str, President] # exact match assert Example(value="George Washington").value == "George Washington" @@ -42,7 +46,7 @@ class Example(BaseModel): def test_cased_name_str(): class Example(BaseModel): - value: CasedPrez + value: Annotated[str, CasedPrez] # exact match assert Example(value="George Washington").value == "George 
Washington" @@ -57,7 +61,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(None) + value: Annotated[Optional[str], NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/in_memory/test_in_memory_similarity.py b/tests/in_memory/test_in_memory_similarity.py index 9ad8ea2..e96671c 100644 --- a/tests/in_memory/test_in_memory_similarity.py +++ b/tests/in_memory/test_in_memory_similarity.py @@ -1,13 +1,15 @@ import pytest from fuzztypes import flags -from fuzztypes.in_memory import InMemoryStorage +from fuzztypes.in_memory import InMemoryValidatorStorage from fuzztypes.lazy import create_reranker @pytest.fixture(scope="session") def EmotionMemoryStorage(EmotionSource): - storage = InMemoryStorage(EmotionSource, search_flag=flags.SemanticSearch) + storage = InMemoryValidatorStorage( + EmotionSource, search_flag=flags.SemanticSearch + ) storage.prepare() return storage diff --git a/tests/in_memory/test_in_memory_tags_example.py b/tests/in_memory/test_in_memory_tags_example.py index d2e0d4b..9721494 100644 --- a/tests/in_memory/test_in_memory_tags_example.py +++ b/tests/in_memory/test_in_memory_tags_example.py @@ -5,17 +5,24 @@ Collected tags from his website here: https://simonwillison.net/tags/ -Future Goal: Move to OnDisk implementation with NotFound=Allow where the +Future Goal: Move to OnDiskValidator implementation with NotFound=Allow where the tags are added to the database incrementally for future fuzzy matching. 
https://github.com/quickwit-oss/tantivy-py/issues/20 https://docs.rs/tantivy/latest/tantivy/query/struct.FuzzyTermQuery.html """ -from typing import List +from typing import Annotated, List from pydantic import BaseModel from pytest import fixture -from fuzztypes import EntitySource, InMemory, flags +from fuzztypes import ( + EntitySource, + InMemoryValidator, + flags, + validate_entity, + validate_python, + Entity, +) @fixture(scope="session") @@ -32,45 +39,64 @@ def Tag(TagSource): # min_similarity is very low for demo # QRatio used because tags are single "words" (e.g. sqlinjection) - return InMemory( - TagSource, - notfound_mode="allow", - search_flag=flags.FuzzSearch, - min_similarity=50.0, - fuzz_scorer="QRatio", - ) + return Annotated[ + str, + InMemoryValidator( + TagSource, + notfound_mode="allow", + search_flag=flags.FuzzSearch, + min_similarity=50.0, + fuzz_scorer="QRatio", + ), + ] -def test_fuzzy_tags_priority(Tag): - # exact matches - # priority is topic prevalence, higher wins. 
- assert Tag["2d"].priority == 3 - assert Tag["3d"].priority == 14 +def test_get_entity_from_annotation(Tag): + entity = validate_entity(Tag, "2d") + assert isinstance(entity, Entity) + assert entity.priority == 3 + + entity = validate_entity(Tag, "3d") + assert isinstance(entity, Entity) + assert entity.priority == 14 + +def test_fuzzy_tags_priority(Tag): # since min_similarity is 50.0, it chooses higher priority - assert Tag("4d") == "3d" + assert validate_python(Tag, "4d") == "3d" # matches because 67% ratio > 50.0 minimum - assert Tag("27d") == "2d" + assert validate_python(Tag, "27d") == "2d" # less than 50% similarity is passed through (notfound_mode="allow") - assert Tag("17d") == "17d" + assert validate_python(Tag, "17d") == "17d" # different - assert Tag("18d") == "i18n" + assert validate_python(Tag, "18d") == "i18n" # todo: collect allowed tags and use for future fuzzy matching - # assert Tag("15d") == "17d" - assert Tag("15d") == "15d" + # assert validate_python(Tag, "15d") == "17d" + assert validate_python(Tag, "15d") == "15d" def test_fuzzy_scoring_edge_cases(Tag): - assert Tag("prompt_injection") == "promptinjection" - assert Tag("promptinjections") == "promptinjection" - assert Tag("prompt injections") == "promptinjection" - + assert validate_python(Tag, "prompt_injection") == "promptinjection" + assert validate_python(Tag, "promptinjections") == "promptinjection" + assert validate_python(Tag, "prompt injections") == "promptinjection" + + +def test_as_a_list_of_tags(TagSource): + Tag = Annotated[ + str, + InMemoryValidator( + TagSource, + notfound_mode="allow", + search_flag=flags.FuzzSearch, + min_similarity=50.0, + fuzz_scorer="QRatio", + ), + ] -def test_as_a_list_of_tags(Tag): class Post(BaseModel): text: str tags: List[Tag] diff --git a/tests/on_disk/test_on_disk_alias.py b/tests/on_disk/test_on_disk_alias.py index f64b1cc..1349060 100644 --- a/tests/on_disk/test_on_disk_alias.py +++ b/tests/on_disk/test_on_disk_alias.py @@ -1,17 +1,21 @@ +from 
typing import Annotated + import pytest from pydantic import BaseModel, ValidationError -from fuzztypes import OnDisk, flags +from fuzztypes import OnDiskValidator, flags @pytest.fixture(scope="session") def MythicalFigure(MythSource): - return OnDisk("MythicalFigure", MythSource, search_flag=flags.AliasSearch) + return OnDiskValidator( + "MythicalFigure", MythSource, search_flag=flags.AliasSearch + ) @pytest.fixture(scope="session") def CasedMythicalFigure(MythSource): - return OnDisk( + return OnDiskValidator( "CasedMythicalFigure", MythSource, search_flag=flags.AliasSearch, @@ -37,7 +41,7 @@ def test_alias_cased_getitem(CasedMythicalFigure): def test_uncased_alias_str(MythicalFigure): class Example(BaseModel): - value: MythicalFigure + value: Annotated[str, MythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -49,7 +53,7 @@ class Example(BaseModel): def test_cased_alias_str(CasedMythicalFigure): class Example(BaseModel): - value: CasedMythicalFigure + value: Annotated[str, CasedMythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -63,21 +67,21 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], ["a", "b"], ["d", "b"]] - A = OnDisk("DupeRec", source, tiebreaker_mode="raise") + A = OnDiskValidator("DupeRec", source) assert A["a"].value == "a" try: assert A["b"].value == "a" assert False, "Didn't raise exception!" 
except KeyError as e: - assert str(e) == ( - "'Key Error: b [key (b) could not be resolved, " - "closest non-matches = b => c [100.0], b => a [" - "100.0], b => d [100.0]]'" + assert ( + str(e) + == '\'Key Error: b ' + '["b" could not be resolved, did you mean "c", "a", or "d"?]\'' ) - A = OnDisk("DupeRec", source, tiebreaker_mode="lesser") + A = OnDiskValidator("DupeRec", source, tiebreaker_mode="lesser") assert A["b"].value == "a" - A = OnDisk("DupeRec", source, tiebreaker_mode="greater") + A = OnDiskValidator("DupeRec", source, tiebreaker_mode="greater") assert A["b"].value == "d" diff --git a/tests/on_disk/test_on_disk_fuzz.py b/tests/on_disk/test_on_disk_fuzz.py index 96eca24..ba5b0ed 100644 --- a/tests/on_disk/test_on_disk_fuzz.py +++ b/tests/on_disk/test_on_disk_fuzz.py @@ -1,11 +1,13 @@ import os -import tantivy -from fuzztypes import Fuzzmoji, const + +import tantivy # type: ignore + +from fuzztypes import Fuzzmoji, const, validate_python def test_tantivy(): # make sure the index is built - assert Fuzzmoji("balloon") == "🎈" + assert validate_python(Fuzzmoji, "balloon") == "🎈" # standard schema schema_builder = tantivy.SchemaBuilder() @@ -14,7 +16,9 @@ def test_tantivy(): schema = schema_builder.build() # create the index - path = os.path.join(const.OnDiskPath, "Fuzzmoji.lance/_indices/tantivy") + path = os.path.join( + const.StoredValidatorPath, "Fuzzmoji.lance/_indices/tantivy" + ) index = tantivy.Index(schema, path=path) searcher = index.searcher() @@ -38,5 +42,5 @@ def test_tantivy(): def test_fuzzmoji(): - assert Fuzzmoji("thought bubble") == "💭" - assert Fuzzmoji("bubble team") == "🧋" + assert validate_python(Fuzzmoji, "thought bubble") == "💭" + assert validate_python(Fuzzmoji, "bubble team") == "🧋" diff --git a/tests/on_disk/test_on_disk_name.py b/tests/on_disk/test_on_disk_name.py index 8db8d5d..7578e3f 100644 --- a/tests/on_disk/test_on_disk_name.py +++ b/tests/on_disk/test_on_disk_name.py @@ -1,28 +1,28 @@ -from typing import Optional +from typing 
import Annotated, Optional from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, OnDisk, flags +from fuzztypes import NamedEntity, OnDiskValidator, flags, validate_entity names = ["George Washington", "John Adams", "Thomas Jefferson"] -President = OnDisk( +President = OnDiskValidator( "President", names, search_flag=flags.NameSearch, ) -CasedPrez = OnDisk( +CasedPrez = OnDiskValidator( "CasedPrez", names, case_sensitive=True, search_flag=flags.NameSearch, ) -NullPrez = OnDisk( +NullPrez = OnDiskValidator( "NullPrez", names, notfound_mode="none", search_flag=flags.NameSearch, ) -AllowPrez = OnDisk( +AllowPrez = OnDiskValidator( "AllowPrez", names, notfound_mode="allow", @@ -34,6 +34,7 @@ def test_namestr_getitem(): entity = NamedEntity(value="Thomas Jefferson") assert President["Thomas Jefferson"] == entity assert President["THOMAS JEFFERSON"] == entity + assert validate_entity(President, "Thomas Jefferson") == entity assert CasedPrez["Thomas Jefferson"] == entity try: @@ -48,7 +49,7 @@ def test_namestr_getitem(): def test_uncased_name_str(): class Example(BaseModel): - value: President + value: Annotated[str, President] # exact match assert Example(value="George Washington").value == "George Washington" @@ -59,7 +60,7 @@ class Example(BaseModel): def test_cased_name_str(): class Example(BaseModel): - value: CasedPrez + value: Annotated[str, CasedPrez] # exact match assert Example(value="George Washington").value == "George Washington" @@ -74,7 +75,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(None) + value: Annotated[Optional[str], NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/on_disk/test_on_disk_semantic.py b/tests/on_disk/test_on_disk_semantic.py index 9016866..f4e8e4b 100644 --- a/tests/on_disk/test_on_disk_semantic.py +++ 
b/tests/on_disk/test_on_disk_semantic.py @@ -1,25 +1,25 @@ import pytest from pydantic import BaseModel -from fuzztypes import flags, on_disk, Vibemoji +from fuzztypes import flags, on_disk, Vibemoji, validate_python @pytest.fixture(scope="session") -def EmotionOnDiskStorage(EmotionSource): - storage = on_disk.OnDiskStorage( +def EmotionStoredValidatorStorage(EmotionSource): + storage = on_disk.StoredValidatorStorage( "Emotions", EmotionSource, search_flag=flags.SemanticSearch ) storage.prepare(force_drop_table=True) return storage -def test_check_storage_directly(EmotionOnDiskStorage): - matches = EmotionOnDiskStorage.get("happiness") +def test_check_storage_directly(EmotionStoredValidatorStorage): + matches = EmotionStoredValidatorStorage.get("happiness") assert len(matches) == 1 assert matches[0].entity.value == "Happiness" assert matches[0].score == 100.0 - matches = EmotionOnDiskStorage.get("scared") + matches = EmotionStoredValidatorStorage.get("scared") assert len(matches) == 10 assert matches[0].entity.value == "Fear" assert matches[0].score == pytest.approx(91.23) @@ -30,8 +30,7 @@ class MyModel(BaseModel): def test_vibemoji_get_value(): - assert Vibemoji("bacon tastes good") == "🥓" - assert Vibemoji("take the bus to school") == "🚌" - assert Vibemoji("jolly santa") == "🎅" - assert Vibemoji("st. nick") == "🇲🇫" # can't win them all! 
- assert Vibemoji("United States") == "🇺🇸" + assert validate_python(Vibemoji, "bacon tastes good") == "🥓" + assert validate_python(Vibemoji, "take the bus to school") == "🚌" + assert validate_python(Vibemoji, "jolly santa") == "🎅" + assert validate_python(Vibemoji, "United States") == "🇺🇸" diff --git a/tests/test_ascii.py b/tests/test_ascii.py index 4f48af5..6d20f47 100644 --- a/tests/test_ascii.py +++ b/tests/test_ascii.py @@ -1,12 +1,19 @@ -from pydantic import BaseModel +# -*- coding: utf-8 -*- + +from pydantic import BaseModel, TypeAdapter + from fuzztypes import ASCII -class MyModel(BaseModel): - ascii: ASCII +def test_ascii_usable_type(): + ta = TypeAdapter(ASCII) + assert ta.validate_python("άνθρωποι") == "anthropoi" def test_transliterate_utf8_to_ascii(): + class MyModel(BaseModel): + ascii: ASCII + obj = MyModel(ascii="άνθρωποι") assert obj.ascii == "anthropoi" diff --git a/tests/test_date.py b/tests/test_date.py index c50771d..a654e8f 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,26 +1,40 @@ from datetime import datetime, date +from typing import Annotated from zoneinfo import ZoneInfo from pydantic import BaseModel -from fuzztypes import Date, DateType, DatetimeType - -Y2K = datetime(2000, 1, 1, 0, 0, 0) +from fuzztypes import ( + Date, + DateValidator, + DatetimeValidator, + validate_python, + validate_json, +) ny_tz = ZoneInfo("America/New_York") +DateY2K = Annotated[ + datetime, + DatetimeValidator(relative_base=datetime(2000, 1, 1), timezone="EST"), +] + class MyModel(BaseModel): date: Date - time: DatetimeType(relative_base=Y2K, timezone="EST") + time: DateY2K -def test_fuzzy_date_time(): - obj = MyModel(date="11 July 2012", time="tomorrow 5am") +def test_validate_python_date_and_datetime(): + data = dict(date="11 July 2012", time="tomorrow 5am") + obj = validate_python(MyModel, data) assert obj.date == date(2012, 7, 11) assert obj.time == datetime(2000, 1, 2, 5, 0, 0, tzinfo=ny_tz) - obj = MyModel(date="July 4th", time="1 year 
ago") + +def test_validate_json_date_and_datetime(): + json = '{"date": "July 4th", "time": "1 year ago"}' + obj = validate_json(MyModel, json) today = date.today() year = today.year if (today.month, today.day) >= (7, 4) else today.year - 1 assert obj.date == date(year, 7, 4) @@ -35,19 +49,29 @@ def test_mdy_vs_ymd(): # MDY vs. YMD ordering is context specific # https://dateparser.readthedocs.io/en/latest/settings.html#date-order # - assert Date["02-03-04"].value == date(year=2004, month=2, day=3) - - DateEN = DateType(languages=["en"]) - assert DateEN["02-03-04"].value == date(year=2004, month=2, day=3) - - DateMDY = DateType(date_order="MDY") - assert DateMDY["02-03-04"].value == date(year=2004, month=2, day=3) - - DateES = DateType(languages=["es"]) - assert DateES["02-03-04"].value == date(year=2004, month=3, day=2) - - DateDMY = DateType(date_order="DMY") - assert DateDMY["02-03-04"].value == date(year=2004, month=3, day=2) - - DateYMD = DateType(date_order="YMD") - assert DateYMD["02-03-04"].value == date(year=2002, month=3, day=4) + assert validate_python(Date, "02-03-04") == date(year=2004, month=2, day=3) + + DateEN = Annotated[date, DateValidator(languages=["en"])] + assert validate_python(DateEN, "02-03-04") == date( + year=2004, month=2, day=3 + ) + + DateMDY = Annotated[date, DateValidator(date_order="MDY")] + assert validate_python(DateMDY, "02-03-04") == date( + year=2004, month=2, day=3 + ) + + DateES = Annotated[date, DateValidator(languages=["es"])] + assert validate_python(DateES, "02-03-04") == date( + year=2004, month=3, day=2 + ) + + DateDMY = Annotated[date, DateValidator(date_order="DMY")] + assert validate_python(DateDMY, "02-03-04") == date( + year=2004, month=3, day=2 + ) + + DateYMD = Annotated[date, DateValidator(date_order="YMD")] + assert validate_python(DateYMD, "02-03-04") == date( + year=2002, month=3, day=4 + ) diff --git a/tests/test_emoji.py b/tests/test_emoji.py index 709ace5..81cdbbb 100644 --- a/tests/test_emoji.py +++ 
b/tests/test_emoji.py @@ -1,12 +1,12 @@ -from fuzztypes import Emoji, emojis +from fuzztypes import Emoji, emojis, validate_python def test_key_access(): - assert Emoji("balloon") == "🎈" - assert Emoji(":atm_sign:") == "🏧" - assert Emoji("atm sign") == "🏧" - assert Emoji("atm") == "🏧" - assert Emoji("United States") == "🇺🇸" + assert validate_python(Emoji, "balloon") == "🎈" + assert validate_python(Emoji, ":atm_sign:") == "🏧" + assert validate_python(Emoji, "atm sign") == "🏧" + assert validate_python(Emoji, "atm") == "🏧" + assert validate_python(Emoji, "United States") == "🇺🇸" def test_load_emojis(): diff --git a/tests/test_entity.py b/tests/test_entity.py index 4ec187d..3fc6ed8 100644 --- a/tests/test_entity.py +++ b/tests/test_entity.py @@ -1,4 +1,4 @@ -from fuzztypes import NamedEntity, InMemory, EntitySource +from fuzztypes import NamedEntity, InMemoryValidator, EntitySource def test_entity_conv(): @@ -47,7 +47,7 @@ def test_meta_edge_cases(): def test_csv_load(EmojiSource): - Emoji = InMemory(EmojiSource) + Emoji = InMemoryValidator(EmojiSource) assert Emoji["happy"].value == "happy" assert Emoji["🎉"].value == "party" assert Emoji["party"].rank < Emoji["celebrate"].rank @@ -56,13 +56,13 @@ def test_csv_load(EmojiSource): def test_jsonl_load_animal(AnimalSource): assert AnimalSource[0].value == "Dog" - AnimalStr = InMemory(AnimalSource) + AnimalStr = InMemoryValidator(AnimalSource) assert AnimalStr["dog"] == AnimalSource[0] assert AnimalStr["Bird of prey"].value == "Eagle" def test_jsonl_label_source(FruitSource): - FruitStr = InMemory( + FruitStr = InMemoryValidator( FruitSource, case_sensitive=True, notfound_mode="none", @@ -72,7 +72,7 @@ def test_jsonl_label_source(FruitSource): def test_tsv_load(MythSource): - Myth = InMemory(MythSource) + Myth = InMemoryValidator(MythSource) assert Myth["Pallas"].value == "Athena" assert Myth["Jupiter"].value == "Zeus" @@ -82,4 +82,6 @@ def fn(): return [NamedEntity(value="hi!")] source = EntitySource(source=fn) - assert 
source[0].value == "hi!" + entity = source[0] + assert isinstance(entity, NamedEntity) + assert entity.value == "hi!" diff --git a/tests/test_function.py b/tests/test_function.py deleted file mode 100644 index ff45598..0000000 --- a/tests/test_function.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel, Field - -from fuzztypes import Function - -UpperType = Function(str.upper, examples=["A", "B", "C"]) -LowerType = Function(str.lower, examples=["a", "b", "c"]) - - -# Example usage -class MyClass(BaseModel): - my_upper: UpperType - my_lower: Optional[LowerType] = Field(None) - - -def test_simple_transforms(): - obj = MyClass(my_upper="Abc", my_lower="ABc") - assert obj.my_upper == "ABC" - assert obj.my_lower == "abc" - - -def test_getitem_upper(): - assert UpperType("hello") == "HELLO" - - -def test_class_getitem(): - StripType = Function(str.strip) - assert StripType(" a b c ") == "a b c" - - -def test_missing_lookup(): - def apple_banana(key: str) -> str: - return dict(a="apple", b="banana").get(key) - - AppleBanana = Function(apple_banana) - assert AppleBanana["a"].value == "apple" - assert AppleBanana("a") == "apple" - - try: - assert AppleBanana["c"] is not None - assert False, "Didn't throw exception." 
- except KeyError: - pass - - NoAppleBananaOk = Function(apple_banana, notfound_mode="none") - assert NoAppleBananaOk["d"] is None - - AnyFruitOk = Function(apple_banana, notfound_mode="allow") - assert AnyFruitOk("kiwi") == "kiwi" - - -def test_json_schema(): - assert MyClass.model_json_schema() == { - "properties": { - "my_lower": { - "anyOf": [ - { - "examples": ["a", "b", "c"], - "type": "string", - }, - {"type": "null"}, - ], - "default": None, - "title": "My Lower", - }, - "my_upper": { - "examples": ["A", "B", "C"], - "title": "My Upper", - "type": "string", - }, - }, - "required": ["my_upper"], - "title": "MyClass", - "type": "object", - } diff --git a/tests/test_integer.py b/tests/test_integer.py index 3c83945..d7a11f6 100644 --- a/tests/test_integer.py +++ b/tests/test_integer.py @@ -1,34 +1,30 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Integer - - -class MyModel(BaseModel): - num: Integer +from fuzztypes import Integer, validate_python def test_convert_number_to_int(): - assert MyModel(num=3).num == 3 - assert MyModel(num="three").num == 3 - assert MyModel(num="third").num == 3 - assert MyModel(num="nineteen billion and nineteen").num == 19_000_000_019 + assert validate_python(Integer, 3) == 3 + assert validate_python(Integer, "three") == 3 + assert validate_python(Integer, "third") == 3 + assert ( + validate_python(Integer, "nineteen billion and nineteen") + == 19_000_000_019 + ) assert ( - MyModel(num="two million three thousand and nineteen").num == 2_003_019 + validate_python(Integer, "two million three thousand and nineteen") + == 2_003_019 ) def test_validation_error(): + class MyModel(BaseModel): + num: Integer + + assert MyModel(num="three").num == 3 # type: ignore[arg-type] + try: - assert MyModel(num="xyz") + assert MyModel(num="xyz") # type: ignore[arg-type] assert False, "Didn't fail to parse non-integer." 
except ValidationError: pass - - -def test_json_schema(): - assert MyModel.model_json_schema() == { - "properties": {"num": {"title": "Num", "type": "integer"}}, - "required": ["num"], - "title": "MyModel", - "type": "object", - } diff --git a/tests/test_language.py b/tests/test_language.py new file mode 100644 index 0000000..ec3f56f --- /dev/null +++ b/tests/test_language.py @@ -0,0 +1,72 @@ +from pydantic import BaseModel + +from fuzztypes import ( + Language, + LanguageCode, + LanguageName, + validate_python, + LanguageNamedEntity, + LanguageScope, + LanguageType, +) +from fuzztypes.language import load_languages + + +def test_load_languages(): + source = load_languages() + entities = source() + assert len(entities) == 7910 + assert entities[0].resolve() == "Ghotuo" + + +def test_language_model_resolution(): + class Model(BaseModel): + language_code: LanguageCode + language_name: LanguageName + language: Language + + # Test that Language resolves to the complete language object + data = dict(language_code="en", language="English", language_name="ENG") + obj = validate_python(Model, data) + assert obj.language_code == "en" + assert obj.language_name == "English" + assert obj.language.scope == LanguageScope.INDIVIDUAL + assert obj.language.type == LanguageType.LIVING + assert isinstance(obj.language, LanguageNamedEntity) + assert obj.model_dump(exclude_defaults=True, mode="json") == { + "language": { + "aliases": ["en", "eng"], + "alpha_2": "en", + "alpha_3": "eng", + "scope": "I", + "type": "L", + "value": "English", + }, + "language_code": "en", + "language_name": "English", + } + + +def test_matching_edge_cases(): + # 'En' is a proper name of a language + assert validate_python(LanguageName, "En") == "En" + assert validate_python(LanguageCode, "En") == "enc" + + # 'en' is the alpha2 code for English + assert validate_python(LanguageName, "en") == "English" + assert validate_python(LanguageCode, "en") == "en" + + # Bangla is common name for Bengali + assert 
validate_python(LanguageName, "Bangla") == "Bengali" + assert validate_python(LanguageCode, "Bangla") == "bn" + assert validate_python(Language, "Bangla").model_dump( + exclude_defaults=True, mode="json" + ) == { + "aliases": ["bn", "ben", "Bangla"], + "alpha_2": "bn", + "alpha_3": "ben", + "common_name": "Bangla", + "scope": "I", + "type": "L", + "value": "Bengali", + } diff --git a/tests/test_person.py b/tests/test_person.py index 9c3b489..432def5 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Person +from fuzztypes import Person, validate_python class MyModel(BaseModel): @@ -11,32 +11,36 @@ class MyModel(BaseModel): def test_example(): - obj = MyModel(person="Mr. John (Johnny) Q. Public IV") - assert str(obj.person) == "Mr. John Q. Public IV (Johnny)" - assert obj.person.last_name_first == "Public, John Q." - assert obj.person.short_name == "John Public" - assert obj.person.legal_name == "John Q. Public IV" - assert obj.person.full_name == "Mr. John Q. Public IV (Johnny)" + person = validate_python(Person, "Mr. John (Johnny) Q. Public IV") + assert str(person) == "Mr. John Q. Public IV (Johnny)" + assert person.last_name_first == "Public, John Q." + assert person.short_name == "John Public" + assert person.legal_name == "John Q. Public IV" + assert person.full_name == "Mr. John Q. Public IV (Johnny)" - assert obj.person.initials == "J. Q. P." - assert obj.person.full_initials == "J. Q. P." - assert obj.person.short_initials == "J. P." + assert person.initials == "J. Q. P." + assert person.full_initials == "J. Q. P." + assert person.short_initials == "J. P." 
- obj2 = MyModel(person=obj.person) - assert obj2.person == obj.person - assert obj2.person.human_name() == obj.person.human_name() + obj2 = MyModel(person=person) + assert obj2.person == person + assert obj2.person.human_name() == person.human_name() assert obj2.optional is None -def test_mixed_capitalization(): - obj = MyModel(person="shirley maclaine") - assert obj.person.first == "Shirley" - assert obj.person.last == "MacLaine" +def test_mixed_capitalization_with_validate_python(): + person = validate_python(Person, "shirley maclaine") + assert person.first == "Shirley" + assert person.last == "MacLaine" + + +def test_null_person_ok(): + assert validate_python(Optional[Person], None) is None def test_different_nickname_format_oh_well(): - obj = MyModel(person="Arthur 'The Fonz' Fonzerelli") + obj = validate_python(MyModel, dict(person="Arthur 'The Fonz' Fonzerelli")) assert obj.person.first == "Arthur" assert obj.person.last == "Fonzerelli" assert obj.person.middle == "'the Fonz'" @@ -44,106 +48,41 @@ def test_different_nickname_format_oh_well(): def test_json_serialization(): - json = '{"person": "Grace Hopper"}' + json = '{"person": "Grace Hopper", "optional": null}' obj = MyModel.model_validate_json(json) assert str(obj.person) == "Grace Hopper" + assert obj.optional is None - data = dict(person="grace hopper") + data = dict(person="grace hopper", optional="ava lovelace") obj = MyModel.model_validate(data) assert str(obj.person) == "Grace Hopper" + assert str(obj.optional) == "Ava Lovelace" json = obj.model_dump_json(exclude_defaults=True) - assert json == '{"person":{"first":"Grace","last":"Hopper"}}' + assert ( + json == '{"person":{"first":"Grace","last":"Hopper"},' + '"optional":{"first":"Ava","last":"Lovelace"}}' + ) obj = MyModel.model_validate_json(json) data = obj.model_dump(exclude_defaults=True) - assert data == dict(person=dict(first="Grace", last="Hopper")) + assert data == dict( + person=dict(first="Grace", last="Hopper"), + 
optional=dict(first="Ava", last="Lovelace"), + ) def test_value_error(): try: - assert MyModel(person=None).person is None + data: dict = {} + validate_python(MyModel, data) assert False, "Didn't fail as expected." except ValidationError: pass try: - assert MyModel(person=5) + data = dict(person=5) + validate_python(MyModel, data) assert False, "Didn't fail as expected." except ValueError: pass - - -def test_json_schema(): - assert MyModel.model_json_schema() == { - "$defs": { - "PersonModel": { - "properties": { - "first": { - "default": "", - "title": "First", - "type": "string", - }, - "init_format": { - "default": "{first} " "{middle} " "{last}", - "title": "Init " "Format", - "type": "string", - }, - "last": {"default": "", "title": "Last", "type": "string"}, - "middle": { - "default": "", - "title": "Middle", - "type": "string", - }, - "name_format": { - "default": "{title} " - "{first} " - "{middle} " - "{last} " - "{suffix} " - "({nickname})", - "title": "Name " "Format", - "type": "string", - }, - "nickname": { - "default": "", - "title": "Nickname", - "type": "string", - }, - "suffix": { - "default": "", - "title": "Suffix", - "type": "string", - }, - "title": { - "default": "", - "title": "Title", - "type": "string", - }, - }, - "title": "PersonModel", - "type": "object", - } - }, - "properties": { - "person": { - "anyOf": [ - {"$ref": "#/$defs/PersonModel"}, - {"type": "string"}, - ], - "title": "Person", - }, - "optional": { - "anyOf": [ - {"$ref": "#/$defs/PersonModel"}, - {"type": "string"}, - {"type": "null"}, - ], - "default": None, - "title": "Optional", - }, - }, - "required": ["person"], - "title": "MyModel", - "type": "object", - } diff --git a/tests/test_readme.py b/tests/test_readme.py new file mode 100644 index 0000000..445dd77 --- /dev/null +++ b/tests/test_readme.py @@ -0,0 +1,281 @@ +from datetime import date, datetime +from typing import Annotated + +from pydantic import BaseModel + +from fuzztypes import ( + ASCII, + Datetime, + Email, 
+ Fuzzmoji, + InMemoryValidator, + Integer, + Person, + RegexValidator, + ZipCode, + flags, +) + + +# define a source, see EntitySource for using TSV, CSV, JSONL +inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] + +# define a in memory validator with fuzz search enabled. +Inventor = Annotated[ + str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) +] + +# custom Regex type for finding twitter handles. +Handle = Annotated[ + str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) +] + + +# define a Pydantic class with 9 fuzzy type attributes +class Fuzzy(BaseModel): + ascii: ASCII + email: Email + emoji: Fuzzmoji + handle: Handle + integer: Integer + inventor: Inventor + person: Person + time: Datetime + zipcode: ZipCode + + +def test_full_model(): + # create an instance of class Fuzzy + obj = Fuzzy( + ascii="άνθρωπος", + email="John Doe ", + emoji="thought bubble", + handle="Ian Maurer (@imaurer)", + integer="fifty-five", # type: ignore[arg-type] + inventor="ada luvlace", + person="mr. arthur h. fonzarelli (fonzie)", # type: ignore[arg-type] + time="5am on Jan 1, 2025", # type: ignore[arg-type] + zipcode="(Zipcode: 12345-6789)", + ) + + # test the autocorrecting performed + + # greek for man: https://en.wiktionary.org/wiki/άνθρωπος + assert obj.ascii == "anthropos" + + # extract email via regular expression + assert obj.email == "jdoe@example.com" + + # fuzzy match "thought bubble" to "thought balloon" emoji + assert obj.emoji == "💭" + + # simple, inline regex example (see above Handle type) + assert obj.handle == "@imaurer" + + # convert integer word phrase to integer value + assert obj.integer == 55 + + # case-insensitive fuzzy match on lowercase, misspelled name + assert obj.inventor == "Ada Lovelace" + + # human name parser (title, first, middle, last, suffix, nickname) + assert str(obj.person) == "Mr. Arthur H. 
Fonzarelli (fonzie)" + assert obj.person.short_name == "Arthur Fonzarelli" + assert obj.person.nickname == "fonzie" + assert obj.person.last == "Fonzarelli" + + # convert time phrase to datetime object + assert obj.time.isoformat() == "2025-01-01T05:00:00" + + # extract zip5 or zip9 formats using regular expressions + assert obj.zipcode == "12345-6789" + + # print JSON on success + assert obj.model_dump() == { + "ascii": "anthropos", + "email": "jdoe@example.com", + "emoji": "💭", + "handle": "@imaurer", + "integer": 55, + "inventor": "Ada Lovelace", + "person": { + "first": "Arthur", + "init_format": "{first} {middle} {last}", + "last": "Fonzarelli", + "middle": "H.", + "name_format": "{title} {first} {middle} {last} {suffix} " + "({nickname})", + "nickname": "fonzie", + "suffix": "", + "title": "Mr.", + }, + "time": datetime(2025, 1, 1, 5), + "zipcode": "12345-6789", + } + + +def test_json_schema(): + data = Fuzzy.model_json_schema() + expected_data = { + "$defs": { + "PersonModel": { + "properties": { + "first": { + "default": "", + "title": "First", + "type": "string", + }, + "init_format": { + "default": "{first} " "{middle} " "{last}", + "title": "Init " "Format", + "type": "string", + }, + "last": {"default": "", "title": "Last", "type": "string"}, + "middle": { + "default": "", + "title": "Middle", + "type": "string", + }, + "name_format": { + "default": "{title} " + "{first} " + "{middle} " + "{last} " + "{suffix} " + "({nickname})", + "title": "Name " "Format", + "type": "string", + }, + "nickname": { + "default": "", + "title": "Nickname", + "type": "string", + }, + "suffix": { + "default": "", + "title": "Suffix", + "type": "string", + }, + "title": { + "default": "", + "title": "Title", + "type": "string", + }, + }, + "title": "PersonModel", + "type": "object", + } + }, + "properties": { + "ascii": {"title": "Ascii", "type": "string"}, + "email": { + "examples": ["user@example.com"], + "title": "Email", + "type": "string", + }, + "emoji": {"title": 
"Emoji", "type": "string"}, + "handle": { + "examples": ["@genomoncology"], + "title": "Handle", + "type": "string", + }, + "integer": {"title": "Integer", "type": "integer"}, + "inventor": {"title": "Inventor", "type": "string"}, + "person": {"$ref": "#/$defs/PersonModel"}, + "time": {"format": "date-time", "title": "Time", "type": "string"}, + "zipcode": { + "examples": ["12345", "12345-6789"], + "title": "Zipcode", + "type": "string", + }, + }, + "required": [ + "ascii", + "email", + "emoji", + "handle", + "integer", + "inventor", + "person", + "time", + "zipcode", + ], + "title": "Fuzzy", + "type": "object", + } + assert data == expected_data + + +def test_in_memory_validator(): + # Create a custom annotation type for matching fruits in memory + fruits = ["Apple", "Banana", "Orange"] + Fruit = Annotated[ + str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) + ] + + class MyModel(BaseModel): + fruit: Fruit + + model = MyModel(fruit="appel") + assert model.fruit == "Apple" + + +def test_on_disk_validator(): + from fuzztypes import OnDiskValidator + + # Create a custom annotation type for matching countries stored on disk + countries = [ + ("United States", "US"), + ("United Kingdom", "UK"), + ("Canada", "CA"), + ] + Country = Annotated[str, OnDiskValidator("Country", countries)] + + class MyModel(BaseModel): + country: Country + + assert MyModel(country="Canada").country == "Canada" + assert MyModel(country="US").country == "United States" + + +def test_date_validators(): + from fuzztypes import DateValidator, DatetimeValidator + + MyDate = Annotated[date, DateValidator(date_order="MDY")] + MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] + + class MyModel(BaseModel): + date: MyDate + time: MyTime + + model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") # type: ignore + assert model.date.isoformat() == "2023-01-01" + assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" + + +def test_fuzz_validator(): + from fuzztypes import 
FuzzValidator + + # Create a custom annotation type that converts a value to uppercase + UpperCase = Annotated[str, FuzzValidator(str.upper)] + + class MyModel(BaseModel): + name: UpperCase + + model = MyModel(name="john") + assert model.name == "JOHN" + + +def test_regex_validator(): + from fuzztypes import RegexValidator + + # Create a custom annotation type for matching email addresses + IPAddress = Annotated[ + str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") + ] + + class MyModel(BaseModel): + ip_address: IPAddress + + model = MyModel(ip_address="My internet IP address is 192.168.127.12") + assert model.ip_address == "192.168.127.12" diff --git a/tests/test_regex.py b/tests/test_regex.py index 72d5358..b094350 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -1,47 +1,50 @@ -from pydantic_core import PydanticCustomError +from pydantic import ValidationError -from fuzztypes import Email, SSN, ZipCode +from fuzztypes import Email, SSN, ZipCode, validate_python def test_email_regexer(): - assert Email("Jane Doe ") == "jdoe@example.com" - assert Email[""] == "jdoe@example.com" + assert ( + validate_python(Email, "Jane Doe ") + == "jdoe@example.com" + ) + assert validate_python(Email, "") == "jdoe@example.com" try: - assert Email["abc@xyz"] is not None + assert validate_python(Email, "abc@xyz") is not None assert False, "Invalid email did not fail!" 
- except KeyError: + except ValidationError: pass def test_valid_ssn(): # Value call - assert SSN("Valid SSN: 123-45-6789") == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" # Entity value comparison - assert SSN["Valid SSN: 123-45-6789"].value == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" # Entity equivalence to a value - assert SSN["Valid SSN: 123-45-6789"] == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" def test_valid_ssn_with_touching_bounding_chars(): - assert SSN("Valid SSN:123-45-6789.") == "123-45-6789" + assert validate_python(SSN, "Valid SSN:123-45-6789.") == "123-45-6789" def test_invalid_ssn_format(): try: - SSN("Invalid SSN: 123-456-789") + validate_python(SSN, "Invalid SSN: 123-456-789") assert False, "Invalid SSN format was accepted." - except PydanticCustomError: + except ValidationError: pass def test_ssn_needs_bounding_spaces(): try: - SSN("SSN text: abc123-45-6789xyz") + validate_python(SSN, "SSN text: abc123-45-6789xyz") assert False, "SSNs require some sort of bounding characters." - except PydanticCustomError: + except ValidationError: pass @@ -49,33 +52,36 @@ def test_multiple_ssns(): # This test depends on how you decide to handle multiple SSNs. multi_ssn_string = "Two SSNs: 123-45-6789 and 987-65-4321" try: - assert SSN(multi_ssn_string) is not None + assert validate_python(SSN, multi_ssn_string) is not None assert False, "Invalid SSN format was accepted." 
- except PydanticCustomError as e: + except ValidationError: pass def test_valid_zip_code_5_digits(): - assert ZipCode("Postal code: 12345") == "12345" + assert validate_python(ZipCode, "Postal code: 12345") == "12345" def test_valid_zip_code_9_digits(): - assert ZipCode("ZIP:12345-6789") == "12345-6789" + assert validate_python(ZipCode, "ZIP:12345-6789") == "12345-6789" def test_zip_code_within_text(): - assert ZipCode("Send it to 98765-4321, please.") == "98765-4321" + assert ( + validate_python(ZipCode, "Send it to 98765-4321, please.") + == "98765-4321" + ) def test_invalid_zip_code(): try: - ZipCode("Invalid ZIP: 1234") + validate_python(ZipCode, "Invalid ZIP: 1234") assert False, "Invalid ZIP code did not fail." - except PydanticCustomError: + except ValidationError: pass def test_zip_code_with_invalid_four_format(): # Python's re module does not support lookbehinds (?