From 43622ca0afc82e94cf14d1451b5164ade686ce6e Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Tue, 19 Mar 2024 14:40:50 -0400 Subject: [PATCH 01/15] Download files with caching. --- pyproject.toml | 1 + requirements-dev.txt | 2 + src/fuzztypes/__init__.py | 18 +++++- src/fuzztypes/const.py | 1 + src/fuzztypes/utils/__init__.py | 6 ++ src/fuzztypes/utils/download.py | 59 ++++++++++++++++++++ tests/utils/test_download.py | 99 +++++++++++++++++++++++++++++++++ 7 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 src/fuzztypes/utils/__init__.py create mode 100644 src/fuzztypes/utils/download.py create mode 100644 tests/utils/test_download.py diff --git a/pyproject.toml b/pyproject.toml index adbe1e8..8aee98d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dynamic = ["version"] [project.optional-dependencies] test = [ "pytest", + "pytest-mock", "coverage[toml]", ] local = [ diff --git a/requirements-dev.txt b/requirements-dev.txt index ec60578..0f478cb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -320,6 +320,8 @@ pylance==0.10.2 pyproject-hooks==1.0.0 # via build pytest==8.0.1 + # via pytest-mock +pytest-mock==3.12.0 python-dateutil==2.9.0.post0 # via # arrow diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index b52020a..71ad4e5 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -1,5 +1,17 @@ +__version__ = "0.0.2" + +# logging +import logging + +logger = logging.getLogger("fuzztypes") +logger.setLevel(logging.WARNING) + +# flags and constants from . import flags from . import const + +# utilities +from . import utils from . import lazy # Schema @@ -9,7 +21,7 @@ # Hidden Abstract Types from . 
import abstract -# Base Entity Types +# Base Named Entity Types from .in_memory import InMemory from .on_disk import OnDisk @@ -25,8 +37,6 @@ from .person import Person from .regex import Email, SSN, ZipCode -__version__ = "0.0.2" - __all__ = ( "ASCII", @@ -56,4 +66,6 @@ "const", "flags", "lazy", + "logger", + "utils", ) diff --git a/src/fuzztypes/const.py b/src/fuzztypes/const.py index 755dbbe..ad04e66 100644 --- a/src/fuzztypes/const.py +++ b/src/fuzztypes/const.py @@ -5,6 +5,7 @@ FuzzHome = "~/.local/fuzztypes/" FuzzHome = os.path.expanduser(os.environ.get("FUZZTYPES_HOME", FuzzHome)) OnDiskPath = os.path.join(FuzzHome, "on_disk") +DownloadsPath = os.path.join(FuzzHome, "downloads") # Default encoder to use when generating semantic embeddings. # Override with environment variable `FUZZTYPES_DEFAULT_ENCODER`. diff --git a/src/fuzztypes/utils/__init__.py b/src/fuzztypes/utils/__init__.py new file mode 100644 index 0000000..1e84b9f --- /dev/null +++ b/src/fuzztypes/utils/__init__.py @@ -0,0 +1,6 @@ +from .download import download_file, get_file + +__all__ = ( + "download_file", + "get_file", +) diff --git a/src/fuzztypes/utils/download.py b/src/fuzztypes/utils/download.py new file mode 100644 index 0000000..6727946 --- /dev/null +++ b/src/fuzztypes/utils/download.py @@ -0,0 +1,59 @@ +import os +import urllib.request +from datetime import datetime +from typing import Optional +from urllib.error import URLError, HTTPError + +from fuzztypes import logger, const + + +def get_file_age_in_days(file_path: str) -> int: + age = datetime.now() - datetime.fromtimestamp(os.path.getmtime(file_path)) + return age.days + + +def get_file(url: str, expires_in_days: int = 30) -> Optional[str]: + """ + Tries to retrieve a file from the cache or download it if not available + or expired. + + :param url: The URL of the original file to be downloaded. + :param expires_in_days: Expiration period for the cached file. + :return: Path to the downloaded file, or None if fails. 
+ """ + file_name = os.path.basename(url) + cache_file_path = os.path.join(const.DownloadsPath, file_name) + temp_download_path = f"{cache_file_path}.tmp" + + cache_ok = os.path.exists(cache_file_path) + if cache_ok: + file_age = get_file_age_in_days(cache_file_path) + cache_ok = file_age <= expires_in_days + + if not cache_ok: + download_success = download_file(url, temp_download_path) + if download_success: + os.replace(temp_download_path, cache_file_path) + cache_ok = os.path.exists(cache_file_path) + + if cache_ok: + return cache_file_path + + logger.error(f"Unable to download the file and no cached file: {url}") + + +def download_file(url, download_path): + """ + Attempt to download a file directly to a specified path. + If the download fails, logs a warning and returns None. + + :param url: The URL of the file to be downloaded. + :param download_path: The full file path where the file should be saved. + :return: Boolean indicating success or failure of the download. + """ + try: + urllib.request.urlretrieve(url, download_path) + return True + except (HTTPError, URLError, ValueError, OSError, Exception) as e: + logger.warning(f"Download (url={url}) failed: {e}", exc_info=True) + return False diff --git a/tests/utils/test_download.py b/tests/utils/test_download.py new file mode 100644 index 0000000..e61986c --- /dev/null +++ b/tests/utils/test_download.py @@ -0,0 +1,99 @@ +import os.path + +import pytest + +from fuzztypes.const import DownloadsPath +from fuzztypes.utils.download import get_file + + +@pytest.fixture +def mock_path_exists(mocker): + return mocker.patch("os.path.exists") + + +@pytest.fixture +def mock_getmtime(mocker): + return mocker.patch("os.path.getmtime") + + +@pytest.fixture +def mock_replace(mocker): + return mocker.patch("os.replace") + + +@pytest.fixture +def mock_file_age(mocker): + return mocker.patch("fuzztypes.utils.download.get_file_age_in_days") + + +@pytest.fixture +def mock_urlretrieve(mocker): + return 
mocker.patch("urllib.request.urlretrieve") + + +@pytest.fixture +def mock_logger_warning(mocker): + return mocker.patch("fuzztypes.logger.warning") + + +@pytest.fixture +def mock_logger_error(mocker): + return mocker.patch("fuzztypes.logger.error") + + +def test_get_file_cache_hit(mock_path_exists, mock_file_age, mock_replace): + mock_path_exists.return_value = True + mock_file_age.return_value = 10 + + result = get_file("http://example.com/file.txt") + assert result == os.path.join(DownloadsPath, "file.txt") + mock_replace.assert_not_called() + + +def test_cache_miss_due_to_expiry( + mock_path_exists, mock_file_age, mock_replace, mock_urlretrieve +): + mock_path_exists.return_value = True + mock_file_age.return_value = 31 + mock_urlretrieve.return_value = True + + result = get_file("http://example.com/file.txt") + assert result == os.path.join(DownloadsPath, "file.txt") + mock_replace.assert_called_once() + mock_urlretrieve.assert_called_once_with( + "http://example.com/file.txt", + os.path.join(DownloadsPath, "file.txt.tmp"), + ) + + +def test_cache_miss_due_to_absence( + mock_path_exists, mock_replace, mock_urlretrieve +): + mock_path_exists.side_effect = [ + False, + True, + ] # First call for cache check, second for download check + mock_urlretrieve.return_value = True # Simulate successful download + assert get_file("http://example.com/file.txt") is not None + mock_replace.assert_called_once() + mock_urlretrieve.assert_called_once() + + +def test_download_failure( + mock_path_exists, mock_logger_error, mock_urlretrieve +): + mock_path_exists.return_value = False + mock_urlretrieve.side_effect = Exception("Download failed") + assert get_file("http://example.com/file.txt") is None + mock_logger_error.assert_called_once() + + +def test_download_exception_handling( + mock_path_exists, mock_logger_warning, mock_urlretrieve +): + mock_path_exists.return_value = False + mock_urlretrieve.side_effect = Exception( + "Unexpected error" + ) # Simulate an exception during 
download + assert get_file("http://example.com/file.txt") is None + mock_logger_warning.assert_called_once() From c994002416267c435d2f2d01ffa0c7c0dce2d2a5 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Tue, 19 Mar 2024 16:46:46 -0400 Subject: [PATCH 02/15] Added Language from ISO CODES. --- src/fuzztypes/__init__.py | 4 ++ src/fuzztypes/abstract.py | 4 +- src/fuzztypes/entity.py | 5 +- src/fuzztypes/in_memory.py | 25 ++++++--- src/fuzztypes/language.py | 106 +++++++++++++++++++++++++++++++++++++ src/fuzztypes/match.py | 28 +++++++--- src/fuzztypes/on_disk.py | 30 +++++++---- tests/test_language.py | 60 +++++++++++++++++++++ 8 files changed, 235 insertions(+), 27 deletions(-) create mode 100644 src/fuzztypes/language.py create mode 100644 tests/test_language.py diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index 71ad4e5..ec54199 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -34,6 +34,7 @@ from .date import Date, DateType, Datetime, DatetimeType from .emojis import Emoji, Fuzzmoji, Vibemoji from .integer import Integer +from .language import Language, LanguageName, LanguageCode from .person import Person from .regex import Email, SSN, ZipCode @@ -49,6 +50,9 @@ "Fuzzmoji", "InMemory", "Integer", + "Language", + "LanguageCode", + "LanguageName", "Match", "MatchList", "NamedEntity", diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index b879257..a1953ab 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -107,7 +107,7 @@ def __new__(cls, key: str, _: Any = None) -> Optional[Any]: """ entity = cls.lookup(key) if entity: - return entity.value + return entity.resolve() @classmethod def __class_getitem__(cls, key) -> EntityType: @@ -169,6 +169,7 @@ def __init__( *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, device: const.DeviceList = "cpu", fuzz_scorer: str = "token_sort_ratio", limit: int = 10, @@ -183,6 
+184,7 @@ def __init__( # options self.case_sensitive = case_sensitive self.device = device + self.entity_type = entity_type self.limit = limit self.min_similarity = min_similarity self.prepped = False diff --git a/src/fuzztypes/entity.py b/src/fuzztypes/entity.py index 04c59bf..699401a 100644 --- a/src/fuzztypes/entity.py +++ b/src/fuzztypes/entity.py @@ -28,6 +28,9 @@ def __eq__(self, other: Any): other = getattr(other, "value", other) return self.value == other + def resolve(self): + return self.value + @property def rank(self) -> int: """Normalized by converting None to 0 and making lower better.""" @@ -83,7 +86,7 @@ def convert(cls, item: Union[str, dict, list, tuple, "NamedEntity"]): elif isinstance(item, str): item = dict(value=item) - return NamedEntity(**item) + return cls(**item) NamedEntityAdapter = TypeAdapter(NamedEntity) diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index ada8eb8..4253c43 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Callable, Iterable, Union, List, Dict +from typing import Callable, Iterable, Union, List, Dict, Type from pydantic import PositiveInt @@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs): def prepare(self): for item in self.source: - entity = NamedEntity.convert(item) + entity = self.entity_type.convert(item) self.add(entity) def add(self, entity: NamedEntity) -> None: @@ -46,13 +46,19 @@ def add(self, entity: NamedEntity) -> None: def add_by_name(self, entity: NamedEntity) -> None: term = entity.value - record = Record(entity=entity, term=term, is_alias=False) - self._mapping[self.normalize(term)].append(record) + norm_term = self.normalize(term) + record = Record( + entity=entity, term=term, norm_term=norm_term, is_alias=False + ) + self._mapping[norm_term].append(record) def add_by_alias(self, entity: NamedEntity) -> None: for term in entity.aliases: - record = Record(entity=entity, term=term, 
is_alias=True) - self._mapping[self.normalize(term)].append(record) + norm_term = self.normalize(term) + record = Record( + entity=entity, term=term, norm_term=norm_term, is_alias=True + ) + self._mapping[norm_term].append(record) def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: clean_name: str = self.fuzz_clean(entity.value) @@ -72,7 +78,9 @@ def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: def get(self, key: str) -> MatchList: records = self._mapping.get(self.normalize(key), []) - match_list = Record.from_list(records, key=key) + match_list = Record.from_list( + records, key=key, entity_type=self.entity_type + ) if not match_list: if self.search_flag.is_fuzz_ok: @@ -177,6 +185,7 @@ def InMemory( *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, examples: list = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", limit: PositiveInt = 10, @@ -199,7 +208,7 @@ def InMemory( return abstract.AbstractType( storage, - EntityType=NamedEntity, + EntityType=entity_type, examples=examples, input_type=str, notfound_mode=notfound_mode, diff --git a/src/fuzztypes/language.py b/src/fuzztypes/language.py new file mode 100644 index 0000000..d3a52e9 --- /dev/null +++ b/src/fuzztypes/language.py @@ -0,0 +1,106 @@ +import json +from enum import Enum +from typing import Optional, List, Iterable, Type + +from pydantic import TypeAdapter + +from fuzztypes import EntitySource, NamedEntity, OnDisk, flags, utils + + +class LanguageScope(Enum): + INDIVIDUAL = "I" + MACROLANGUAGE = "M" + SPECIAL = "S" + + +class LanguageType(Enum): + ANCIENT = "A" + CONSTRUCTED = "C" + EXTINCT = "E" + HISTORICAL = "H" + LIVING = "L" + SPECIAL = "S" + + +class LanguageNamedEntity(NamedEntity): + """Resolves to language full name.""" + + alpha_2: Optional[str] = None + alpha_3: str + scope: Optional[LanguageScope] = None + type: Optional[LanguageType] = None + common_name: Optional[str] = None + 
inverted_name: Optional[str] = None + bibliographic: Optional[str] = None + + @property + def code(self): + return self.alpha_2 or self.alpha_3 + + +class LanguageModelNamedEntity(LanguageNamedEntity): + """Resolves to self as a full child object.""" + + def resolve(self): + return self + + +class LanguageCodeNameEntity(LanguageNamedEntity): + """Resolves to code name.""" + + def resolve(self): + return self.code + + +remote = ( + "https://salsa.debian.org/iso-codes-team/iso-codes/-/raw/main/data" + "/iso_639-3.json" +) +local = utils.get_file(remote) + + +def load_languages(cls: Type[LanguageNamedEntity] = LanguageNamedEntity): + def do_load() -> Iterable[NamedEntity]: + data = json.load(open(local))["639-3"] + alias_fields = { + "alpha_2", + "alpha_3", + "common_name", + "inverted_name", + "bibliographic", + } + entities = [] + for item in data: + item["value"] = item.pop("name") + aliases = [v for k, v in item.items() if k in alias_fields] + item["aliases"] = aliases + entities.append(item) + return TypeAdapter(List[cls]).validate_python(data) + + return do_load + + +LanguageName = OnDisk( + "Language", + EntitySource(load_languages(LanguageNamedEntity)), + entity_type=LanguageNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", +) + +LanguageCode = OnDisk( + "Language", + EntitySource(load_languages(LanguageCodeNameEntity)), + entity_type=LanguageCodeNameEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", +) + +Language = OnDisk( + "Language", + EntitySource(load_languages(LanguageModelNamedEntity)), + entity_type=LanguageModelNamedEntity, + input_type=LanguageModelNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", +) diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index e943feb..abfefdc 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional, Iterator, Any, Union +from typing import List, Tuple, Optional, Iterator, Any, 
Union, Type from pydantic import BaseModel, Field @@ -99,19 +99,31 @@ def choose(self, min_score: float, tiebreaker_mode: const.TiebreakerMode): class Record(BaseModel): entity: Union[NamedEntity, str] term: str + norm_term: Optional[str] = None is_alias: bool vector: Any = None - def deserialize(self): + def deserialize(self, entity_type: Type[NamedEntity]): if isinstance(self.entity, str): - self.entity = NamedEntity.model_validate_json(self.entity) + self.entity = entity_type.model_validate_json(self.entity) @classmethod - def from_list(cls, recs: list, key, score: float = 100.0) -> List[Match]: - return [record.to_match(key, score) for record in recs] - - def to_match(self, key, score: float = 100.0) -> Match: - self.deserialize() + def from_list( + cls, + recs: list, + key, + score: float = 100.0, + entity_type: Type[NamedEntity] = NamedEntity, + ) -> List[Match]: + return [record.to_match(key, score, entity_type) for record in recs] + + def to_match( + self, + key, + score: float = 100.0, + entity_type: Type[NamedEntity] = NamedEntity, + ) -> Match: + self.deserialize(entity_type) return Match( key=key, entity=self.entity, diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index 3adcc5c..cdfa83b 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterable, Union, List +from typing import Callable, Iterable, Union, List, Type from pydantic import PositiveInt @@ -43,7 +43,7 @@ def prepare(self, force_drop_table: bool = False): if self.name not in table_names: try: self.create_table() - except Exception as e: + except Exception as e: # pragma: no cover # if any issue occurs, drop the table and re-raise error # in the future, handle errors better self.conn.drop_table(self.name) @@ -57,6 +57,7 @@ def create_table(self): schema = pa.schema( [ pa.field("term", pa.string()), + pa.field("norm_term", pa.string()), pa.field("entity", pa.string()), pa.field("is_alias", pa.string()), pa.field( @@ 
-110,7 +111,7 @@ def create_records(self): records = [] empty = [0.0] * self.vect_dimensions for item in self.source: - entity = NamedEntity.convert(item) + entity = self.entity_type.convert(item) json = entity.model_dump_json(exclude_defaults=True) terms = [] @@ -125,13 +126,14 @@ def create_records(self): for term in terms: # normalize for case sensitivity - term = self.normalize(term) + norm_term = self.normalize(term) # construct and add record if term: record = Record( entity=json, term=term, + norm_term=norm_term, is_alias=is_alias, vector=empty, ) @@ -147,9 +149,13 @@ def create_records(self): # def get(self, key: str) -> MatchList: - where = f'term = "{self.normalize(key)}"' + where = f'term = "{key}"' match_list = self.run_query(key, where=where) + if not match_list: + where = f'norm_term = "{self.normalize(key)}"' + match_list = self.run_query(key, where=where) + if not match_list: if self.search_flag.is_fuzz_ok: match_list = self.get_by_fuzz(key) @@ -184,7 +190,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: if vector is not None and self.search_flag.is_semantic_ok: qb = qb.metric("cosine") - qb = qb.select(["entity", "term", "is_alias"]) + qb = qb.select(["entity", "term", "norm_term", "is_alias"]) if where is not None: qb = qb.where(where, prefilter=True) @@ -204,7 +210,10 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: score = 100.0 # Exact match record = Record.model_validate(item) - match_list.append(record.to_match(key=key, score=score)) + match = record.to_match( + key=key, score=score, entity_type=self.entity_type + ) + match_list.append(match) return match_list @@ -216,8 +225,10 @@ def OnDisk( case_sensitive: bool = False, device: str = None, encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, examples: list = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", + input_type=str, limit: PositiveInt = 10, min_similarity: float = 80.0, notfound_mode: 
const.NotFoundMode = "raise", @@ -230,6 +241,7 @@ def OnDisk( source, case_sensitive=case_sensitive, device=device, + entity_type=entity_type, fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, @@ -240,9 +252,9 @@ def OnDisk( return abstract.AbstractType( storage, - EntityType=NamedEntity, + EntityType=entity_type, examples=examples, - input_type=str, + input_type=input_type, notfound_mode=notfound_mode, validator_mode=validator_mode, ) diff --git a/tests/test_language.py b/tests/test_language.py new file mode 100644 index 0000000..38a31f2 --- /dev/null +++ b/tests/test_language.py @@ -0,0 +1,60 @@ +from pydantic import BaseModel + +from fuzztypes import Language, LanguageCode, LanguageName +from fuzztypes.language import load_languages, LanguageType, LanguageScope + + +def test_load_languages(): + source = load_languages() + entities = source() + assert len(entities) == 7910 + assert entities[0].resolve() == 'Ghotuo' + + +def test_language_model_resolution(): + class Model(BaseModel): + language: Language + language_code: LanguageCode + language_name: LanguageName + + # Test that Language resolves to the complete language object + model = Model(language="English", language_code="en", language_name="ENG") + assert model.language.scope == LanguageScope.INDIVIDUAL + assert model.language.type == LanguageType.LIVING + assert model.model_dump(exclude_defaults=True, mode="json") == { + "language": { + "aliases": ["en", "eng"], + "alpha_2": "en", + "alpha_3": "eng", + "scope": "I", + "type": "L", + "value": "English", + }, + "language_code": "en", + "language_name": "English", + } + + +def test_matching_edge_cases(): + # 'En' is a proper name of a language + assert LanguageName("En") == "En" + assert LanguageCode("En") == "enc" + + # 'en' is the alpha2 code for English + assert LanguageName("en") == "English" + assert LanguageCode("en") == "en" + + # Bangla is common name for Bengali + assert LanguageName("Bangla") == "Bengali" + assert 
LanguageCode("Bangla") == "bn" + assert Language("Bangla").model_dump( + exclude_defaults=True, mode="json" + ) == { + "aliases": ["bn", "ben", "Bangla"], + "alpha_2": "bn", + "alpha_3": "ben", + "common_name": "Bangla", + "scope": "I", + "type": "L", + "value": "Bengali", + } From c9c36649094aef5add33f030818b6522f5d744f8 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Tue, 19 Mar 2024 16:55:19 -0400 Subject: [PATCH 03/15] Removed notebooks. --- notebooks/00_readme_examples.ipynb | 200 ----------------------------- 1 file changed, 200 deletions(-) delete mode 100644 notebooks/00_readme_examples.ipynb diff --git a/notebooks/00_readme_examples.ipynb b/notebooks/00_readme_examples.ipynb deleted file mode 100644 index babf61d..0000000 --- a/notebooks/00_readme_examples.ipynb +++ /dev/null @@ -1,200 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## Readme Code Examples\n", - "These are the code examples from the Readme document." - ], - "metadata": { - "collapsed": false - }, - "id": "3542b4dbc383efb7" - }, - { - "cell_type": "code", - "execution_count": 14, - "outputs": [], - "source": [ - "from pydantic import BaseModel\n", - "\n", - "class Normal(BaseModel):\n", - " boolean: bool\n", - " float: float\n", - " integer: int\n", - " \n", - "obj = Normal(\n", - " boolean='yes',\n", - " float='2',\n", - " integer='3',\n", - ")\n", - "assert obj.boolean is True\n", - "assert obj.float == 2.0\n", - "assert obj.integer == 3" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-19T16:35:29.171400Z", - "start_time": "2024-03-19T16:35:29.169287Z" - } - }, - "id": "55243aba75d3e44d" - }, - { - "cell_type": "code", - "execution_count": 15, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"ascii\": \"anthropos\",\n", - " \"email\": \"jdoe@example.com\",\n", - " \"emoji\": \"💭\",\n", - " \"handle\": \"@imaurer\",\n", - " \"integer\": 55,\n", - " \"inventor\": \"Ada Lovelace\",\n", - " 
\"person\": {\n", - " \"name_format\": \"{title} {first} {middle} {last} {suffix} ({nickname})\",\n", - " \"init_format\": \"{first} {middle} {last}\",\n", - " \"title\": \"Mr.\",\n", - " \"first\": \"Arthur\",\n", - " \"middle\": \"Herbert\",\n", - " \"last\": \"Fonzarelli\",\n", - " \"suffix\": \"\",\n", - " \"nickname\": \"fonzie\"\n", - " },\n", - " \"time\": \"2025-01-01T05:00:00\",\n", - " \"zipcode\": \"12345-6789\"\n", - "}\n" - ] - } - ], - "source": [ - "from pydantic import BaseModel\n", - "from fuzztypes import (\n", - " ASCII,\n", - " Datetime,\n", - " Email,\n", - " Fuzzmoji,\n", - " InMemory,\n", - " Integer,\n", - " Person,\n", - " Regex,\n", - " ZipCode,\n", - " flags,\n", - ")\n", - "\n", - "# define a source, see EntitySource for using TSV, CSV, JSONL\n", - "inventors = [\"Ada Lovelace\", \"Alan Turing\", \"Claude Shannon\"]\n", - "\n", - "# define a named entity type in memory. use OnDisk for larger data sets.\n", - "Inventor = InMemory(inventors, search_flag=flags.FuzzSearch)\n", - "\n", - "# custom Regex type for finding twitter handles.\n", - "Handle = Regex(r'@\\w{1,15}', examples=[\"@genomoncology\"])\n", - "\n", - "# define a Pydantic class with 9 fuzzy type attriubutes\n", - "class Fuzzy(BaseModel):\n", - " ascii: ASCII\n", - " email: Email\n", - " emoji: Fuzzmoji\n", - " handle: Handle\n", - " integer: Integer\n", - " inventor: Inventor\n", - " person: Person\n", - " time: Datetime\n", - " zipcode: ZipCode\n", - "\n", - "# create an instance of class Fuzzy\n", - "obj = Fuzzy(\n", - " ascii=\"άνθρωπος\",\n", - " email=\"John Doe \",\n", - " emoji='thought bubble',\n", - " handle='Ian Maurer (@imaurer)',\n", - " integer='fifty-five',\n", - " inventor='ada luvlace',\n", - " person='mr. 
arthur herbert fonzarelli (fonzie)',\n", - " time='5am on Jan 1, 2025',\n", - " zipcode=\"(Zipcode: 12345-6789)\",\n", - ")\n", - "\n", - "# test the autocorrecting performed\n", - "\n", - "# greek for man: https://en.wiktionary.org/wiki/άνθρωπος\n", - "assert obj.ascii == \"anthropos\"\n", - "\n", - "# extract email via regular expression\n", - "assert obj.email == \"jdoe@example.com\"\n", - "\n", - "# fuzzy match \"thought bubble\" to \"thought balloon\" emoji\n", - "assert obj.emoji == \"💭\"\n", - "\n", - "# simple, inline regex example (see above Handle type)\n", - "assert obj.handle == \"@imaurer\"\n", - "\n", - "# convert integer word phrase to integer value\n", - "assert obj.integer == 55\n", - "\n", - "# case-insensitive fuzzy match on lowercase, misspelled name\n", - "assert obj.inventor == \"Ada Lovelace\"\n", - "\n", - "# human name parser (title, first, middle, last, suffix, nickname)\n", - "assert str(obj.person) == 'Mr. Arthur Herbert Fonzarelli (fonzie)'\n", - "assert obj.person.short_name == \"Arthur Fonzarelli\"\n", - "assert obj.person.nickname == \"fonzie\"\n", - "assert obj.person.last == \"Fonzarelli\"\n", - "\n", - "# convert time phrase to datetime object\n", - "assert obj.time.isoformat() == \"2025-01-01T05:00:00\"\n", - "\n", - "# extract zip5 or zip9 formats using regular expressions\n", - "assert obj.zipcode == \"12345-6789\"\n", - "\n", - "# print JSON on success\n", - "print(obj.model_dump_json(indent=4))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-19T16:35:51.056350Z", - "start_time": "2024-03-19T16:35:50.994941Z" - } - }, - "id": "6c30a7cafa50364e" - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "b3b7c2fac600ccd8" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - 
"version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From a3dacf3bc0946a102a98675a8be4fe0d2a9718cf Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Wed, 20 Mar 2024 19:40:31 -0400 Subject: [PATCH 04/15] mypy cleanup. --- .gitignore | 3 +- pyproject.toml | 5 +- requirements-dev.txt | 5 ++ src/fuzztypes/__init__.py | 4 +- src/fuzztypes/abstract.py | 89 +++++++++++++++----------- src/fuzztypes/ascii.py | 5 +- src/fuzztypes/date.py | 10 +-- src/fuzztypes/entity.py | 43 ++++++++----- src/fuzztypes/function.py | 17 +++-- src/fuzztypes/in_memory.py | 49 +++++++------- src/fuzztypes/integer.py | 5 +- src/fuzztypes/language.py | 16 +++-- src/fuzztypes/lazy.py | 10 ++- src/fuzztypes/match.py | 23 +++---- src/fuzztypes/on_disk.py | 49 +++++++------- src/fuzztypes/person.py | 14 ++-- src/fuzztypes/regex.py | 11 ++-- src/fuzztypes/utils/download.py | 6 +- tests/in_memory/test_in_memory_fuzz.py | 15 ++++- tests/test_ascii.py | 13 +++- tests/test_integer.py | 7 +- tests/test_language.py | 2 +- tests/test_person.py | 19 ++---- 23 files changed, 239 insertions(+), 181 deletions(-) diff --git a/.gitignore b/.gitignore index c4f6c53..5da7fbb 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,5 @@ wheels/ model_cache/ .DS_Store /training/ -profile.dat \ No newline at end of file +profile.dat +notebooks \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8aee98d..e017fdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ local = [ "build", "jupyter", "ipython", + "mypy", "pip", "setuptools", "twine", @@ -59,8 +60,8 @@ ext = [ [tool.hatch.version] path = "src/fuzztypes/__init__.py" -[tool.mypy] -strict = true +[mypy] +no-untyped-def = false [[tool.mypy.overrides]] module = "gpt.tests.*" diff --git a/requirements-dev.txt b/requirements-dev.txt index 0f478cb..2c481d7 
100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -217,6 +217,9 @@ more-itertools==10.2.0 # via jaraco-classes mpmath==1.3.0 # via sympy +mypy==1.9.0 +mypy-extensions==1.0.0 + # via mypy nameparser==1.1.3 nbclient==0.10.0 # via nbconvert @@ -436,6 +439,7 @@ tomli==2.0.1 # build # coverage # jupyterlab + # mypy # pyproject-hooks # pytest torch==2.2.1 @@ -482,6 +486,7 @@ typing-extensions==4.9.0 # async-lru # huggingface-hub # ipython + # mypy # pydantic # pydantic-core # torch diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index ec54199..be21aa5 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -16,7 +16,7 @@ # Schema from .entity import Entity, NamedEntity, EntitySource -from .match import Match, MatchList, Record +from .match import Match, MatchResult, Record # Hidden Abstract Types from . import abstract @@ -54,7 +54,7 @@ "LanguageCode", "LanguageName", "Match", - "MatchList", + "MatchResult", "NamedEntity", "OnDisk", "Person", diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index a1953ab..a23203c 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -1,5 +1,16 @@ from datetime import date, datetime -from typing import Any, Callable, Type, Union, Optional, Iterable, List +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + List, + Optional, + Type, + TypeVar, + Union, +) from pydantic import ( BaseModel, @@ -9,21 +20,43 @@ ) from pydantic_core import CoreSchema, PydanticCustomError, core_schema -from fuzztypes import NamedEntity, Entity, MatchList, const, flags, lazy +from fuzztypes import NamedEntity, Entity, MatchResult, const, flags, lazy + +T = TypeVar("T") + +SupportedType = Union[ + str, float, int, dict, list, date, datetime, BaseModel, T +] + -SupportedType = Union[str, float, int, dict, list, date, datetime, BaseModel] +class _AbstractTypeMeta(type, Generic[T]): + def __getitem__(cls: Type[T], key: Any) -> Entity[T]: + """ + Get the 
entity associated with the given key using dictionary-like + access. + + This method allows retrieving the entity using dictionary-like + syntax (e.g., AbstractType[key]). + + If entity found, it is returned. + If entity not found, raise a KeyError based on PydanticCustomError. + """ + try: + return cls.lookup(key) # type: ignore + except PydanticCustomError as err: + raise KeyError(f"Key Error: {key} [{err}]") from err def AbstractType( - lookup_function: Callable[[str], MatchList], + lookup_function: Callable[[T], MatchResult], *, - EntityType: Type = Entity, - examples: list = None, + EntityType: Type[Entity] = Entity, + examples: Optional[list] = None, input_type: Type[SupportedType] = str, notfound_mode: const.NotFoundMode = "raise", - output_type: Type[SupportedType] = None, + output_type: Optional[Type[T]] = None, validator_mode: const.ValidatorMode = "before", -): +) -> _AbstractTypeMeta: """ Factory function to create a specialized AbstractType, which is a Pydantic based type with added fuzzy matching capabilities. @@ -39,10 +72,8 @@ def AbstractType( :return: A specialized AbstractType based on the provided specifications. """ - output_type = output_type or input_type - # noinspection PyClassHasNoInit - class _AbstractType(output_type): + class _AbstractType(metaclass=_AbstractTypeMeta): @classmethod def __get_pydantic_core_schema__( cls, @@ -55,7 +86,7 @@ def __get_pydantic_core_schema__( This method is used internally by Pydantic to generate the schema based on the provided validation mode and input/output types. 
""" - validation_function_map = { + validation_function_map: Dict[str, Callable] = { "before": core_schema.with_info_before_validator_function, "after": core_schema.with_info_before_validator_function, "plain": core_schema.with_info_plain_validator_function, @@ -92,7 +123,7 @@ def __get_pydantic_json_schema__( schema["examples"] = examples return schema - def __new__(cls, key: str, _: Any = None) -> Optional[Any]: + def __new__(cls, key: T, _: Any = None) -> Optional[T]: # type: ignore """ Doesn't create an AbstractType, it's actually a class-level __call__ function. @@ -106,28 +137,10 @@ def __new__(cls, key: str, _: Any = None) -> Optional[Any]: If an exception is raised, it is will not be caught. """ entity = cls.lookup(key) - if entity: - return entity.resolve() - - @classmethod - def __class_getitem__(cls, key) -> EntityType: - """ - Get the entity associated with the given key using dictionary-like - access. - - This method allows retrieving the entity using dictionary-like - syntax (e.g., AbstractType[key]). - - If entity found, it is returned. - If entity not found, raise a KeyError based on PydanticCustomError. - """ - try: - return cls.lookup(key) - except PydanticCustomError as err: - raise KeyError(f"Key Error: {key} [{err}]") from err + return entity.resolve() if entity else None @classmethod - def lookup(cls, key: str) -> Optional[EntityType]: + def lookup(cls, key: T) -> Optional[Entity[T]]: """ Lookup the entity for the given key. 
@@ -141,7 +154,7 @@ def lookup(cls, key: str) -> Optional[EntityType]: "allow": returns an entity with the key as value "raise": raises a PydanticCustomError """ - match_list: MatchList = lookup_function(key) + match_list: MatchResult = lookup_function(key) if match_list.success: return match_list.entity @@ -150,10 +163,10 @@ def lookup(cls, key: str) -> Optional[EntityType]: return EntityType(value=key) if notfound_mode == "none": - return + return None msg = "key ({key}) could not be resolved" - ctx = dict(key=key) + ctx: Dict[str, Any] = dict(key=key) if match_list: ctx["near"] = [str(m) for m in match_list] msg += f", closest non-matches = {match_list}" @@ -196,7 +209,7 @@ def __init__( self._encoder = encoder self._vect_dimensions = None - def __call__(self, key: str) -> MatchList: + def __call__(self, key: str) -> MatchResult: if not self.prepped: self.prepped = True self.prepare() @@ -208,7 +221,7 @@ def __call__(self, key: str) -> MatchList: def prepare(self): raise NotImplementedError - def get(self, key: str) -> MatchList: + def get(self, key: str) -> MatchResult: raise NotImplementedError def normalize(self, key: str): diff --git a/src/fuzztypes/ascii.py b/src/fuzztypes/ascii.py index 32c710e..826e546 100644 --- a/src/fuzztypes/ascii.py +++ b/src/fuzztypes/ascii.py @@ -32,4 +32,7 @@ def to_ascii(key: str) -> str: return f(key) -ASCII = Function(to_ascii) +ASCII = Function( + to_ascii, + output_type=str, +) diff --git a/src/fuzztypes/date.py b/src/fuzztypes/date.py index e014b0c..80fff5e 100644 --- a/src/fuzztypes/date.py +++ b/src/fuzztypes/date.py @@ -1,13 +1,13 @@ import datetime from typing import Optional, Union, Type -from . import Entity, MatchList, abstract, const, lazy +from . 
import Entity, MatchResult, abstract, const, lazy date_or_datetime = Union[datetime.date, datetime.datetime] def DateType( - date_order: const.DateOrder = None, + date_order: Optional[const.DateOrder] = None, examples: Optional[list] = None, languages: Optional[list[str]] = None, notfound_mode: const.NotFoundMode = "raise", @@ -35,8 +35,8 @@ def DateType( parser = DateDataParser(languages=languages, settings=settings) - def parse(key: str) -> MatchList: - match_list = MatchList() + def parse(key: str) -> MatchResult: + match_list = MatchResult() value = parser.get_date_data(key).date_obj if value is not None: if input_type is datetime.date: @@ -55,7 +55,7 @@ def parse(key: str) -> MatchList: def DatetimeType( - date_order: const.DateOrder = None, + date_order: Optional[const.DateOrder] = None, examples: Optional[list] = None, languages: Optional[list[str]] = None, notfound_mode: const.NotFoundMode = "raise", diff --git a/src/fuzztypes/entity.py b/src/fuzztypes/entity.py index 699401a..9f5c8b2 100644 --- a/src/fuzztypes/entity.py +++ b/src/fuzztypes/entity.py @@ -1,13 +1,25 @@ import csv import json from pathlib import Path -from typing import List, Union, Type, Any, Optional, Tuple, Callable +from typing import ( + List, + Union, + Type, + Any, + Optional, + Tuple, + Callable, + Generic, + TypeVar, +) from pydantic import BaseModel, Field, TypeAdapter +T = TypeVar("T") -class Entity(BaseModel): - value: Any = Field( + +class Entity(BaseModel, Generic[T]): + value: T = Field( ..., description="Value stored by Entity.", ) @@ -16,11 +28,11 @@ class Entity(BaseModel): description="Entity concept type such as PERSON, ORG, or GPE.", ) meta: Optional[dict] = Field( - None, + default=None, description="Additional attributes accessible through dot-notation.", ) priority: Optional[int] = Field( - None, + default=None, description="Tiebreaker rank (higher wins, None=0, negative allowed)", ) @@ -28,7 +40,7 @@ def __eq__(self, other: Any): other = getattr(other, "value", other) 
return self.value == other - def resolve(self): + def resolve(self) -> T: return self.value @property @@ -54,10 +66,7 @@ def __setattr__(self, key: str, value: Any): if key in self.model_fields: super().__setattr__(key, value) else: - # Initialize meta if it's None - if self.__dict__.get("meta") is None: - super().__setattr__("meta", {}) - # Add or update the attribute in the meta dictionary + self.meta = self.meta or {} self.meta[key] = value @@ -77,16 +86,18 @@ def convert(cls, item: Union[str, dict, list, tuple, "NamedEntity"]): if isinstance(item, cls): return item + data = {} if item and isinstance(item, (list, tuple)): value, aliases = item[0], item[1:] if len(aliases) == 1 and isinstance(aliases[0], (tuple, list)): aliases = aliases[0] - item = dict(value=value, aliases=aliases) - - elif isinstance(item, str): - item = dict(value=item) + data = dict(value=value, aliases=aliases) + elif isinstance(item, dict): + data = item + else: + data = dict(value=item) - return cls(**item) + return cls(**data) NamedEntityAdapter = TypeAdapter(NamedEntity) @@ -107,7 +118,7 @@ def __len__(self): def __getitem__( self, key: Union[int, slice, str] - ) -> Union[NamedEntity, "EntitySource"]: + ) -> Union[NamedEntity, list[NamedEntity], "EntitySource"]: if isinstance(key, str): # return another shell, let loading occur on demand. return EntitySource(source=(self, key)) diff --git a/src/fuzztypes/function.py b/src/fuzztypes/function.py index 783bfbd..54c730a 100644 --- a/src/fuzztypes/function.py +++ b/src/fuzztypes/function.py @@ -1,18 +1,22 @@ -from typing import Callable, Type +from typing import Callable, Type, Optional, TypeVar -from . import Entity, MatchList, const, abstract +from . 
import Entity, MatchResult, const, abstract + + +T = TypeVar("T", bound=abstract.SupportedType) def Function( - source: Callable[[abstract.SupportedType], abstract.SupportedType], - examples: list = None, + source: Callable[[T], abstract.SupportedType], + examples: Optional[list] = None, notfound_mode: const.NotFoundMode = "raise", input_type: Type[abstract.SupportedType] = str, + output_type: Optional[Type[abstract.SupportedType]] = None, validator_mode: const.ValidatorMode = "before", ): - def do_lookup(key: str) -> MatchList: + def do_lookup(key: T) -> MatchResult: value = source(key) - match_list = MatchList() + match_list = MatchResult() if value is not None: entity = Entity(value=value) match_list.set(key=key, entity=entity) @@ -22,6 +26,7 @@ def do_lookup(key: str) -> MatchList: do_lookup, examples=examples, input_type=input_type, + output_type=output_type, notfound_mode=notfound_mode, validator_mode=validator_mode, ) diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index 4253c43..71bb44b 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -1,11 +1,11 @@ from collections import defaultdict -from typing import Callable, Iterable, Union, List, Dict, Type +from typing import Callable, Iterable, Union, List, Dict, Type, Optional from pydantic import PositiveInt from fuzztypes import ( Match, - MatchList, + MatchResult, NamedEntity, Record, abstract, @@ -19,10 +19,10 @@ class InMemoryStorage(abstract.AbstractStorage): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._mapping: Dict[str, List[Record]] = defaultdict(list) - self._terms: list[str] = [] - self._is_alias: list[bool] = [] - self._entities: list[NamedEntity] = [] + self._mapping = defaultdict(list) + self._terms = [] + self._is_alias = [] + self._entities = [] self._embeddings = None # @@ -76,27 +76,28 @@ def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: # Getters # - def get(self, key: str) -> MatchList: + def get(self, 
key: str) -> MatchResult: records = self._mapping.get(self.normalize(key), []) match_list = Record.from_list( records, key=key, entity_type=self.entity_type ) - if not match_list: + results = MatchResult(matches=match_list) + + if not results: if self.search_flag.is_fuzz_ok: - match_list = self.get_by_fuzz(key) + results = self.get_by_fuzz(key) if self.search_flag.is_semantic_ok: - match_list = self.get_by_semantic(key) + results = self.get_by_semantic(key) - matches = MatchList(matches=match_list) - return matches + return results # # Fuzzy Matching # - def get_by_fuzz(self, term) -> MatchList: + def get_by_fuzz(self, term) -> MatchResult: query = self.fuzz_clean(term) matches = self.fuzz_match(query) return matches @@ -104,7 +105,7 @@ def get_by_fuzz(self, term) -> MatchList: def fuzz_match( self, query: str, - ) -> MatchList: + ) -> MatchResult: # https://rapidfuzz.github.io/RapidFuzz/Usage/process.html#extract extract = self.rapidfuzz.process.extract( query=query, @@ -113,24 +114,24 @@ def fuzz_match( limit=self.limit, ) - match_list = MatchList() + results = MatchResult() for key, score, index in extract: entity = self._entities[index] is_alias = self._is_alias[index] m = Match(key=key, entity=entity, is_alias=is_alias, score=score) - match_list.append(m) - return match_list + results.append(m) + return results # # Vector Similarity Search # - def get_by_semantic(self, key) -> List[Match]: + def get_by_semantic(self, key) -> MatchResult: # find closest match using knn indices, scores = self.find_knn(key) - # create a MatchList from the results - matches = [] + # create a MatchResult from the results + results = MatchResult() for index, score in zip(indices, scores): entity = self._entities[index] term = self._terms[index] @@ -142,9 +143,9 @@ def get_by_semantic(self, key) -> List[Match]: is_alias=is_alias, term=term, ) - matches.append(match) + results.append(match) - return matches + return results @property def embeddings(self): @@ -181,12 +182,12 @@ def 
find_knn(self, key: str) -> tuple: def InMemory( - source: Iterable, + source: Iterable[NamedEntity], *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, entity_type: Type[NamedEntity] = NamedEntity, - examples: list = None, + examples: Optional[list] = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", limit: PositiveInt = 10, min_similarity: float = 80.0, diff --git a/src/fuzztypes/integer.py b/src/fuzztypes/integer.py index 2f070ea..85084d5 100644 --- a/src/fuzztypes/integer.py +++ b/src/fuzztypes/integer.py @@ -23,4 +23,7 @@ def to_int(key: Union[int, str]) -> int: return val -Integer = Function(to_int, input_type=int) +Integer = Function( + to_int, + output_type=int, +) diff --git a/src/fuzztypes/language.py b/src/fuzztypes/language.py index d3a52e9..ac10dea 100644 --- a/src/fuzztypes/language.py +++ b/src/fuzztypes/language.py @@ -52,15 +52,17 @@ def resolve(self): return self.code -remote = ( - "https://salsa.debian.org/iso-codes-team/iso-codes/-/raw/main/data" - "/iso_639-3.json" -) -local = utils.get_file(remote) +LanguageNamedEntityType = Type[LanguageNamedEntity] -def load_languages(cls: Type[LanguageNamedEntity] = LanguageNamedEntity): +def load_languages( + entity_cls: Type[LanguageNamedEntity] = LanguageNamedEntity, +): def do_load() -> Iterable[NamedEntity]: + repo = "https://salsa.debian.org/iso-codes-team/iso-codes/" + remote = f"{repo}-/raw/main/data/iso_639-3.json" + local = utils.get_file(remote) + assert local, f"Could not download: {remote}" data = json.load(open(local))["639-3"] alias_fields = { "alpha_2", @@ -75,7 +77,7 @@ def do_load() -> Iterable[NamedEntity]: aliases = [v for k, v in item.items() if k in alias_fields] item["aliases"] = aliases entities.append(item) - return TypeAdapter(List[cls]).validate_python(data) + return TypeAdapter(List[LanguageNamedEntity]).validate_python(data) return do_load diff --git a/src/fuzztypes/lazy.py b/src/fuzztypes/lazy.py index 8743db9..93542b6 100644 --- 
a/src/fuzztypes/lazy.py +++ b/src/fuzztypes/lazy.py @@ -1,7 +1,7 @@ import functools import importlib import os -from typing import Any, List, TypedDict, Callable +from typing import Any, List, TypedDict, Callable, Optional, Union from fuzztypes import const @@ -9,7 +9,7 @@ @functools.lru_cache(maxsize=None) def lazy_import( library_name: str, - attr_name: str = None, + attr_name: Optional[str] = None, return_none_on_error: bool = False, ) -> Any: """ @@ -46,10 +46,8 @@ def lazy_import( except ImportError as e: version_info = f"(version {version})" if version else "" install = f"`pip install {install_name}{version_info}`" - details = list(filter(None, [purpose, url, license_type])) - if details: - details = ", ".join(details) - details = f" ({details})" + details = ", ".join(list(filter(None, [purpose, url, license_type]))) + details = f" ({details})" if details else "" msg = f"Import Failed: {install}{details}" if not info: diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index abfefdc..7f6a0d1 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional, Iterator, Any, Union, Type +from typing import List, Tuple, Optional, Iterator, Any, Union, Type, Generator from pydantic import BaseModel, Field @@ -30,7 +30,7 @@ def __str__(self): return f"{self.entity.value} [{self.score:.1f}]" -class MatchList(BaseModel): +class MatchResult(BaseModel): matches: List[Match] = Field(default_factory=list) choice: Optional[Match] = None @@ -40,9 +40,6 @@ def __bool__(self): def __len__(self): return len(self.matches) - def __iter__(self) -> Iterator[Match]: - return iter(self.matches) - def __getitem__(self, item): return self.matches[item] @@ -59,10 +56,10 @@ def entity(self): def set( self, - key: str, + key: Any, entity: Entity, is_alias: bool = False, - term: str = None, + term: Optional[str] = None, ): """If match is a known winner, just set it and forget it.""" match = Match(key=key, entity=entity, 
is_alias=is_alias, term=term) @@ -103,10 +100,6 @@ class Record(BaseModel): is_alias: bool vector: Any = None - def deserialize(self, entity_type: Type[NamedEntity]): - if isinstance(self.entity, str): - self.entity = entity_type.model_validate_json(self.entity) - @classmethod def from_list( cls, @@ -123,10 +116,14 @@ def to_match( score: float = 100.0, entity_type: Type[NamedEntity] = NamedEntity, ) -> Match: - self.deserialize(entity_type) + if isinstance(self.entity, str): + match_entity = entity_type.model_validate_json(self.entity) + else: + match_entity = self.entity + return Match( key=key, - entity=self.entity, + entity=match_entity, is_alias=self.is_alias, score=score, term=self.term, diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index cdfa83b..3bb820e 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -1,10 +1,10 @@ -from typing import Callable, Iterable, Union, List, Type +from typing import Callable, Iterable, Union, List, Type, Optional, Any from pydantic import PositiveInt from fuzztypes import ( Match, - MatchList, + MatchResult, NamedEntity, Record, abstract, @@ -26,14 +26,23 @@ def __init__( super().__init__(source, **kwargs) self.name = name - self.conn = None - self.table = None + self._conn = None + self._table = None + + @property + def conn(self) -> Any: + if self._conn is None: + lancedb = lazy.lazy_import("lancedb") + self._conn = lancedb.connect(const.OnDiskPath) + return self._conn + + @property + def table(self) -> Any: + if self._table is None: + self._table = self.conn.open_table(self.name) + return self._table def prepare(self, force_drop_table: bool = False): - lancedb = lazy.lazy_import("lancedb") - - self.conn = lancedb.connect(const.OnDiskPath) - table_names = set(self.conn.table_names(limit=999_999_999)) if force_drop_table and self.name in table_names: @@ -49,8 +58,6 @@ def prepare(self, force_drop_table: bool = False): self.conn.drop_table(self.name) raise e - self.table = 
self.conn.open_table(self.name) - def create_table(self): pa = lazy.lazy_import("pyarrow") @@ -66,9 +73,7 @@ def create_table(self): ), ] ) - self.table = self.conn.create_table( - self.name, schema=schema, exist_ok=True - ) + table = self.conn.create_table(self.name, schema=schema, exist_ok=True) # create records from source records = self.create_records() @@ -81,7 +86,7 @@ def create_table(self): record.vector = vector # add records in a batch to table - self.table.add([record.model_dump() for record in records]) + table.add([record.model_dump() for record in records]) # adjust num_partitions and num_sub_vectors based on dataset size num_records = len(records) @@ -89,7 +94,7 @@ def create_table(self): should_index = num_records > 256 and self.search_flag.is_semantic_ok if self.search_flag.is_fuzz_ok: # pragma: no cover - self.table.create_fts_index("term") + table.create_fts_index("term") if should_index: # pragma: no cover num_partitions = min(num_records, 256) @@ -97,7 +102,7 @@ def create_table(self): index_cache_size = min(num_records, 256) accelerator = self.device if self.device in accelerators else None - self.table.create_index( + table.create_index( metric="cosine", num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, @@ -148,7 +153,7 @@ def create_records(self): # Getters # - def get(self, key: str) -> MatchList: + def get(self, key: str) -> MatchResult: where = f'term = "{key}"' match_list = self.run_query(key, where=where) @@ -163,7 +168,7 @@ def get(self, key: str) -> MatchList: if self.search_flag.is_semantic_ok: match_list = self.get_by_semantic(key) - matches = MatchList(matches=match_list) + matches = MatchResult(matches=match_list) return matches def get_by_fuzz(self, key: str) -> List[Match]: @@ -220,13 +225,13 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: def OnDisk( identity: str, - source: Iterable, + source: Iterable[NamedEntity], *, case_sensitive: bool = False, - device: str = None, + device: 
Optional[const.DeviceList] = None, encoder: Union[Callable, str, object] = None, entity_type: Type[NamedEntity] = NamedEntity, - examples: list = None, + examples: Optional[list] = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", input_type=str, limit: PositiveInt = 10, @@ -235,7 +240,7 @@ def OnDisk( search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", validator_mode: const.ValidatorMode = "before", -) -> abstract.AbstractType: +): storage = OnDiskStorage( identity, source, diff --git a/src/fuzztypes/person.py b/src/fuzztypes/person.py index 98355a5..af8ccc9 100644 --- a/src/fuzztypes/person.py +++ b/src/fuzztypes/person.py @@ -1,7 +1,7 @@ -from typing import Type, Union +from typing import Type, Union, Optional from pydantic import BaseModel -from fuzztypes import Entity, MatchList, abstract, const, lazy +from fuzztypes import Entity, MatchResult, abstract, const, lazy FULL_NAME = "{title} {first} {middle} {last} {suffix} ({nickname})" SHORT_NAME = "{first} {last}" @@ -87,11 +87,11 @@ def PersonModelType( name_format: str = FULL_NAME, init_format: str = FULL_INIT, capitalize: bool = True, - examples: list = None, + examples: Optional[list] = None, notfound_mode: const.NotFoundMode = "raise", validator_mode: const.ValidatorMode = "before", -) -> Type[PersonModel]: - def do_lookup(key: Union[str, PersonModel]) -> MatchList: +): + def do_lookup(key: Union[str, PersonModel]) -> MatchResult: if isinstance(key, str): human_name = parse(full_name=key) if capitalize: @@ -109,7 +109,7 @@ def do_lookup(key: Union[str, PersonModel]) -> MatchList: else: raise ValueError(f"Unexpected key type {type(key)} for {key}.") - match_list = MatchList() + match_list = MatchResult() entity = Entity(value=value) match_list.set(key=key, entity=entity) return match_list @@ -119,7 +119,7 @@ def do_lookup(key: Union[str, PersonModel]) -> MatchList: examples=examples, input_type=PersonModel, notfound_mode=notfound_mode, - 
output_type=str, + output_type=PersonModel, validator_mode=validator_mode, ) diff --git a/src/fuzztypes/regex.py b/src/fuzztypes/regex.py index d284ec8..f979ec8 100644 --- a/src/fuzztypes/regex.py +++ b/src/fuzztypes/regex.py @@ -1,27 +1,28 @@ import re +from typing import Optional -from . import Entity, Match, MatchList, abstract, const +from . import Entity, Match, MatchResult, abstract, const def Regex( pattern: str, - examples: list = None, + examples: Optional[list] = None, notfound_mode: const.NotFoundMode = "raise", validator_mode: const.ValidatorMode = "before", tiebreaker_mode: const.TiebreakerMode = "raise", ): regex = re.compile(pattern) - def do_lookup(key: str) -> MatchList: + def do_lookup(key: str) -> MatchResult: matches = regex.findall(key) - match_list = MatchList() + match_list = MatchResult() for match in matches: # Create and append Entity for each match found entity = Entity(value=match) match_list.append(Match(key=match, entity=entity, is_alias=False)) - # Leave tiebreaker and error handling to MatchList.choose + # Leave tiebreaker and error handling to MatchResult.choose match_list.choose(min_score=0, tiebreaker_mode=tiebreaker_mode) return match_list diff --git a/src/fuzztypes/utils/download.py b/src/fuzztypes/utils/download.py index 6727946..503d635 100644 --- a/src/fuzztypes/utils/download.py +++ b/src/fuzztypes/utils/download.py @@ -36,10 +36,10 @@ def get_file(url: str, expires_in_days: int = 30) -> Optional[str]: os.replace(temp_download_path, cache_file_path) cache_ok = os.path.exists(cache_file_path) - if cache_ok: - return cache_file_path + if not cache_ok: + logger.error(f"Unable to download the file and no cached file: {url}") - logger.error(f"Unable to download the file and no cached file: {url}") + return cache_file_path if cache_ok else None def download_file(url, download_path): diff --git a/tests/in_memory/test_in_memory_fuzz.py b/tests/in_memory/test_in_memory_fuzz.py index b6bdb93..e2eb89d 100644 --- 
a/tests/in_memory/test_in_memory_fuzz.py +++ b/tests/in_memory/test_in_memory_fuzz.py @@ -79,11 +79,20 @@ def test_min_score(): except ValidationError as e: assert e.errors(include_url=False) == [ { - "ctx": {"key": "B K L", "near": ["A B C [40.0]"]}, + "ctx": { + "key": "B K L", + "near": [ + "('matches', [Match(key='a b c', entity=NamedEntity(" + "value='A B C', label=None, meta=None, " + "priority=None, aliases=[]), is_alias=False, " + "score=40.0, term=None)])", + "('choice', None)", + ], + }, "input": "B K L", "loc": ("strict",), - "msg": "key (B K L) could not be resolved, " - "closest non-matches = A B C [40.0]", + "msg": "key (B K L) could not be resolved, closest " + "non-matches = A B C [40.0]", "type": "key_not_found", } ] diff --git a/tests/test_ascii.py b/tests/test_ascii.py index 4f48af5..30e86f0 100644 --- a/tests/test_ascii.py +++ b/tests/test_ascii.py @@ -1,12 +1,19 @@ +# -*- coding: utf-8 -*- + from pydantic import BaseModel -from fuzztypes import ASCII +from fuzztypes import ASCII, Entity, NamedEntity -class MyModel(BaseModel): - ascii: ASCII +def test_ascii_usable_type(): + assert isinstance(ASCII("άνθρωποι"), str) + assert isinstance(ASCII["άνθρωποι"], Entity) + assert not isinstance(ASCII["άνθρωποι"], NamedEntity) def test_transliterate_utf8_to_ascii(): + class MyModel(BaseModel): + ascii: ASCII + obj = MyModel(ascii="άνθρωποι") assert obj.ascii == "anthropoi" diff --git a/tests/test_integer.py b/tests/test_integer.py index 3c83945..ae997d9 100644 --- a/tests/test_integer.py +++ b/tests/test_integer.py @@ -27,7 +27,12 @@ def test_validation_error(): def test_json_schema(): assert MyModel.model_json_schema() == { - "properties": {"num": {"title": "Num", "type": "integer"}}, + "properties": { + "num": { + "anyOf": [{"type": "string"}, {"type": "integer"}], + "title": "Num", + } + }, "required": ["num"], "title": "MyModel", "type": "object", diff --git a/tests/test_language.py b/tests/test_language.py index 38a31f2..ae5af53 100644 --- 
a/tests/test_language.py +++ b/tests/test_language.py @@ -8,7 +8,7 @@ def test_load_languages(): source = load_languages() entities = source() assert len(entities) == 7910 - assert entities[0].resolve() == 'Ghotuo' + assert entities[0].resolve() == "Ghotuo" def test_language_model_resolution(): diff --git a/tests/test_person.py b/tests/test_person.py index 9c3b489..8acb06b 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -75,7 +75,8 @@ def test_value_error(): def test_json_schema(): - assert MyModel.model_json_schema() == { + data = MyModel.model_json_schema() + expected_data = { "$defs": { "PersonModel": { "properties": { @@ -126,24 +127,14 @@ def test_json_schema(): } }, "properties": { - "person": { - "anyOf": [ - {"$ref": "#/$defs/PersonModel"}, - {"type": "string"}, - ], - "title": "Person", - }, "optional": { - "anyOf": [ - {"$ref": "#/$defs/PersonModel"}, - {"type": "string"}, - {"type": "null"}, - ], + "anyOf": [{"$ref": "#/$defs/PersonModel"}, {"type": "null"}], "default": None, - "title": "Optional", }, + "person": {"$ref": "#/$defs/PersonModel"}, }, "required": ["person"], "title": "MyModel", "type": "object", } + assert data == expected_data From a080af933c5c1b3ae5a58a842a99254abc3fc98e Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Wed, 20 Mar 2024 20:14:12 -0400 Subject: [PATCH 05/15] mypy cleanup. 
--- pyproject.toml | 1 + src/fuzztypes/in_memory.py | 2 +- src/fuzztypes/on_disk.py | 2 +- src/fuzztypes/py.typed | 0 tests/conftest.py | 7 +++++-- tests/in_memory/test_in_memory_name.py | 2 +- tests/on_disk/test_on_disk_fuzz.py | 2 +- tests/on_disk/test_on_disk_name.py | 2 +- tests/test_entity.py | 4 +++- tests/test_function.py | 2 +- 10 files changed, 15 insertions(+), 9 deletions(-) create mode 100644 src/fuzztypes/py.typed diff --git a/pyproject.toml b/pyproject.toml index e017fdd..b523c1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ path = "src/fuzztypes/__init__.py" [mypy] no-untyped-def = false +import-untyped = false [[tool.mypy.overrides]] module = "gpt.tests.*" diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index 71bb44b..f5fed76 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -182,7 +182,7 @@ def find_knn(self, key: str) -> tuple: def InMemory( - source: Iterable[NamedEntity], + source: Iterable, *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index 3bb820e..cf1324f 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -225,7 +225,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: def OnDisk( identity: str, - source: Iterable[NamedEntity], + source: Iterable, *, case_sensitive: bool = False, device: Optional[const.DeviceList] = None, diff --git a/src/fuzztypes/py.typed b/src/fuzztypes/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py index 0300586..8a8a488 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ from pytest import fixture -from fuzztypes import EntitySource +from fuzztypes import EntitySource, NamedEntity @fixture(scope="session") @@ -21,11 +21,14 @@ def EmojiSource(data_path): def FruitSource(data_path): # loading separately from AnimalSource to test lazy loading 
MixedSource = EntitySource(data_path / "mixed.jsonl") - FruitSource = MixedSource["fruit"] assert MixedSource.loaded is False + + FruitSource = MixedSource["fruit"] + assert isinstance(FruitSource, EntitySource) assert FruitSource.loaded is False # first access loads FruitSource -> MixedSource + assert isinstance(FruitSource[0], NamedEntity) assert FruitSource[0].value == "Apple" assert FruitSource.loaded is True assert MixedSource.loaded is True diff --git a/tests/in_memory/test_in_memory_name.py b/tests/in_memory/test_in_memory_name.py index 78e118d..2f2575b 100644 --- a/tests/in_memory/test_in_memory_name.py +++ b/tests/in_memory/test_in_memory_name.py @@ -57,7 +57,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(None) + value: Optional[NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/on_disk/test_on_disk_fuzz.py b/tests/on_disk/test_on_disk_fuzz.py index 96eca24..72ad99c 100644 --- a/tests/on_disk/test_on_disk_fuzz.py +++ b/tests/on_disk/test_on_disk_fuzz.py @@ -1,5 +1,5 @@ import os -import tantivy +import tantivy # type: ignore from fuzztypes import Fuzzmoji, const diff --git a/tests/on_disk/test_on_disk_name.py b/tests/on_disk/test_on_disk_name.py index 8db8d5d..414e47d 100644 --- a/tests/on_disk/test_on_disk_name.py +++ b/tests/on_disk/test_on_disk_name.py @@ -74,7 +74,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(None) + value: Optional[NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/test_entity.py b/tests/test_entity.py index 4ec187d..352a043 100644 --- a/tests/test_entity.py +++ b/tests/test_entity.py @@ -82,4 +82,6 @@ def fn(): return [NamedEntity(value="hi!")] source = 
EntitySource(source=fn) - assert source[0].value == "hi!" + entity = source[0] + assert isinstance(entity, NamedEntity) + assert entity.value == "hi!" diff --git a/tests/test_function.py b/tests/test_function.py index ff45598..696cd4a 100644 --- a/tests/test_function.py +++ b/tests/test_function.py @@ -30,7 +30,7 @@ def test_class_getitem(): def test_missing_lookup(): - def apple_banana(key: str) -> str: + def apple_banana(key: str) -> Optional[str]: return dict(a="apple", b="banana").get(key) AppleBanana = Function(apple_banana) From 3be6451915648abb68dd8ed75138855c392f0932 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Wed, 20 Mar 2024 21:50:59 -0400 Subject: [PATCH 06/15] mypy cleanup. --- pyproject.toml | 8 +------- src/fuzztypes/abstract.py | 2 +- src/fuzztypes/entity.py | 7 ++++--- src/fuzztypes/match.py | 6 +----- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b523c1c..60e16af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,13 +60,7 @@ ext = [ [tool.hatch.version] path = "src/fuzztypes/__init__.py" -[mypy] -no-untyped-def = false -import-untyped = false - -[[tool.mypy.overrides]] -module = "gpt.tests.*" -ignore_missing_imports = true +[tool.mypy] check_untyped_defs = true [tool.pytest.ini_options] diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index a23203c..29c0c24 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -156,7 +156,7 @@ def lookup(cls, key: T) -> Optional[Entity[T]]: """ match_list: MatchResult = lookup_function(key) - if match_list.success: + if match_list.choice is not None: return match_list.entity if notfound_mode == "allow": diff --git a/src/fuzztypes/entity.py b/src/fuzztypes/entity.py index 9f5c8b2..564f8fd 100644 --- a/src/fuzztypes/entity.py +++ b/src/fuzztypes/entity.py @@ -133,14 +133,14 @@ def __iter__(self): def _load_if_necessary(self): if not self.loaded: self.loaded = True - if isinstance(self.source, Tuple): + if 
isinstance(self.source, tuple): parent, label = self.source self.entities = [e for e in parent if e.label == label] - elif isinstance(self.source, Callable): + elif callable(self.source): self.entities = self.source() - elif self.source: + elif isinstance(self.source, Path): dialects = { "csv": self.from_csv, "tsv": self.from_tsv, @@ -149,6 +149,7 @@ def _load_if_necessary(self): } _, ext = self.source.name.lower().rsplit(".", maxsplit=1) f = dialects.get(ext) + assert f is not None, f"No reader found for: {ext}" # noinspection PyArgumentList self.entities = f(self.source) diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index 7f6a0d1..b0d9dc1 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -46,13 +46,9 @@ def __getitem__(self, item): def __str__(self): return ", ".join(map(str, self.matches)) - @property - def success(self): - return self.choice is not None - @property def entity(self): - return self.success and self.choice.entity + return self.choice is not None and self.choice.entity def set( self, From 52692376d672fd14f85964a3578d76ad158c127d Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Thu, 21 Mar 2024 06:19:03 -0700 Subject: [PATCH 07/15] separate storage from abstract. --- src/fuzztypes/__init__.py | 1 + src/fuzztypes/abstract.py | 99 ----------------------------------- src/fuzztypes/in_memory.py | 10 ++-- src/fuzztypes/on_disk.py | 7 +-- src/fuzztypes/storage.py | 102 +++++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 107 deletions(-) create mode 100644 src/fuzztypes/storage.py diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index be21aa5..1500cfb 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -20,6 +20,7 @@ # Hidden Abstract Types from . import abstract +from . 
import storage # Base Named Entity Types from .in_memory import InMemory diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index 29c0c24..d8e52b9 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -173,102 +173,3 @@ def lookup(cls, key: T) -> Optional[Entity[T]]: raise PydanticCustomError("key_not_found", msg, ctx) return _AbstractType - - -class AbstractStorage: - def __init__( - self, - source: Iterable[NamedEntity], - *, - case_sensitive: bool = False, - encoder: Union[Callable, str, object] = None, - entity_type: Type[NamedEntity] = NamedEntity, - device: const.DeviceList = "cpu", - fuzz_scorer: str = "token_sort_ratio", - limit: int = 10, - min_similarity: float = 80.0, - search_flag: flags.SearchFlag = flags.DefaultSearch, - tiebreaker_mode: const.TiebreakerMode = "raise", - ): - assert not search_flag.is_hybrid, "Hybrid search not yet supported!" - - self.source = source - - # options - self.case_sensitive = case_sensitive - self.device = device - self.entity_type = entity_type - self.limit = limit - self.min_similarity = min_similarity - self.prepped = False - self.search_flag = search_flag - self.tiebreaker_mode = tiebreaker_mode - - # store string for lazy loading - self._fuzz_scorer = fuzz_scorer - self._encoder = encoder - self._vect_dimensions = None - - def __call__(self, key: str) -> MatchResult: - if not self.prepped: - self.prepped = True - self.prepare() - - match_list = self.get(key) - match_list.choose(self.min_similarity, self.tiebreaker_mode) - return match_list - - def prepare(self): - raise NotImplementedError - - def get(self, key: str) -> MatchResult: - raise NotImplementedError - - def normalize(self, key: str): - if key: - key = key.strip() - if self.case_sensitive: - return key - else: - return key.lower() - - # - # encoding - # - - @property - def encoder(self): - return lazy.create_encoder(self._encoder, device=self.device) - - @property - def vect_dimensions(self): - if self._vect_dimensions is 
None: - dummy_encoded = self.encode([""]) - self._vect_dimensions = dummy_encoded.shape[1] - return self._vect_dimensions - - def encode(self, values: List[str]): - return self.encoder( - values, - ) - - # - # fuzzy matching - # - - @property - def rapidfuzz(self): - return lazy.lazy_import("rapidfuzz") - - @property - def fuzz_scorer(self): - return getattr( - self.rapidfuzz.fuzz, - self._fuzz_scorer, - self.rapidfuzz.fuzz.token_sort_ratio, - ) - - def fuzz_clean(self, term: str) -> str: - # no really, it's a string - # noinspection PyTypeChecker - return self.rapidfuzz.utils.default_process(term) diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index f5fed76..61d18c6 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Callable, Iterable, Union, List, Dict, Type, Optional +from typing import Callable, Iterable, Union, Type, Optional from pydantic import PositiveInt @@ -12,10 +12,11 @@ const, flags, lazy, + storage, ) -class InMemoryStorage(abstract.AbstractStorage): +class InMemoryStorage(storage.AbstractStorage): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -196,7 +197,7 @@ def InMemory( tiebreaker_mode: const.TiebreakerMode = "raise", validator_mode: const.ValidatorMode = "before", ): - storage = InMemoryStorage( + source = InMemoryStorage( source, case_sensitive=case_sensitive, encoder=encoder, @@ -208,10 +209,9 @@ def InMemory( ) return abstract.AbstractType( - storage, + source, EntityType=entity_type, examples=examples, - input_type=str, notfound_mode=notfound_mode, validator_mode=validator_mode, ) diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index cf1324f..e9ad2ce 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -11,12 +11,13 @@ const, flags, lazy, + storage, ) accelerators = {"cuda", "mps"} -class OnDiskStorage(abstract.AbstractStorage): +class 
OnDiskStorage(storage.AbstractStorage): def __init__( self, name: str, @@ -241,7 +242,7 @@ def OnDisk( tiebreaker_mode: const.TiebreakerMode = "raise", validator_mode: const.ValidatorMode = "before", ): - storage = OnDiskStorage( + source = OnDiskStorage( identity, source, case_sensitive=case_sensitive, @@ -256,7 +257,7 @@ def OnDisk( ) return abstract.AbstractType( - storage, + source, EntityType=entity_type, examples=examples, input_type=input_type, diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py new file mode 100644 index 0000000..f6d78af --- /dev/null +++ b/src/fuzztypes/storage.py @@ -0,0 +1,102 @@ +from typing import Callable, Iterable, List, Type, Union + +from fuzztypes import NamedEntity, MatchResult, const, flags, lazy + + +class AbstractStorage: + def __init__( + self, + source: Iterable[NamedEntity], + *, + case_sensitive: bool = False, + encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, + device: const.DeviceList = "cpu", + fuzz_scorer: str = "token_sort_ratio", + limit: int = 10, + min_similarity: float = 80.0, + search_flag: flags.SearchFlag = flags.DefaultSearch, + tiebreaker_mode: const.TiebreakerMode = "raise", + ): + assert not search_flag.is_hybrid, "Hybrid search not yet supported!" 
+ + self.source = source + + # options + self.case_sensitive = case_sensitive + self.device = device + self.entity_type = entity_type + self.limit = limit + self.min_similarity = min_similarity + self.prepped = False + self.search_flag = search_flag + self.tiebreaker_mode = tiebreaker_mode + + # store string for lazy loading + self._fuzz_scorer = fuzz_scorer + self._encoder = encoder + self._vect_dimensions = None + + def __call__(self, key: str) -> MatchResult: + if not self.prepped: + self.prepped = True + self.prepare() + + match_list = self.get(key) + match_list.choose(self.min_similarity, self.tiebreaker_mode) + return match_list + + def prepare(self): + raise NotImplementedError + + def get(self, key: str) -> MatchResult: + raise NotImplementedError + + def normalize(self, key: str): + if key: + key = key.strip() + if self.case_sensitive: + return key + else: + return key.lower() + + # + # encoding + # + + @property + def encoder(self): + return lazy.create_encoder(self._encoder, device=self.device) + + @property + def vect_dimensions(self): + if self._vect_dimensions is None: + dummy_encoded = self.encode([""]) + self._vect_dimensions = dummy_encoded.shape[1] + return self._vect_dimensions + + def encode(self, values: List[str]): + return self.encoder( + values, + ) + + # + # fuzzy matching + # + + @property + def rapidfuzz(self): + return lazy.lazy_import("rapidfuzz") + + @property + def fuzz_scorer(self): + return getattr( + self.rapidfuzz.fuzz, + self._fuzz_scorer, + self.rapidfuzz.fuzz.token_sort_ratio, + ) + + def fuzz_clean(self, term: str) -> str: + # no really, it's a string + # noinspection PyTypeChecker + return self.rapidfuzz.utils.default_process(term) From 89d45f49e59dca21c283f89c76de7ef88a7a52a0 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Sat, 23 Mar 2024 18:56:53 -0700 Subject: [PATCH 08/15] added type adapter functions. refactoring to Annotated for mypy purposes. 
--- README.md | 23 +--- src/fuzztypes/__init__.py | 2 - src/fuzztypes/abstract.py | 14 +-- src/fuzztypes/ascii.py | 14 +-- src/fuzztypes/date.py | 18 +-- src/fuzztypes/function.py | 32 ----- src/fuzztypes/integer.py | 11 +- src/fuzztypes/person.py | 6 +- src/fuzztypes/storage.py | 24 ++-- src/fuzztypes/utils/__init__.py | 4 + src/fuzztypes/utils/adapter.py | 37 ++++++ tests/test_annotation.py | 27 +++++ tests/test_ascii.py | 10 +- tests/test_date.py | 45 ++++--- tests/test_emoji.py | 12 +- tests/test_full_model.py | 201 ++++++++++++++++++++++++++++++++ tests/test_function.py | 76 ------------ tests/test_integer.py | 43 +++---- tests/test_person.py | 76 +----------- 19 files changed, 378 insertions(+), 297 deletions(-) delete mode 100644 src/fuzztypes/function.py create mode 100644 src/fuzztypes/utils/adapter.py create mode 100644 tests/test_annotation.py create mode 100644 tests/test_full_model.py delete mode 100644 tests/test_function.py diff --git a/README.md b/README.md index 7eeacd9..3262790 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Inventor = InMemory(inventors, search_flag=flags.FuzzSearch) # custom Regex type for finding twitter handles. Handle = Regex(r'@\w{1,15}', examples=["@genomoncology"]) -# define a Pydantic class with 9 fuzzy type attriubutes +# define a Pydantic class with 9 fuzzy type attributes class Fuzzy(BaseModel): ascii: ASCII email: Email @@ -370,27 +370,6 @@ def load_animals(): animal_source = EntitySource(load_animals) ``` -### Function Base Type - -The `Function` base type allows you to use any function that accepts -one value and returns one value for transformation. It is useful -for creating simple annotation types that perform custom data -transformations. 
- -Example: -```python -from fuzztypes import Function - -# Create a custom annotation type that converts a value to uppercase -UpperCase = Function(str.upper) - -class MyModel(BaseModel): - name: UpperCase - -model = MyModel(name="john") -assert model.name == "JOHN" -``` - ### InMemory Base Type The `InMemory` base type enables matching entities in memory using diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index 1500cfb..5f24648 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -27,7 +27,6 @@ from .on_disk import OnDisk # Base Non-Entity Types -from .function import Function from .regex import Regex # Usable Types @@ -47,7 +46,6 @@ "Emoji", "Entity", "EntitySource", - "Function", "Fuzzmoji", "InMemory", "Integer", diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index d8e52b9..5f4361b 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -4,8 +4,6 @@ Callable, Dict, Generic, - Iterable, - List, Optional, Type, TypeVar, @@ -20,7 +18,7 @@ ) from pydantic_core import CoreSchema, PydanticCustomError, core_schema -from fuzztypes import NamedEntity, Entity, MatchResult, const, flags, lazy +from fuzztypes import Entity, MatchResult, const T = TypeVar("T") @@ -87,10 +85,10 @@ def __get_pydantic_core_schema__( based on the provided validation mode and input/output types. 
""" validation_function_map: Dict[str, Callable] = { - "before": core_schema.with_info_before_validator_function, - "after": core_schema.with_info_before_validator_function, - "plain": core_schema.with_info_plain_validator_function, - "wrap": core_schema.with_info_wrap_validator_function, + "before": core_schema.no_info_before_validator_function, + "after": core_schema.no_info_before_validator_function, + "plain": core_schema.no_info_plain_validator_function, + "wrap": core_schema.no_info_wrap_validator_function, } validation_function = validation_function_map[validator_mode] @@ -123,7 +121,7 @@ def __get_pydantic_json_schema__( schema["examples"] = examples return schema - def __new__(cls, key: T, _: Any = None) -> Optional[T]: # type: ignore + def __new__(cls, key: T) -> Optional[T]: # type: ignore """ Doesn't create an AbstractType, it's actually a class-level __call__ function. diff --git a/src/fuzztypes/ascii.py b/src/fuzztypes/ascii.py index 826e546..3e16e72 100644 --- a/src/fuzztypes/ascii.py +++ b/src/fuzztypes/ascii.py @@ -1,6 +1,7 @@ -from typing import Callable +from typing import Annotated, Any, Callable +from pydantic import BeforeValidator -from fuzztypes import Function, lazy +from fuzztypes import lazy _tx = None @@ -27,12 +28,9 @@ def get_tx() -> Callable: # pragma: no cover return _tx -def to_ascii(key: str) -> str: +def to_ascii(key: Any) -> str: f = _tx or get_tx() - return f(key) + return f(str(key)) -ASCII = Function( - to_ascii, - output_type=str, -) +ASCII = Annotated[str, BeforeValidator(to_ascii)] diff --git a/src/fuzztypes/date.py b/src/fuzztypes/date.py index 80fff5e..2165599 100644 --- a/src/fuzztypes/date.py +++ b/src/fuzztypes/date.py @@ -1,9 +1,11 @@ import datetime -from typing import Optional, Union, Type +from typing import Annotated, Optional, Union, Type from . 
import Entity, MatchResult, abstract, const, lazy -date_or_datetime = Union[datetime.date, datetime.datetime] +DateOrDatetime = Union[datetime.date, datetime.datetime] +DateOrStr = Union[str, datetime.date] +DatetimeOrStr = Union[str, datetime.datetime] def DateType( @@ -11,12 +13,12 @@ def DateType( examples: Optional[list] = None, languages: Optional[list[str]] = None, notfound_mode: const.NotFoundMode = "raise", - input_type: Type[date_or_datetime] = datetime.date, + input_type: Type[DateOrDatetime] = datetime.date, timezone: Optional[str] = None, validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, - relative_base: Optional[date_or_datetime] = None, + relative_base: Optional[DateOrDatetime] = None, ): DateDataParser = lazy.lazy_import("dateparser.date", "DateDataParser") languages = languages or ["en"] @@ -59,12 +61,12 @@ def DatetimeType( examples: Optional[list] = None, languages: Optional[list[str]] = None, notfound_mode: const.NotFoundMode = "raise", - input_type: Type[date_or_datetime] = datetime.datetime, + input_type: Type[DateOrDatetime] = datetime.datetime, timezone: Optional[str] = None, validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, - relative_base: Optional[date_or_datetime] = None, + relative_base: Optional[DateOrDatetime] = None, ): return DateType( date_order, @@ -80,5 +82,5 @@ def DatetimeType( ) -Date = DateType() -Datetime = DatetimeType() +Date = Annotated[datetime.date, DateType()] +Datetime = Annotated[datetime.datetime, DatetimeType()] diff --git a/src/fuzztypes/function.py b/src/fuzztypes/function.py deleted file mode 100644 index 54c730a..0000000 --- a/src/fuzztypes/function.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Callable, Type, Optional, TypeVar - -from . 
import Entity, MatchResult, const, abstract - - -T = TypeVar("T", bound=abstract.SupportedType) - - -def Function( - source: Callable[[T], abstract.SupportedType], - examples: Optional[list] = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[abstract.SupportedType] = str, - output_type: Optional[Type[abstract.SupportedType]] = None, - validator_mode: const.ValidatorMode = "before", -): - def do_lookup(key: T) -> MatchResult: - value = source(key) - match_list = MatchResult() - if value is not None: - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list - - return abstract.AbstractType( - do_lookup, - examples=examples, - input_type=input_type, - output_type=output_type, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) diff --git a/src/fuzztypes/integer.py b/src/fuzztypes/integer.py index 85084d5..475ead9 100644 --- a/src/fuzztypes/integer.py +++ b/src/fuzztypes/integer.py @@ -1,6 +1,8 @@ -from typing import Callable, Union +from typing import Annotated, Callable, Union -from fuzztypes import Function, lazy +from pydantic import BeforeValidator, WithJsonSchema + +from fuzztypes import lazy _tx = None @@ -23,7 +25,4 @@ def to_int(key: Union[int, str]) -> int: return val -Integer = Function( - to_int, - output_type=int, -) +Integer = Annotated[int, BeforeValidator(to_int)] diff --git a/src/fuzztypes/person.py b/src/fuzztypes/person.py index af8ccc9..edcb64b 100644 --- a/src/fuzztypes/person.py +++ b/src/fuzztypes/person.py @@ -1,4 +1,4 @@ -from typing import Type, Union, Optional +from typing import Annotated, Union, Optional from pydantic import BaseModel from fuzztypes import Entity, MatchResult, abstract, const, lazy @@ -83,7 +83,7 @@ def human_name(self, name_format=None, init_format=None): ) -def PersonModelType( +def PersonValidator( name_format: str = FULL_NAME, init_format: str = FULL_INIT, capitalize: bool = True, @@ -125,4 +125,4 @@ def do_lookup(key: Union[str, PersonModel]) -> 
MatchResult: # default annotation -Person = PersonModelType() +Person = Annotated[PersonModel, PersonValidator()] diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py index f6d78af..fef5c14 100644 --- a/src/fuzztypes/storage.py +++ b/src/fuzztypes/storage.py @@ -5,18 +5,18 @@ class AbstractStorage: def __init__( - self, - source: Iterable[NamedEntity], - *, - case_sensitive: bool = False, - encoder: Union[Callable, str, object] = None, - entity_type: Type[NamedEntity] = NamedEntity, - device: const.DeviceList = "cpu", - fuzz_scorer: str = "token_sort_ratio", - limit: int = 10, - min_similarity: float = 80.0, - search_flag: flags.SearchFlag = flags.DefaultSearch, - tiebreaker_mode: const.TiebreakerMode = "raise", + self, + source: Iterable[NamedEntity], + *, + case_sensitive: bool = False, + encoder: Union[Callable, str, object] = None, + entity_type: Type[NamedEntity] = NamedEntity, + device: const.DeviceList = "cpu", + fuzz_scorer: str = "token_sort_ratio", + limit: int = 10, + min_similarity: float = 80.0, + search_flag: flags.SearchFlag = flags.DefaultSearch, + tiebreaker_mode: const.TiebreakerMode = "raise", ): assert not search_flag.is_hybrid, "Hybrid search not yet supported!" 
diff --git a/src/fuzztypes/utils/__init__.py b/src/fuzztypes/utils/__init__.py index 1e84b9f..df01c34 100644 --- a/src/fuzztypes/utils/__init__.py +++ b/src/fuzztypes/utils/__init__.py @@ -1,6 +1,10 @@ +from .adapter import get_type_adapter, validate_json, validate_python from .download import download_file, get_file __all__ = ( "download_file", + "get_type_adapter", "get_file", + "validate_json", + "validate_python", ) diff --git a/src/fuzztypes/utils/adapter.py b/src/fuzztypes/utils/adapter.py new file mode 100644 index 0000000..1490af0 --- /dev/null +++ b/src/fuzztypes/utils/adapter.py @@ -0,0 +1,37 @@ +from functools import lru_cache +from typing import Any, Union + +from pydantic import TypeAdapter + + +@lru_cache(maxsize=None) +def get_type_adapter(cls: Any) -> TypeAdapter: + """ + Get a type adapter for the given class wrapped by a cache. + + :param cls: TypedDict, BaseModel, or Annotation. + :return: TypeAdapter wrapper of cls + """ + return TypeAdapter(cls) + + +def validate_json(cls: Any, value: Union[str, bytes]) -> Any: + """ + Validate a JSON string or bytes against the model. + + :param cls: TypedDict, BaseModel, or Annotation. + :param value: JSON string or bytes to validate. + :return: Validated Python object. + """ + return get_type_adapter(cls).validate_json(value) + + +def validate_python(cls: Any, value: Any) -> Any: + """ + Validate a Python object against the model. + + :param cls: TypedDict, BaseModel, or Annotation. + :param value: Python object to validate. + :return: Validated Python object. 
+ """ + return get_type_adapter(cls).validate_python(value) diff --git a/tests/test_annotation.py b/tests/test_annotation.py new file mode 100644 index 0000000..a3eaf0f --- /dev/null +++ b/tests/test_annotation.py @@ -0,0 +1,27 @@ +from pydantic import BaseModel, BeforeValidator, TypeAdapter +from typing import Annotated + +from fuzztypes.ascii import to_ascii +from fuzztypes.in_memory import InMemory + + +def test_ascii(): + ASCII = Annotated[str, BeforeValidator(to_ascii)] + + assert TypeAdapter(ASCII).validate_python("άνθρωποι") == "anthropoi" + + class MyModel(BaseModel): + ascii: ASCII + + assert MyModel(ascii="άνθρωποι").ascii == "anthropoi" + assert MyModel(ascii=123).ascii == "123" + + +def test_in_memory(MythSource): + Myth = Annotated[str, InMemory(MythSource)] + assert TypeAdapter(Myth).validate_python("zeus") == "Zeus" + + class MyModel(BaseModel): + myth: Myth + + assert MyModel(myth="jove").myth == "Zeus" diff --git a/tests/test_ascii.py b/tests/test_ascii.py index 30e86f0..6d20f47 100644 --- a/tests/test_ascii.py +++ b/tests/test_ascii.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- -from pydantic import BaseModel -from fuzztypes import ASCII, Entity, NamedEntity +from pydantic import BaseModel, TypeAdapter + +from fuzztypes import ASCII def test_ascii_usable_type(): - assert isinstance(ASCII("άνθρωποι"), str) - assert isinstance(ASCII["άνθρωποι"], Entity) - assert not isinstance(ASCII["άνθρωποι"], NamedEntity) + ta = TypeAdapter(ASCII) + assert ta.validate_python("άνθρωποι") == "anthropoi" def test_transliterate_utf8_to_ascii(): diff --git a/tests/test_date.py b/tests/test_date.py index c50771d..f570f18 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,26 +1,33 @@ from datetime import datetime, date +from typing import Annotated from zoneinfo import ZoneInfo from pydantic import BaseModel -from fuzztypes import Date, DateType, DatetimeType - -Y2K = datetime(2000, 1, 1, 0, 0, 0) +from fuzztypes import Date, DateType, DatetimeType, utils 
ny_tz = ZoneInfo("America/New_York") +DateY2K = Annotated[ + datetime, DatetimeType(relative_base=datetime(2000, 1, 1), timezone="EST") +] + class MyModel(BaseModel): date: Date - time: DatetimeType(relative_base=Y2K, timezone="EST") + time: DateY2K -def test_fuzzy_date_time(): - obj = MyModel(date="11 July 2012", time="tomorrow 5am") +def test_validate_python_date_and_datetime(): + data = dict(date="11 July 2012", time="tomorrow 5am") + obj = utils.validate_python(MyModel, data) assert obj.date == date(2012, 7, 11) assert obj.time == datetime(2000, 1, 2, 5, 0, 0, tzinfo=ny_tz) - obj = MyModel(date="July 4th", time="1 year ago") + +def test_validate_json_date_and_datetime(): + json = '{"date": "July 4th", "time": "1 year ago"}' + obj = utils.validate_json(MyModel, json) today = date.today() year = today.year if (today.month, today.day) >= (7, 4) else today.year - 1 assert obj.date == date(year, 7, 4) @@ -35,19 +42,31 @@ def test_mdy_vs_ymd(): # MDY vs. YMD ordering is context specific # https://dateparser.readthedocs.io/en/latest/settings.html#date-order # - assert Date["02-03-04"].value == date(year=2004, month=2, day=3) + assert utils.validate_python(Date, "02-03-04") == date( + year=2004, month=2, day=3 + ) DateEN = DateType(languages=["en"]) - assert DateEN["02-03-04"].value == date(year=2004, month=2, day=3) + assert utils.validate_python(DateEN, "02-03-04") == date( + year=2004, month=2, day=3 + ) DateMDY = DateType(date_order="MDY") - assert DateMDY["02-03-04"].value == date(year=2004, month=2, day=3) + assert utils.validate_python(DateMDY, "02-03-04") == date( + year=2004, month=2, day=3 + ) DateES = DateType(languages=["es"]) - assert DateES["02-03-04"].value == date(year=2004, month=3, day=2) + assert utils.validate_python(DateES, "02-03-04") == date( + year=2004, month=3, day=2 + ) DateDMY = DateType(date_order="DMY") - assert DateDMY["02-03-04"].value == date(year=2004, month=3, day=2) + assert utils.validate_python(DateDMY, "02-03-04") == date( + 
year=2004, month=3, day=2 + ) DateYMD = DateType(date_order="YMD") - assert DateYMD["02-03-04"].value == date(year=2002, month=3, day=4) + assert utils.validate_python(DateYMD, "02-03-04") == date( + year=2002, month=3, day=4 + ) diff --git a/tests/test_emoji.py b/tests/test_emoji.py index 709ace5..43c4a0a 100644 --- a/tests/test_emoji.py +++ b/tests/test_emoji.py @@ -1,12 +1,12 @@ -from fuzztypes import Emoji, emojis +from fuzztypes import Emoji, emojis, utils def test_key_access(): - assert Emoji("balloon") == "🎈" - assert Emoji(":atm_sign:") == "🏧" - assert Emoji("atm sign") == "🏧" - assert Emoji("atm") == "🏧" - assert Emoji("United States") == "🇺🇸" + assert utils.validate_python(Emoji, "balloon") == "🎈" + assert utils.validate_python(Emoji, ":atm_sign:") == "🏧" + assert utils.validate_python(Emoji, "atm sign") == "🏧" + assert utils.validate_python(Emoji, "atm") == "🏧" + assert utils.validate_python(Emoji, "United States") == "🇺🇸" def test_load_emojis(): diff --git a/tests/test_full_model.py b/tests/test_full_model.py new file mode 100644 index 0000000..af44482 --- /dev/null +++ b/tests/test_full_model.py @@ -0,0 +1,201 @@ +from datetime import datetime +from typing import Annotated + +from pydantic import BaseModel + +from fuzztypes import ( + ASCII, + Datetime, + Email, + Fuzzmoji, + InMemory, + Integer, + Person, + Regex, + ZipCode, + flags, +) + +# define a source, see EntitySource for using TSV, CSV, JSONL +inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] + +# define a named entity type in memory. use OnDisk for larger data sets. +Inventor = Annotated[str, InMemory(inventors, search_flag=flags.FuzzSearch)] + +# custom Regex type for finding twitter handles. 
+Handle = Annotated[str, Regex(r"@\w{1,15}", examples=["@genomoncology"])] + + +# define a Pydantic class with 9 fuzzy type attributes +class Fuzzy(BaseModel): + ascii: ASCII + email: Email + emoji: Fuzzmoji + handle: Handle + integer: Integer + inventor: Inventor + person: Person + time: Datetime + zipcode: ZipCode + + +def test_full_model(): + # create an instance of class Fuzzy + obj = Fuzzy( + ascii="άνθρωπος", + email="John Doe ", + emoji="thought bubble", + handle="Ian Maurer (@imaurer)", + integer="fifty-five", + inventor="ada luvlace", + person="mr. arthur herbert fonzarelli (fonzie)", + time="5am on Jan 1, 2025", + zipcode="(Zipcode: 12345-6789)", + ) + + # test the autocorrecting performed + + # greek for man: https://en.wiktionary.org/wiki/άνθρωπος + assert obj.ascii == "anthropos" + + # extract email via regular expression + assert obj.email == "jdoe@example.com" + + # fuzzy match "thought bubble" to "thought balloon" emoji + assert obj.emoji == "💭" + + # simple, inline regex example (see above Handle type) + assert obj.handle == "@imaurer" + + # convert integer word phrase to integer value + assert obj.integer == 55 + + # case-insensitive fuzzy match on lowercase, misspelled name + assert obj.inventor == "Ada Lovelace" + + # human name parser (title, first, middle, last, suffix, nickname) + assert str(obj.person) == "Mr. 
Arthur Herbert Fonzarelli (fonzie)" + assert obj.person.short_name == "Arthur Fonzarelli" + assert obj.person.nickname == "fonzie" + assert obj.person.last == "Fonzarelli" + + # convert time phrase to datetime object + assert obj.time.isoformat() == "2025-01-01T05:00:00" + + # extract zip5 or zip9 formats using regular expressions + assert obj.zipcode == "12345-6789" + + # print JSON on success + assert obj.model_dump() == { + "ascii": "anthropos", + "email": "jdoe@example.com", + "emoji": "💭", + "handle": "@imaurer", + "integer": 55, + "inventor": "Ada Lovelace", + "person": { + "first": "Arthur", + "init_format": "{first} {middle} {last}", + "last": "Fonzarelli", + "middle": "Herbert", + "name_format": "{title} {first} {middle} {last} {suffix} " + "({nickname})", + "nickname": "fonzie", + "suffix": "", + "title": "Mr.", + }, + "time": datetime(2025, 1, 1, 5), + "zipcode": "12345-6789", + } + + +def test_json_schema(): + data = Fuzzy.model_json_schema() + expected_data = { + "$defs": { + "PersonModel": { + "properties": { + "first": { + "default": "", + "title": "First", + "type": "string", + }, + "init_format": { + "default": "{first} " "{middle} " "{last}", + "title": "Init " "Format", + "type": "string", + }, + "last": {"default": "", "title": "Last", "type": "string"}, + "middle": { + "default": "", + "title": "Middle", + "type": "string", + }, + "name_format": { + "default": "{title} " + "{first} " + "{middle} " + "{last} " + "{suffix} " + "({nickname})", + "title": "Name " "Format", + "type": "string", + }, + "nickname": { + "default": "", + "title": "Nickname", + "type": "string", + }, + "suffix": { + "default": "", + "title": "Suffix", + "type": "string", + }, + "title": { + "default": "", + "title": "Title", + "type": "string", + }, + }, + "title": "PersonModel", + "type": "object", + } + }, + "properties": { + "ascii": {"title": "Ascii", "type": "string"}, + "email": { + "examples": ["user@example.com"], + "title": "Email", + "type": "string", + }, + 
"emoji": {"title": "Emoji", "type": "string"}, + "handle": { + "examples": ["@genomoncology"], + "title": "Handle", + "type": "string", + }, + 'integer': {'title': 'Integer', 'type': 'integer'}, + "inventor": {"title": "Inventor", "type": "string"}, + "person": {"$ref": "#/$defs/PersonModel"}, + "time": {"format": "date-time", "title": "Time", "type": "string"}, + "zipcode": { + "examples": ["12345", "12345-6789"], + "title": "Zipcode", + "type": "string", + }, + }, + "required": [ + "ascii", + "email", + "emoji", + "handle", + "integer", + "inventor", + "person", + "time", + "zipcode", + ], + "title": "Fuzzy", + "type": "object", + } + assert data == expected_data diff --git a/tests/test_function.py b/tests/test_function.py deleted file mode 100644 index 696cd4a..0000000 --- a/tests/test_function.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel, Field - -from fuzztypes import Function - -UpperType = Function(str.upper, examples=["A", "B", "C"]) -LowerType = Function(str.lower, examples=["a", "b", "c"]) - - -# Example usage -class MyClass(BaseModel): - my_upper: UpperType - my_lower: Optional[LowerType] = Field(None) - - -def test_simple_transforms(): - obj = MyClass(my_upper="Abc", my_lower="ABc") - assert obj.my_upper == "ABC" - assert obj.my_lower == "abc" - - -def test_getitem_upper(): - assert UpperType("hello") == "HELLO" - - -def test_class_getitem(): - StripType = Function(str.strip) - assert StripType(" a b c ") == "a b c" - - -def test_missing_lookup(): - def apple_banana(key: str) -> Optional[str]: - return dict(a="apple", b="banana").get(key) - - AppleBanana = Function(apple_banana) - assert AppleBanana["a"].value == "apple" - assert AppleBanana("a") == "apple" - - try: - assert AppleBanana["c"] is not None - assert False, "Didn't throw exception." 
- except KeyError: - pass - - NoAppleBananaOk = Function(apple_banana, notfound_mode="none") - assert NoAppleBananaOk["d"] is None - - AnyFruitOk = Function(apple_banana, notfound_mode="allow") - assert AnyFruitOk("kiwi") == "kiwi" - - -def test_json_schema(): - assert MyClass.model_json_schema() == { - "properties": { - "my_lower": { - "anyOf": [ - { - "examples": ["a", "b", "c"], - "type": "string", - }, - {"type": "null"}, - ], - "default": None, - "title": "My Lower", - }, - "my_upper": { - "examples": ["A", "B", "C"], - "title": "My Upper", - "type": "string", - }, - }, - "required": ["my_upper"], - "title": "MyClass", - "type": "object", - } diff --git a/tests/test_integer.py b/tests/test_integer.py index ae997d9..b5c48e0 100644 --- a/tests/test_integer.py +++ b/tests/test_integer.py @@ -1,39 +1,32 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Integer - - -class MyModel(BaseModel): - num: Integer +from fuzztypes import Integer, utils def test_convert_number_to_int(): - assert MyModel(num=3).num == 3 - assert MyModel(num="three").num == 3 - assert MyModel(num="third").num == 3 - assert MyModel(num="nineteen billion and nineteen").num == 19_000_000_019 + assert utils.validate_python(Integer, 3) == 3 + assert utils.validate_python(Integer, "three") == 3 + assert utils.validate_python(Integer, "third") == 3 + assert ( + utils.validate_python(Integer, "nineteen billion and nineteen") + == 19_000_000_019 + ) assert ( - MyModel(num="two million three thousand and nineteen").num == 2_003_019 + utils.validate_python( + Integer, "two million three thousand and nineteen" + ) + == 2_003_019 ) def test_validation_error(): + class MyModel(BaseModel): + num: Integer + + assert MyModel(num="three").num == 3 # type: ignore[arg-type] + try: - assert MyModel(num="xyz") + assert MyModel(num="xyz") # type: ignore[arg-type] assert False, "Didn't fail to parse non-integer." 
except ValidationError: pass - - -def test_json_schema(): - assert MyModel.model_json_schema() == { - "properties": { - "num": { - "anyOf": [{"type": "string"}, {"type": "integer"}], - "title": "Num", - } - }, - "required": ["num"], - "title": "MyModel", - "type": "object", - } diff --git a/tests/test_person.py b/tests/test_person.py index 8acb06b..0e801c7 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Person +from fuzztypes import Person, utils class MyModel(BaseModel): @@ -29,10 +29,10 @@ def test_example(): assert obj2.optional is None -def test_mixed_capitalization(): - obj = MyModel(person="shirley maclaine") - assert obj.person.first == "Shirley" - assert obj.person.last == "MacLaine" +def test_mixed_capitalization_with_validate_python(): + person = utils.validate_python(Person, "shirley maclaine") + assert person.first == "Shirley" + assert person.last == "MacLaine" def test_different_nickname_format_oh_well(): @@ -72,69 +72,3 @@ def test_value_error(): assert False, "Didn't fail as expected." 
except ValueError: pass - - -def test_json_schema(): - data = MyModel.model_json_schema() - expected_data = { - "$defs": { - "PersonModel": { - "properties": { - "first": { - "default": "", - "title": "First", - "type": "string", - }, - "init_format": { - "default": "{first} " "{middle} " "{last}", - "title": "Init " "Format", - "type": "string", - }, - "last": {"default": "", "title": "Last", "type": "string"}, - "middle": { - "default": "", - "title": "Middle", - "type": "string", - }, - "name_format": { - "default": "{title} " - "{first} " - "{middle} " - "{last} " - "{suffix} " - "({nickname})", - "title": "Name " "Format", - "type": "string", - }, - "nickname": { - "default": "", - "title": "Nickname", - "type": "string", - }, - "suffix": { - "default": "", - "title": "Suffix", - "type": "string", - }, - "title": { - "default": "", - "title": "Title", - "type": "string", - }, - }, - "title": "PersonModel", - "type": "object", - } - }, - "properties": { - "optional": { - "anyOf": [{"$ref": "#/$defs/PersonModel"}, {"type": "null"}], - "default": None, - }, - "person": {"$ref": "#/$defs/PersonModel"}, - }, - "required": ["person"], - "title": "MyModel", - "type": "object", - } - assert data == expected_data From d775bcea1252191a74b83e0f5305dd2fa5806b85 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Sat, 23 Mar 2024 20:21:52 -0700 Subject: [PATCH 09/15] more validation refactoring. 
--- src/fuzztypes/__init__.py | 18 +++++- src/fuzztypes/ascii.py | 5 +- src/fuzztypes/date.py | 61 ++++++------------- src/fuzztypes/emojis.py | 59 ++++++++++-------- src/fuzztypes/integer.py | 6 +- src/fuzztypes/on_disk.py | 2 +- src/fuzztypes/person.py | 34 ++++------- src/fuzztypes/utils/__init__.py | 4 -- .../{utils/adapter.py => validation.py} | 32 +++++++++- tests/on_disk/test_on_disk_fuzz.py | 10 +-- tests/on_disk/test_on_disk_semantic.py | 11 ++-- tests/test_annotation.py | 27 -------- tests/test_date.py | 39 ++++++------ tests/test_emoji.py | 12 ++-- tests/test_full_model.py | 14 ++--- tests/test_integer.py | 14 ++--- tests/test_person.py | 28 ++++----- 17 files changed, 180 insertions(+), 196 deletions(-) rename src/fuzztypes/{utils/adapter.py => validation.py} (51%) delete mode 100644 tests/test_annotation.py diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index 5f24648..a077c3c 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -14,6 +14,14 @@ from . import utils from . 
import lazy +# Validation +from .validation import ( + FuzzValidator, + validate_python, + validate_json, + get_type_adapter, +) + # Schema from .entity import Entity, NamedEntity, EntitySource from .match import Match, MatchResult, Record @@ -31,7 +39,7 @@ # Usable Types from .ascii import ASCII -from .date import Date, DateType, Datetime, DatetimeType +from .date import Date, DateValidator, Datetime, DatetimeValidator from .emojis import Emoji, Fuzzmoji, Vibemoji from .integer import Integer from .language import Language, LanguageName, LanguageCode @@ -47,6 +55,7 @@ "Entity", "EntitySource", "Fuzzmoji", + "FuzzValidator", "InMemory", "Integer", "Language", @@ -61,14 +70,17 @@ "Regex", "SSN", "Date", - "DateType", + "DateValidator", "Datetime", - "DatetimeType", + "DatetimeValidator", "Vibemoji", "ZipCode", "const", "flags", + "get_type_adapter", "lazy", "logger", "utils", + "validate_json", + "validate_python", ) diff --git a/src/fuzztypes/ascii.py b/src/fuzztypes/ascii.py index 3e16e72..726ab29 100644 --- a/src/fuzztypes/ascii.py +++ b/src/fuzztypes/ascii.py @@ -1,7 +1,6 @@ from typing import Annotated, Any, Callable -from pydantic import BeforeValidator -from fuzztypes import lazy +from fuzztypes import FuzzValidator, lazy _tx = None @@ -33,4 +32,4 @@ def to_ascii(key: Any) -> str: return f(str(key)) -ASCII = Annotated[str, BeforeValidator(to_ascii)] +ASCII = Annotated[str, FuzzValidator(to_ascii)] diff --git a/src/fuzztypes/date.py b/src/fuzztypes/date.py index 2165599..0f1247f 100644 --- a/src/fuzztypes/date.py +++ b/src/fuzztypes/date.py @@ -1,21 +1,16 @@ import datetime -from typing import Annotated, Optional, Union, Type +from typing import Annotated, Optional, Union -from . import Entity, MatchResult, abstract, const, lazy +from . 
import FuzzValidator, const, lazy DateOrDatetime = Union[datetime.date, datetime.datetime] -DateOrStr = Union[str, datetime.date] -DatetimeOrStr = Union[str, datetime.datetime] -def DateType( +def DateValidator( date_order: Optional[const.DateOrder] = None, - examples: Optional[list] = None, + is_date: bool = True, languages: Optional[list[str]] = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[DateOrDatetime] = datetime.date, timezone: Optional[str] = None, - validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, relative_base: Optional[DateOrDatetime] = None, @@ -37,50 +32,32 @@ def DateType( parser = DateDataParser(languages=languages, settings=settings) - def parse(key: str) -> MatchResult: - match_list = MatchResult() + def parse(key: str) -> DateOrDatetime: value = parser.get_date_data(key).date_obj - if value is not None: - if input_type is datetime.date: - value = value.date() - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list + value = value.date() if (value and is_date) else value + return value - return abstract.AbstractType( - parse, - examples=examples, - input_type=input_type, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) + return FuzzValidator(parse) -def DatetimeType( +def DatetimeValidator( date_order: Optional[const.DateOrder] = None, - examples: Optional[list] = None, languages: Optional[list[str]] = None, - notfound_mode: const.NotFoundMode = "raise", - input_type: Type[DateOrDatetime] = datetime.datetime, timezone: Optional[str] = None, - validator_mode: const.ValidatorMode = "before", strict: bool = False, prefer_future_dates: bool = False, relative_base: Optional[DateOrDatetime] = None, ): - return DateType( - date_order, - examples, - languages, - notfound_mode, - input_type, - timezone, - validator_mode, - strict, - prefer_future_dates, - relative_base, + return DateValidator( + date_order=date_order, + 
is_date=False, + languages=languages, + timezone=timezone, + strict=strict, + prefer_future_dates=prefer_future_dates, + relative_base=relative_base, ) -Date = Annotated[datetime.date, DateType()] -Datetime = Annotated[datetime.datetime, DatetimeType()] +Date = Annotated[datetime.date, DateValidator()] +Datetime = Annotated[datetime.datetime, DatetimeValidator()] diff --git a/src/fuzztypes/emojis.py b/src/fuzztypes/emojis.py index 2b763b5..0246655 100644 --- a/src/fuzztypes/emojis.py +++ b/src/fuzztypes/emojis.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import List +from typing import Annotated, List from pydantic import TypeAdapter from fuzztypes import NamedEntity, EntitySource, OnDisk, flags, lazy @@ -21,27 +21,36 @@ def load_emoji_entities() -> List[NamedEntity]: EmojiSource = EntitySource(load_emoji_entities) -Emoji = OnDisk( - "Emoji", - EmojiSource, - search_flag=flags.AliasSearch, - tiebreaker_mode="lesser", -) - -Fuzzmoji = OnDisk( - "Fuzzmoji", - EmojiSource, - search_flag=flags.FuzzSearch, - tiebreaker_mode="lesser", - min_similarity=10.0, - device="cpu", -) - -Vibemoji = OnDisk( - "Vibemoji", - EmojiSource, - search_flag=flags.SemanticSearch, - tiebreaker_mode="lesser", - min_similarity=10.0, - device="cpu", -) +Emoji = Annotated[ + str, + OnDisk( + "Emoji", + EmojiSource, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +Fuzzmoji = Annotated[ + str, + OnDisk( + "Fuzzmoji", + EmojiSource, + search_flag=flags.FuzzSearch, + tiebreaker_mode="lesser", + min_similarity=10.0, + device="cpu", + ), +] + +Vibemoji = Annotated[ + str, + OnDisk( + "Vibemoji", + EmojiSource, + search_flag=flags.SemanticSearch, + tiebreaker_mode="lesser", + min_similarity=10.0, + device="cpu", + ), +] diff --git a/src/fuzztypes/integer.py b/src/fuzztypes/integer.py index 475ead9..0f348a2 100644 --- a/src/fuzztypes/integer.py +++ b/src/fuzztypes/integer.py @@ -1,8 +1,6 @@ from typing import Annotated, Callable, Union -from pydantic import 
BeforeValidator, WithJsonSchema - -from fuzztypes import lazy +from fuzztypes import FuzzValidator, lazy _tx = None @@ -25,4 +23,4 @@ def to_int(key: Union[int, str]) -> int: return val -Integer = Annotated[int, BeforeValidator(to_int)] +Integer = Annotated[int, FuzzValidator(to_int)] diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index e9ad2ce..3234410 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -226,7 +226,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: def OnDisk( identity: str, - source: Iterable, + source: Iterable[NamedEntity], *, case_sensitive: bool = False, device: Optional[const.DeviceList] = None, diff --git a/src/fuzztypes/person.py b/src/fuzztypes/person.py index edcb64b..df08dfa 100644 --- a/src/fuzztypes/person.py +++ b/src/fuzztypes/person.py @@ -1,7 +1,8 @@ -from typing import Annotated, Union, Optional +from typing import Annotated, Optional + from pydantic import BaseModel -from fuzztypes import Entity, MatchResult, abstract, const, lazy +from fuzztypes import FuzzValidator, const, lazy FULL_NAME = "{title} {first} {middle} {last} {suffix} ({nickname})" SHORT_NAME = "{first} {last}" @@ -87,41 +88,28 @@ def PersonValidator( name_format: str = FULL_NAME, init_format: str = FULL_INIT, capitalize: bool = True, - examples: Optional[list] = None, - notfound_mode: const.NotFoundMode = "raise", - validator_mode: const.ValidatorMode = "before", ): - def do_lookup(key: Union[str, PersonModel]) -> MatchResult: + def to_person(key) -> Optional[PersonModel]: if isinstance(key, str): human_name = parse(full_name=key) if capitalize: human_name.capitalize(force=True) data = human_name.as_dict() - value = PersonModel( + person = PersonModel( name_format=name_format, init_format=init_format, **data ) elif isinstance(key, PersonModel): - value = key + person = key elif isinstance(key, dict): - value = PersonModel(**key) + person = PersonModel(**key) elif key is None: - value = None + person = None 
else: raise ValueError(f"Unexpected key type {type(key)} for {key}.") - match_list = MatchResult() - entity = Entity(value=value) - match_list.set(key=key, entity=entity) - return match_list - - return abstract.AbstractType( - do_lookup, - examples=examples, - input_type=PersonModel, - notfound_mode=notfound_mode, - output_type=PersonModel, - validator_mode=validator_mode, - ) + return person + + return FuzzValidator(to_person) # default annotation diff --git a/src/fuzztypes/utils/__init__.py b/src/fuzztypes/utils/__init__.py index df01c34..1e84b9f 100644 --- a/src/fuzztypes/utils/__init__.py +++ b/src/fuzztypes/utils/__init__.py @@ -1,10 +1,6 @@ -from .adapter import get_type_adapter, validate_json, validate_python from .download import download_file, get_file __all__ = ( "download_file", - "get_type_adapter", "get_file", - "validate_json", - "validate_python", ) diff --git a/src/fuzztypes/utils/adapter.py b/src/fuzztypes/validation.py similarity index 51% rename from src/fuzztypes/utils/adapter.py rename to src/fuzztypes/validation.py index 1490af0..42b2338 100644 --- a/src/fuzztypes/utils/adapter.py +++ b/src/fuzztypes/validation.py @@ -1,7 +1,19 @@ +"""This module contains related classes and functions for validation.""" + +import dataclasses +import sys from functools import lru_cache -from typing import Any, Union +from typing import Any, Union, Callable, Dict, cast + +from pydantic import GetCoreSchemaHandler, TypeAdapter +from pydantic_core import core_schema +from fuzztypes import const -from pydantic import TypeAdapter +dataclass_kwargs: Dict[str, Any] + +slots_true: Dict[str, bool] = {} +if sys.version_info >= (3, 10): + slots_true = {"slots": True} @lru_cache(maxsize=None) @@ -35,3 +47,19 @@ def validate_python(cls: Any, value: Any) -> Any: :return: Validated Python object. 
""" return get_type_adapter(cls).validate_python(value) + + +@dataclasses.dataclass(frozen=True, **slots_true) +class FuzzValidator: + func: Callable[[Any], Any] + notfound_mode: const.NotFoundMode = "raise" + + def __get_pydantic_core_schema__( + self, source_type: Any, handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + schema = handler(source_type) + func = cast(core_schema.NoInfoValidatorFunction, self.func) + + return core_schema.no_info_before_validator_function( + func, schema=schema + ) diff --git a/tests/on_disk/test_on_disk_fuzz.py b/tests/on_disk/test_on_disk_fuzz.py index 72ad99c..8895506 100644 --- a/tests/on_disk/test_on_disk_fuzz.py +++ b/tests/on_disk/test_on_disk_fuzz.py @@ -1,11 +1,13 @@ import os + import tantivy # type: ignore -from fuzztypes import Fuzzmoji, const + +from fuzztypes import Fuzzmoji, const, validate_python def test_tantivy(): # make sure the index is built - assert Fuzzmoji("balloon") == "🎈" + assert validate_python(Fuzzmoji, "balloon") == "🎈" # standard schema schema_builder = tantivy.SchemaBuilder() @@ -38,5 +40,5 @@ def test_tantivy(): def test_fuzzmoji(): - assert Fuzzmoji("thought bubble") == "💭" - assert Fuzzmoji("bubble team") == "🧋" + assert validate_python(Fuzzmoji, "thought bubble") == "💭" + assert validate_python(Fuzzmoji, "bubble team") == "🧋" diff --git a/tests/on_disk/test_on_disk_semantic.py b/tests/on_disk/test_on_disk_semantic.py index 9016866..3e05f4a 100644 --- a/tests/on_disk/test_on_disk_semantic.py +++ b/tests/on_disk/test_on_disk_semantic.py @@ -1,7 +1,7 @@ import pytest from pydantic import BaseModel -from fuzztypes import flags, on_disk, Vibemoji +from fuzztypes import flags, on_disk, Vibemoji, validate_python @pytest.fixture(scope="session") @@ -30,8 +30,7 @@ class MyModel(BaseModel): def test_vibemoji_get_value(): - assert Vibemoji("bacon tastes good") == "🥓" - assert Vibemoji("take the bus to school") == "🚌" - assert Vibemoji("jolly santa") == "🎅" - assert Vibemoji("st. 
nick") == "🇲🇫" # can't win them all! - assert Vibemoji("United States") == "🇺🇸" + assert validate_python(Vibemoji, "bacon tastes good") == "🥓" + assert validate_python(Vibemoji, "take the bus to school") == "🚌" + assert validate_python(Vibemoji, "jolly santa") == "🎅" + assert validate_python(Vibemoji, "United States") == "🇺🇸" diff --git a/tests/test_annotation.py b/tests/test_annotation.py deleted file mode 100644 index a3eaf0f..0000000 --- a/tests/test_annotation.py +++ /dev/null @@ -1,27 +0,0 @@ -from pydantic import BaseModel, BeforeValidator, TypeAdapter -from typing import Annotated - -from fuzztypes.ascii import to_ascii -from fuzztypes.in_memory import InMemory - - -def test_ascii(): - ASCII = Annotated[str, BeforeValidator(to_ascii)] - - assert TypeAdapter(ASCII).validate_python("άνθρωποι") == "anthropoi" - - class MyModel(BaseModel): - ascii: ASCII - - assert MyModel(ascii="άνθρωποι").ascii == "anthropoi" - assert MyModel(ascii=123).ascii == "123" - - -def test_in_memory(MythSource): - Myth = Annotated[str, InMemory(MythSource)] - assert TypeAdapter(Myth).validate_python("zeus") == "Zeus" - - class MyModel(BaseModel): - myth: Myth - - assert MyModel(myth="jove").myth == "Zeus" diff --git a/tests/test_date.py b/tests/test_date.py index f570f18..a654e8f 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -4,12 +4,19 @@ from pydantic import BaseModel -from fuzztypes import Date, DateType, DatetimeType, utils +from fuzztypes import ( + Date, + DateValidator, + DatetimeValidator, + validate_python, + validate_json, +) ny_tz = ZoneInfo("America/New_York") DateY2K = Annotated[ - datetime, DatetimeType(relative_base=datetime(2000, 1, 1), timezone="EST") + datetime, + DatetimeValidator(relative_base=datetime(2000, 1, 1), timezone="EST"), ] @@ -20,14 +27,14 @@ class MyModel(BaseModel): def test_validate_python_date_and_datetime(): data = dict(date="11 July 2012", time="tomorrow 5am") - obj = utils.validate_python(MyModel, data) + obj = 
validate_python(MyModel, data) assert obj.date == date(2012, 7, 11) assert obj.time == datetime(2000, 1, 2, 5, 0, 0, tzinfo=ny_tz) def test_validate_json_date_and_datetime(): json = '{"date": "July 4th", "time": "1 year ago"}' - obj = utils.validate_json(MyModel, json) + obj = validate_json(MyModel, json) today = date.today() year = today.year if (today.month, today.day) >= (7, 4) else today.year - 1 assert obj.date == date(year, 7, 4) @@ -42,31 +49,29 @@ def test_mdy_vs_ymd(): # MDY vs. YMD ordering is context specific # https://dateparser.readthedocs.io/en/latest/settings.html#date-order # - assert utils.validate_python(Date, "02-03-04") == date( - year=2004, month=2, day=3 - ) + assert validate_python(Date, "02-03-04") == date(year=2004, month=2, day=3) - DateEN = DateType(languages=["en"]) - assert utils.validate_python(DateEN, "02-03-04") == date( + DateEN = Annotated[date, DateValidator(languages=["en"])] + assert validate_python(DateEN, "02-03-04") == date( year=2004, month=2, day=3 ) - DateMDY = DateType(date_order="MDY") - assert utils.validate_python(DateMDY, "02-03-04") == date( + DateMDY = Annotated[date, DateValidator(date_order="MDY")] + assert validate_python(DateMDY, "02-03-04") == date( year=2004, month=2, day=3 ) - DateES = DateType(languages=["es"]) - assert utils.validate_python(DateES, "02-03-04") == date( + DateES = Annotated[date, DateValidator(languages=["es"])] + assert validate_python(DateES, "02-03-04") == date( year=2004, month=3, day=2 ) - DateDMY = DateType(date_order="DMY") - assert utils.validate_python(DateDMY, "02-03-04") == date( + DateDMY = Annotated[date, DateValidator(date_order="DMY")] + assert validate_python(DateDMY, "02-03-04") == date( year=2004, month=3, day=2 ) - DateYMD = DateType(date_order="YMD") - assert utils.validate_python(DateYMD, "02-03-04") == date( + DateYMD = Annotated[date, DateValidator(date_order="YMD")] + assert validate_python(DateYMD, "02-03-04") == date( year=2002, month=3, day=4 ) diff --git 
a/tests/test_emoji.py b/tests/test_emoji.py index 43c4a0a..81cdbbb 100644 --- a/tests/test_emoji.py +++ b/tests/test_emoji.py @@ -1,12 +1,12 @@ -from fuzztypes import Emoji, emojis, utils +from fuzztypes import Emoji, emojis, validate_python def test_key_access(): - assert utils.validate_python(Emoji, "balloon") == "🎈" - assert utils.validate_python(Emoji, ":atm_sign:") == "🏧" - assert utils.validate_python(Emoji, "atm sign") == "🏧" - assert utils.validate_python(Emoji, "atm") == "🏧" - assert utils.validate_python(Emoji, "United States") == "🇺🇸" + assert validate_python(Emoji, "balloon") == "🎈" + assert validate_python(Emoji, ":atm_sign:") == "🏧" + assert validate_python(Emoji, "atm sign") == "🏧" + assert validate_python(Emoji, "atm") == "🏧" + assert validate_python(Emoji, "United States") == "🇺🇸" def test_load_emojis(): diff --git a/tests/test_full_model.py b/tests/test_full_model.py index af44482..ad0b669 100644 --- a/tests/test_full_model.py +++ b/tests/test_full_model.py @@ -46,10 +46,10 @@ def test_full_model(): email="John Doe ", emoji="thought bubble", handle="Ian Maurer (@imaurer)", - integer="fifty-five", - inventor="ada luvlace", - person="mr. arthur herbert fonzarelli (fonzie)", - time="5am on Jan 1, 2025", + integer="fifty-five", # type: ignore[arg-type] + inventor="ada luvlace", # type: ignore[arg-type] + person="mr. arthur h. fonzarelli (fonzie)", # type: ignore[arg-type] + time="5am on Jan 1, 2025", # type: ignore[arg-type] zipcode="(Zipcode: 12345-6789)", ) @@ -74,7 +74,7 @@ def test_full_model(): assert obj.inventor == "Ada Lovelace" # human name parser (title, first, middle, last, suffix, nickname) - assert str(obj.person) == "Mr. Arthur Herbert Fonzarelli (fonzie)" + assert str(obj.person) == "Mr. Arthur H. 
Fonzarelli (fonzie)" assert obj.person.short_name == "Arthur Fonzarelli" assert obj.person.nickname == "fonzie" assert obj.person.last == "Fonzarelli" @@ -97,7 +97,7 @@ def test_full_model(): "first": "Arthur", "init_format": "{first} {middle} {last}", "last": "Fonzarelli", - "middle": "Herbert", + "middle": "H.", "name_format": "{title} {first} {middle} {last} {suffix} " "({nickname})", "nickname": "fonzie", @@ -174,7 +174,7 @@ def test_json_schema(): "title": "Handle", "type": "string", }, - 'integer': {'title': 'Integer', 'type': 'integer'}, + "integer": {"title": "Integer", "type": "integer"}, "inventor": {"title": "Inventor", "type": "string"}, "person": {"$ref": "#/$defs/PersonModel"}, "time": {"format": "date-time", "title": "Time", "type": "string"}, diff --git a/tests/test_integer.py b/tests/test_integer.py index b5c48e0..d7a11f6 100644 --- a/tests/test_integer.py +++ b/tests/test_integer.py @@ -1,20 +1,18 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Integer, utils +from fuzztypes import Integer, validate_python def test_convert_number_to_int(): - assert utils.validate_python(Integer, 3) == 3 - assert utils.validate_python(Integer, "three") == 3 - assert utils.validate_python(Integer, "third") == 3 + assert validate_python(Integer, 3) == 3 + assert validate_python(Integer, "three") == 3 + assert validate_python(Integer, "third") == 3 assert ( - utils.validate_python(Integer, "nineteen billion and nineteen") + validate_python(Integer, "nineteen billion and nineteen") == 19_000_000_019 ) assert ( - utils.validate_python( - Integer, "two million three thousand and nineteen" - ) + validate_python(Integer, "two million three thousand and nineteen") == 2_003_019 ) diff --git a/tests/test_person.py b/tests/test_person.py index 0e801c7..02c1434 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from fuzztypes import Person, utils +from fuzztypes import Person, 
validate_python class MyModel(BaseModel): @@ -11,26 +11,26 @@ class MyModel(BaseModel): def test_example(): - obj = MyModel(person="Mr. John (Johnny) Q. Public IV") - assert str(obj.person) == "Mr. John Q. Public IV (Johnny)" - assert obj.person.last_name_first == "Public, John Q." - assert obj.person.short_name == "John Public" - assert obj.person.legal_name == "John Q. Public IV" - assert obj.person.full_name == "Mr. John Q. Public IV (Johnny)" + person = validate_python(Person, "Mr. John (Johnny) Q. Public IV") + assert str(person) == "Mr. John Q. Public IV (Johnny)" + assert person.last_name_first == "Public, John Q." + assert person.short_name == "John Public" + assert person.legal_name == "John Q. Public IV" + assert person.full_name == "Mr. John Q. Public IV (Johnny)" - assert obj.person.initials == "J. Q. P." - assert obj.person.full_initials == "J. Q. P." - assert obj.person.short_initials == "J. P." + assert person.initials == "J. Q. P." + assert person.full_initials == "J. Q. P." + assert person.short_initials == "J. P." - obj2 = MyModel(person=obj.person) - assert obj2.person == obj.person - assert obj2.person.human_name() == obj.person.human_name() + obj2 = MyModel(person=person) + assert obj2.person == person + assert obj2.person.human_name() == person.human_name() assert obj2.optional is None def test_mixed_capitalization_with_validate_python(): - person = utils.validate_python(Person, "shirley maclaine") + person = validate_python(Person, "shirley maclaine") assert person.first == "Shirley" assert person.last == "MacLaine" From 83aad362508104f2fc5000f06aca5f6383d43ca6 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Sat, 23 Mar 2024 20:51:34 -0700 Subject: [PATCH 10/15] more validation refactoring. 
--- src/fuzztypes/__init__.py | 4 +- src/fuzztypes/abstract.py | 10 +---- src/fuzztypes/in_memory.py | 2 - src/fuzztypes/on_disk.py | 2 - src/fuzztypes/regex.py | 74 +++++++++++++++++-------------------- src/fuzztypes/validation.py | 36 ++++++++++++++---- tests/test_full_model.py | 6 ++- tests/test_regex.py | 50 ++++++++++++++----------- 8 files changed, 97 insertions(+), 87 deletions(-) diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index a077c3c..fc2954e 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -35,7 +35,7 @@ from .on_disk import OnDisk # Base Non-Entity Types -from .regex import Regex +from .regex import RegexValidator # Usable Types from .ascii import ASCII @@ -67,7 +67,7 @@ "OnDisk", "Person", "Record", - "Regex", + "RegexValidator", "SSN", "Date", "DateValidator", diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py index 5f4361b..63b72cb 100644 --- a/src/fuzztypes/abstract.py +++ b/src/fuzztypes/abstract.py @@ -53,7 +53,6 @@ def AbstractType( input_type: Type[SupportedType] = str, notfound_mode: const.NotFoundMode = "raise", output_type: Optional[Type[T]] = None, - validator_mode: const.ValidatorMode = "before", ) -> _AbstractTypeMeta: """ Factory function to create a specialized AbstractType, which is a Pydantic @@ -84,14 +83,7 @@ def __get_pydantic_core_schema__( This method is used internally by Pydantic to generate the schema based on the provided validation mode and input/output types. 
""" - validation_function_map: Dict[str, Callable] = { - "before": core_schema.no_info_before_validator_function, - "after": core_schema.no_info_before_validator_function, - "plain": core_schema.no_info_plain_validator_function, - "wrap": core_schema.no_info_wrap_validator_function, - } - - validation_function = validation_function_map[validator_mode] + validation_function = core_schema.no_info_before_validator_function in_schema = handler(input_type) if output_type and output_type != input_type: diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index 61d18c6..cf9c23d 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -195,7 +195,6 @@ def InMemory( notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", - validator_mode: const.ValidatorMode = "before", ): source = InMemoryStorage( source, @@ -213,5 +212,4 @@ def InMemory( EntityType=entity_type, examples=examples, notfound_mode=notfound_mode, - validator_mode=validator_mode, ) diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index 3234410..36f592f 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -240,7 +240,6 @@ def OnDisk( notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", - validator_mode: const.ValidatorMode = "before", ): source = OnDiskStorage( identity, @@ -262,5 +261,4 @@ def OnDisk( examples=examples, input_type=input_type, notfound_mode=notfound_mode, - validator_mode=validator_mode, ) diff --git a/src/fuzztypes/regex.py b/src/fuzztypes/regex.py index f979ec8..e124fa5 100644 --- a/src/fuzztypes/regex.py +++ b/src/fuzztypes/regex.py @@ -1,51 +1,43 @@ import re -from typing import Optional +from typing import Annotated, Optional -from . import Entity, Match, MatchResult, abstract, const +from . 
import FuzzValidator -def Regex( +def RegexValidator( pattern: str, examples: Optional[list] = None, - notfound_mode: const.NotFoundMode = "raise", - validator_mode: const.ValidatorMode = "before", - tiebreaker_mode: const.TiebreakerMode = "raise", ): regex = re.compile(pattern) - def do_lookup(key: str) -> MatchResult: + def do_regex(key: str) -> Optional[str]: matches = regex.findall(key) - match_list = MatchResult() - - for match in matches: - # Create and append Entity for each match found - entity = Entity(value=match) - match_list.append(Match(key=match, entity=entity, is_alias=False)) - - # Leave tiebreaker and error handling to MatchResult.choose - match_list.choose(min_score=0, tiebreaker_mode=tiebreaker_mode) - - return match_list - - return abstract.AbstractType( - do_lookup, - examples=examples, - notfound_mode=notfound_mode, - validator_mode=validator_mode, - ) - - -Email = Regex( - r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", - examples=["user@example.com"], -) - -SSN = Regex( - r"\b\d{3}-\d{2}-\d{4}\b", - examples=["000-00-0000"], -) - -ZipCode = Regex( - r"\b\d{5}(?:-\d{4})?\b", - examples=["12345", "12345-6789"], -) + if len(matches) == 1: + return matches[0] + + return FuzzValidator(do_regex, examples=examples) + + +Email = Annotated[ + str, + RegexValidator( + r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", + examples=["user@example.com"], + ), +] + +SSN = Annotated[ + str, + RegexValidator( + r"\b\d{3}-\d{2}-\d{4}\b", + examples=["000-00-0000"], + ), +] + +ZipCode = Annotated[ + str, + RegexValidator( + r"\b\d{5}(?:-\d{4})?\b", + examples=["12345", "12345-6789"], + ), +] diff --git a/src/fuzztypes/validation.py b/src/fuzztypes/validation.py index 42b2338..ed01237 100644 --- a/src/fuzztypes/validation.py +++ b/src/fuzztypes/validation.py @@ -1,13 +1,15 @@ -"""This module contains related classes and functions for validation.""" - import dataclasses import sys from functools import lru_cache -from typing import Any, Union, Callable, Dict, 
cast +from typing import Any, Union, Callable, Dict, cast, Optional -from pydantic import GetCoreSchemaHandler, TypeAdapter -from pydantic_core import core_schema -from fuzztypes import const +from pydantic import ( + GetCoreSchemaHandler, + GetJsonSchemaHandler, + TypeAdapter, + json_schema, +) +from pydantic_core import CoreSchema, core_schema dataclass_kwargs: Dict[str, Any] @@ -52,7 +54,11 @@ def validate_python(cls: Any, value: Any) -> Any: @dataclasses.dataclass(frozen=True, **slots_true) class FuzzValidator: func: Callable[[Any], Any] - notfound_mode: const.NotFoundMode = "raise" + examples: Optional[list] = None + + def __hash__(self): + attrs = (self.func, tuple(self.examples or ())) + return hash(attrs) def __get_pydantic_core_schema__( self, source_type: Any, handler: GetCoreSchemaHandler @@ -63,3 +69,19 @@ def __get_pydantic_core_schema__( return core_schema.no_info_before_validator_function( func, schema=schema ) + + def __get_pydantic_json_schema__( + self, + schema: CoreSchema, + handler: GetJsonSchemaHandler, + ) -> json_schema.JsonSchemaValue: + """ + Generate the JSON schema for the AbstractType. + + This method is used internally by Pydantic to generate the JSON + schema representation of the AbstractType, including any examples. + """ + schema = handler(schema) + if self.examples is not None: + schema["examples"] = self.examples + return schema diff --git a/tests/test_full_model.py b/tests/test_full_model.py index ad0b669..e0541a0 100644 --- a/tests/test_full_model.py +++ b/tests/test_full_model.py @@ -11,7 +11,7 @@ InMemory, Integer, Person, - Regex, + RegexValidator, ZipCode, flags, ) @@ -23,7 +23,9 @@ Inventor = Annotated[str, InMemory(inventors, search_flag=flags.FuzzSearch)] # custom Regex type for finding twitter handles. 
-Handle = Annotated[str, Regex(r"@\w{1,15}", examples=["@genomoncology"])] +Handle = Annotated[ + str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) +] # define a Pydantic class with 9 fuzzy type attributes diff --git a/tests/test_regex.py b/tests/test_regex.py index 72d5358..b094350 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -1,47 +1,50 @@ -from pydantic_core import PydanticCustomError +from pydantic import ValidationError -from fuzztypes import Email, SSN, ZipCode +from fuzztypes import Email, SSN, ZipCode, validate_python def test_email_regexer(): - assert Email("Jane Doe ") == "jdoe@example.com" - assert Email[""] == "jdoe@example.com" + assert ( + validate_python(Email, "Jane Doe ") + == "jdoe@example.com" + ) + assert validate_python(Email, "") == "jdoe@example.com" try: - assert Email["abc@xyz"] is not None + assert validate_python(Email, "abc@xyz") is not None assert False, "Invalid email did not fail!" - except KeyError: + except ValidationError: pass def test_valid_ssn(): # Value call - assert SSN("Valid SSN: 123-45-6789") == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" # Entity value comparison - assert SSN["Valid SSN: 123-45-6789"].value == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" # Entity equivalence to a value - assert SSN["Valid SSN: 123-45-6789"] == "123-45-6789" + assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789" def test_valid_ssn_with_touching_bounding_chars(): - assert SSN("Valid SSN:123-45-6789.") == "123-45-6789" + assert validate_python(SSN, "Valid SSN:123-45-6789.") == "123-45-6789" def test_invalid_ssn_format(): try: - SSN("Invalid SSN: 123-456-789") + validate_python(SSN, "Invalid SSN: 123-456-789") assert False, "Invalid SSN format was accepted." 
- except PydanticCustomError: + except ValidationError: pass def test_ssn_needs_bounding_spaces(): try: - SSN("SSN text: abc123-45-6789xyz") + validate_python(SSN, "SSN text: abc123-45-6789xyz") assert False, "SSNs require some sort of bounding characters." - except PydanticCustomError: + except ValidationError: pass @@ -49,33 +52,36 @@ def test_multiple_ssns(): # This test depends on how you decide to handle multiple SSNs. multi_ssn_string = "Two SSNs: 123-45-6789 and 987-65-4321" try: - assert SSN(multi_ssn_string) is not None + assert validate_python(SSN, multi_ssn_string) is not None assert False, "Invalid SSN format was accepted." - except PydanticCustomError as e: + except ValidationError: pass def test_valid_zip_code_5_digits(): - assert ZipCode("Postal code: 12345") == "12345" + assert validate_python(ZipCode, "Postal code: 12345") == "12345" def test_valid_zip_code_9_digits(): - assert ZipCode("ZIP:12345-6789") == "12345-6789" + assert validate_python(ZipCode, "ZIP:12345-6789") == "12345-6789" def test_zip_code_within_text(): - assert ZipCode("Send it to 98765-4321, please.") == "98765-4321" + assert ( + validate_python(ZipCode, "Send it to 98765-4321, please.") + == "98765-4321" + ) def test_invalid_zip_code(): try: - ZipCode("Invalid ZIP: 1234") + validate_python(ZipCode, "Invalid ZIP: 1234") assert False, "Invalid ZIP code did not fail." - except PydanticCustomError: + except ValidationError: pass def test_zip_code_with_invalid_four_format(): # Python's re module does not support lookbehinds (? Date: Sat, 23 Mar 2024 23:52:53 -0700 Subject: [PATCH 11/15] Finish refactor. 
--- src/fuzztypes/__init__.py | 15 +- src/fuzztypes/abstract.py | 165 ------------------ src/fuzztypes/in_memory.py | 12 +- src/fuzztypes/language.py | 58 +++--- src/fuzztypes/match.py | 12 -- src/fuzztypes/on_disk.py | 18 +- src/fuzztypes/regex.py | 3 +- src/fuzztypes/storage.py | 31 +++- src/fuzztypes/validation.py | 45 ++++- tests/in_memory/test_in_memory_alias.py | 7 +- tests/in_memory/test_in_memory_fuzz.py | 72 +++++--- tests/in_memory/test_in_memory_name.py | 8 +- .../in_memory/test_in_memory_tags_example.py | 76 +++++--- tests/on_disk/test_on_disk_alias.py | 8 +- tests/on_disk/test_on_disk_name.py | 11 +- tests/test_language.py | 25 +-- tests/test_person.py | 8 +- 17 files changed, 249 insertions(+), 325 deletions(-) delete mode 100644 src/fuzztypes/abstract.py diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index fc2954e..80c4426 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -14,23 +14,21 @@ from . import utils from . import lazy +# Schema +from .entity import Entity, NamedEntity, EntitySource +from .match import Match, MatchResult, Record + # Validation from .validation import ( FuzzValidator, + validate_entity, validate_python, validate_json, get_type_adapter, ) -# Schema -from .entity import Entity, NamedEntity, EntitySource -from .match import Match, MatchResult, Record - -# Hidden Abstract Types -from . import abstract +# Named Entity Storage from . 
import storage - -# Base Named Entity Types from .in_memory import InMemory from .on_disk import OnDisk @@ -81,6 +79,7 @@ "lazy", "logger", "utils", + "validate_entity", "validate_json", "validate_python", ) diff --git a/src/fuzztypes/abstract.py b/src/fuzztypes/abstract.py deleted file mode 100644 index 63b72cb..0000000 --- a/src/fuzztypes/abstract.py +++ /dev/null @@ -1,165 +0,0 @@ -from datetime import date, datetime -from typing import ( - Any, - Callable, - Dict, - Generic, - Optional, - Type, - TypeVar, - Union, -) - -from pydantic import ( - BaseModel, - GetCoreSchemaHandler, - GetJsonSchemaHandler, - json_schema, -) -from pydantic_core import CoreSchema, PydanticCustomError, core_schema - -from fuzztypes import Entity, MatchResult, const - -T = TypeVar("T") - -SupportedType = Union[ - str, float, int, dict, list, date, datetime, BaseModel, T -] - - -class _AbstractTypeMeta(type, Generic[T]): - def __getitem__(cls: Type[T], key: Any) -> Entity[T]: - """ - Get the entity associated with the given key using dictionary-like - access. - - This method allows retrieving the entity using dictionary-like - syntax (e.g., AbstractType[key]). - - If entity found, it is returned. - If entity not found, raise a KeyError based on PydanticCustomError. - """ - try: - return cls.lookup(key) # type: ignore - except PydanticCustomError as err: - raise KeyError(f"Key Error: {key} [{err}]") from err - - -def AbstractType( - lookup_function: Callable[[T], MatchResult], - *, - EntityType: Type[Entity] = Entity, - examples: Optional[list] = None, - input_type: Type[SupportedType] = str, - notfound_mode: const.NotFoundMode = "raise", - output_type: Optional[Type[T]] = None, -) -> _AbstractTypeMeta: - """ - Factory function to create a specialized AbstractType, which is a Pydantic - based type with added fuzzy matching capabilities. - - :param lookup_function: Function to perform the lookup. - :param EntityType: Type of Entity (e.g. NamedEntity) to return. 
- :param examples: Example values used in schema generation. - :param input_type: The underlying Python data type. - :param notfound_mode: 'raise' an error, set 'none', or 'allow' unknown key. - :param output_type: Specify only if different from input_type. - :param validator_mode: Validation mode ('before', 'after', 'plain', 'wrap') - - :return: A specialized AbstractType based on the provided specifications. - """ - - # noinspection PyClassHasNoInit - class _AbstractType(metaclass=_AbstractTypeMeta): - @classmethod - def __get_pydantic_core_schema__( - cls, - source_type: type, - handler: GetCoreSchemaHandler, - ) -> CoreSchema: - """ - Generate the Pydantic core schema for the AbstractType. - - This method is used internally by Pydantic to generate the schema - based on the provided validation mode and input/output types. - """ - validation_function = core_schema.no_info_before_validator_function - in_schema = handler(input_type) - - if output_type and output_type != input_type: - # used for Person where name (str) or Person (BaseModel) used. - out_schema = handler(output_type) - in_schema = core_schema.union_schema([in_schema, out_schema]) - - if notfound_mode == "none": - in_schema = core_schema.nullable_schema(in_schema) - - return validation_function(cls, in_schema) - - @classmethod - def __get_pydantic_json_schema__( - cls, - schema: CoreSchema, - handler: GetJsonSchemaHandler, - ) -> json_schema.JsonSchemaValue: - """ - Generate the JSON schema for the AbstractType. - - This method is used internally by Pydantic to generate the JSON - schema representation of the AbstractType, including any examples. - """ - schema = handler(schema) - if examples is not None: - schema["examples"] = examples - return schema - - def __new__(cls, key: T) -> Optional[T]: # type: ignore - """ - Doesn't create an AbstractType, it's actually a class-level - __call__ function. - - Pydantic core schema logic will pass an additional argument - that can be ignored. 
- - It retrieves the entity associated with the provided key. - If an entity is found, it returns the value of the entity. - If no entity is found, it returns None. - If an exception is raised, it is will not be caught. - """ - entity = cls.lookup(key) - return entity.resolve() if entity else None - - @classmethod - def lookup(cls, key: T) -> Optional[Entity[T]]: - """ - Lookup the entity for the given key. - - This method attempts to find the entity associated with the - provided key. - - If a match is found, it returns the corresponding entity. - - If no match is found, takes action based on the notfound_mode: - "none": returns None (if notfound_mode is "none") - "allow": returns an entity with the key as value - "raise": raises a PydanticCustomError - """ - match_list: MatchResult = lookup_function(key) - - if match_list.choice is not None: - return match_list.entity - - if notfound_mode == "allow": - return EntityType(value=key) - - if notfound_mode == "none": - return None - - msg = "key ({key}) could not be resolved" - ctx: Dict[str, Any] = dict(key=key) - if match_list: - ctx["near"] = [str(m) for m in match_list] - msg += f", closest non-matches = {match_list}" - raise PydanticCustomError("key_not_found", msg, ctx) - - return _AbstractType diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index cf9c23d..0a956ee 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -4,11 +4,11 @@ from pydantic import PositiveInt from fuzztypes import ( + FuzzValidator, Match, MatchResult, NamedEntity, Record, - abstract, const, flags, lazy, @@ -196,20 +196,16 @@ def InMemory( search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", ): - source = InMemoryStorage( + in_memory = InMemoryStorage( source, case_sensitive=case_sensitive, encoder=encoder, fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, + notfound_mode=notfound_mode, search_flag=search_flag, 
tiebreaker_mode=tiebreaker_mode, ) - return abstract.AbstractType( - source, - EntityType=entity_type, - examples=examples, - notfound_mode=notfound_mode, - ) + return FuzzValidator(in_memory, examples=examples) diff --git a/src/fuzztypes/language.py b/src/fuzztypes/language.py index ac10dea..c27c9bc 100644 --- a/src/fuzztypes/language.py +++ b/src/fuzztypes/language.py @@ -1,6 +1,6 @@ import json from enum import Enum -from typing import Optional, List, Iterable, Type +from typing import Annotated, Optional, List, Iterable, Type from pydantic import TypeAdapter @@ -82,27 +82,35 @@ def do_load() -> Iterable[NamedEntity]: return do_load -LanguageName = OnDisk( - "Language", - EntitySource(load_languages(LanguageNamedEntity)), - entity_type=LanguageNamedEntity, - search_flag=flags.AliasSearch, - tiebreaker_mode="lesser", -) - -LanguageCode = OnDisk( - "Language", - EntitySource(load_languages(LanguageCodeNameEntity)), - entity_type=LanguageCodeNameEntity, - search_flag=flags.AliasSearch, - tiebreaker_mode="lesser", -) - -Language = OnDisk( - "Language", - EntitySource(load_languages(LanguageModelNamedEntity)), - entity_type=LanguageModelNamedEntity, - input_type=LanguageModelNamedEntity, - search_flag=flags.AliasSearch, - tiebreaker_mode="lesser", -) +LanguageName = Annotated[ + str, + OnDisk( + "Language", + EntitySource(load_languages(LanguageNamedEntity)), + entity_type=LanguageNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +LanguageCode = Annotated[ + str, + OnDisk( + "Language", + EntitySource(load_languages(LanguageCodeNameEntity)), + entity_type=LanguageCodeNameEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] + +Language = Annotated[ + LanguageNamedEntity, + OnDisk( + "Language", + EntitySource(load_languages(LanguageModelNamedEntity)), + entity_type=LanguageModelNamedEntity, + search_flag=flags.AliasSearch, + tiebreaker_mode="lesser", + ), +] diff --git a/src/fuzztypes/match.py 
b/src/fuzztypes/match.py index b0d9dc1..e89664a 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -50,18 +50,6 @@ def __str__(self): def entity(self): return self.choice is not None and self.choice.entity - def set( - self, - key: Any, - entity: Entity, - is_alias: bool = False, - term: Optional[str] = None, - ): - """If match is a known winner, just set it and forget it.""" - match = Match(key=key, entity=entity, is_alias=is_alias, term=term) - self.choice = match - self.matches.append(match) - def append(self, match: Match): """Add a match to the list of potential matches.""" self.matches.append(match) diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index 36f592f..b25b5af 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -3,11 +3,11 @@ from pydantic import PositiveInt from fuzztypes import ( + FuzzValidator, Match, MatchResult, NamedEntity, Record, - abstract, const, flags, lazy, @@ -21,7 +21,7 @@ class OnDiskStorage(storage.AbstractStorage): def __init__( self, name: str, - source: Iterable[NamedEntity], + source: Iterable, **kwargs, ): super().__init__(source, **kwargs) @@ -226,7 +226,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: def OnDisk( identity: str, - source: Iterable[NamedEntity], + source: Iterable, *, case_sensitive: bool = False, device: Optional[const.DeviceList] = None, @@ -234,14 +234,13 @@ def OnDisk( entity_type: Type[NamedEntity] = NamedEntity, examples: Optional[list] = None, fuzz_scorer: const.FuzzScorer = "token_sort_ratio", - input_type=str, limit: PositiveInt = 10, min_similarity: float = 80.0, notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", ): - source = OnDiskStorage( + on_disk = OnDiskStorage( identity, source, case_sensitive=case_sensitive, @@ -250,15 +249,10 @@ def OnDisk( fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, + 
notfound_mode=notfound_mode, search_flag=search_flag, encoder=encoder, tiebreaker_mode=tiebreaker_mode, ) - return abstract.AbstractType( - source, - EntityType=entity_type, - examples=examples, - input_type=input_type, - notfound_mode=notfound_mode, - ) + return FuzzValidator(on_disk, examples=examples) diff --git a/src/fuzztypes/regex.py b/src/fuzztypes/regex.py index e124fa5..06226d4 100644 --- a/src/fuzztypes/regex.py +++ b/src/fuzztypes/regex.py @@ -12,8 +12,7 @@ def RegexValidator( def do_regex(key: str) -> Optional[str]: matches = regex.findall(key) - if len(matches) == 1: - return matches[0] + return matches[0] if len(matches) == 1 else None return FuzzValidator(do_regex, examples=examples) diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py index fef5c14..a8b8ea7 100644 --- a/src/fuzztypes/storage.py +++ b/src/fuzztypes/storage.py @@ -1,4 +1,6 @@ -from typing import Callable, Iterable, List, Type, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union + +from pydantic_core import PydanticCustomError from fuzztypes import NamedEntity, MatchResult, const, flags, lazy @@ -6,7 +8,7 @@ class AbstractStorage: def __init__( self, - source: Iterable[NamedEntity], + source: Iterable, *, case_sensitive: bool = False, encoder: Union[Callable, str, object] = None, @@ -15,6 +17,7 @@ def __init__( fuzz_scorer: str = "token_sort_ratio", limit: int = 10, min_similarity: float = 80.0, + notfound_mode: const.NotFoundMode = "raise", search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", ): @@ -28,6 +31,7 @@ def __init__( self.entity_type = entity_type self.limit = limit self.min_similarity = min_similarity + self.notfound_mode = notfound_mode self.prepped = False self.search_flag = search_flag self.tiebreaker_mode = tiebreaker_mode @@ -37,14 +41,33 @@ def __init__( self._encoder = encoder self._vect_dimensions = None - def __call__(self, key: str) -> MatchResult: + def __call__(self, 
key: str) -> Optional[Any]: + entity = self[key] + return entity.resolve() if entity else None + + def __getitem__(self, key: str) -> Optional[NamedEntity]: if not self.prepped: self.prepped = True self.prepare() match_list = self.get(key) match_list.choose(self.min_similarity, self.tiebreaker_mode) - return match_list + + if match_list.choice is not None: + return match_list.entity + + if self.notfound_mode == "allow": + return self.entity_type(value=key) + + if self.notfound_mode == "none": + return None + + msg = "key ({key}) could not be resolved" + ctx: Dict[str, Any] = dict(key=key) + if match_list: + ctx["near"] = [str(m) for m in match_list] + msg += f", closest non-matches = {match_list}" + raise PydanticCustomError("key_not_found", msg, ctx) def prepare(self): raise NotImplementedError diff --git a/src/fuzztypes/validation.py b/src/fuzztypes/validation.py index ed01237..70f90cb 100644 --- a/src/fuzztypes/validation.py +++ b/src/fuzztypes/validation.py @@ -1,7 +1,18 @@ import dataclasses +from itertools import chain import sys from functools import lru_cache -from typing import Any, Union, Callable, Dict, cast, Optional +from typing import ( + Any, + Union, + Callable, + Dict, + cast, + Optional, + get_origin, + get_args, +) + from pydantic import ( GetCoreSchemaHandler, @@ -9,13 +20,14 @@ TypeAdapter, json_schema, ) -from pydantic_core import CoreSchema, core_schema +from pydantic_core import CoreSchema, PydanticCustomError, core_schema +from fuzztypes import Entity dataclass_kwargs: Dict[str, Any] slots_true: Dict[str, bool] = {} if sys.version_info >= (3, 10): - slots_true = {"slots": True} + slots_true = {"slots": True} # pragma: no cover @lru_cache(maxsize=None) @@ -48,18 +60,41 @@ def validate_python(cls: Any, value: Any) -> Any: :param value: Python object to validate. :return: Validated Python object. 
""" - return get_type_adapter(cls).validate_python(value) + ta = get_type_adapter(cls) + return ta.validate_python(value) + + +def validate_entity(cls: Any, value: Any) -> Optional[Entity]: + """ + Returns entity from metadata if cls is a FuzzValidator. + + :param cls: Any object + :param value: input value + :return: Entity if validator is an entity source + """ + metadata = get_args(cls) + entity = None + for item in chain([cls], metadata): + if isinstance(item, FuzzValidator): + entity = item[value] + return entity @dataclasses.dataclass(frozen=True, **slots_true) class FuzzValidator: - func: Callable[[Any], Any] + func: Any examples: Optional[list] = None def __hash__(self): attrs = (self.func, tuple(self.examples or ())) return hash(attrs) + def __getitem__(self, key): + try: + return self.func[key] + except PydanticCustomError as err: + raise KeyError(f"Key Error: {key} [{err}]") from err + def __get_pydantic_core_schema__( self, source_type: Any, handler: GetCoreSchemaHandler ) -> core_schema.CoreSchema: diff --git a/tests/in_memory/test_in_memory_alias.py b/tests/in_memory/test_in_memory_alias.py index 319e1e1..1f3947d 100644 --- a/tests/in_memory/test_in_memory_alias.py +++ b/tests/in_memory/test_in_memory_alias.py @@ -1,4 +1,5 @@ import pytest +from typing import Annotated from pydantic import BaseModel, ValidationError from fuzztypes import InMemory, flags @@ -36,7 +37,7 @@ def test_alias_cased_getitem(CasedMythicalFigure): def test_uncased_alias_str(MythicalFigure): class Example(BaseModel): - value: MythicalFigure + value: Annotated[str, MythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -48,7 +49,7 @@ class Example(BaseModel): def test_cased_alias_str(CasedMythicalFigure): class Example(BaseModel): - value: CasedMythicalFigure + value: Annotated[str, CasedMythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -62,7 +63,7 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], 
["a", "b"], ["d", "b"]] - A = InMemory(source, tiebreaker_mode="raise") + A = InMemory(source) assert A["a"].value == "a" try: diff --git a/tests/in_memory/test_in_memory_fuzz.py b/tests/in_memory/test_in_memory_fuzz.py index e2eb89d..c0247f5 100644 --- a/tests/in_memory/test_in_memory_fuzz.py +++ b/tests/in_memory/test_in_memory_fuzz.py @@ -1,31 +1,45 @@ +from typing import Annotated, Optional from pydantic import BaseModel, ValidationError -from fuzztypes import NamedEntity, InMemory, flags - -FruitStr = InMemory( - ["Apple", "Banana"], - search_flag=flags.FuzzSearch, -) -DirectionStr = InMemory( - [ - ("Left", "L"), - ("Right", "R"), - ("Middle", "M"), - ], - search_flag=flags.FuzzSearch, -) -LooseStr = InMemory( - ["A B C", "X Y Z"], - min_similarity=10.0, - limit=1, - search_flag=flags.FuzzSearch, -) -StrictStr = InMemory( - ["A B C", "X Y Z"], - min_similarity=95.0, - limit=1, - search_flag=flags.FuzzSearch, -) +from fuzztypes import NamedEntity, InMemory, flags, validate_python + +FruitStr = Annotated[ + Optional[str], + InMemory( + ["Apple", "Banana"], + search_flag=flags.FuzzSearch, + ), +] + +DirectionStr = Annotated[ + Optional[str], + InMemory( + [ + ("Left", "L"), + ("Right", "R"), + ("Middle", "M"), + ], + search_flag=flags.FuzzSearch, + ), +] +LooseStr = Annotated[ + Optional[str], + InMemory( + ["A B C", "X Y Z"], + min_similarity=10.0, + limit=1, + search_flag=flags.FuzzSearch, + ), +] +StrictStr = Annotated[ + str, + InMemory( + ["A B C", "X Y Z"], + min_similarity=95.0, + limit=1, + search_flag=flags.FuzzSearch, + ), +] class Model(BaseModel): @@ -60,12 +74,12 @@ def test_synonyms(): def test_get_item(): - assert DirectionStr["L"].value == "Left" + assert validate_python(DirectionStr, "L") == "Left" try: - assert DirectionStr["XYZ"] + assert validate_python(DirectionStr, "XYZ") raise AssertionError("Didn't throw KeyError") - except KeyError: + except ValidationError: pass diff --git a/tests/in_memory/test_in_memory_name.py 
b/tests/in_memory/test_in_memory_name.py index 2f2575b..bbaa964 100644 --- a/tests/in_memory/test_in_memory_name.py +++ b/tests/in_memory/test_in_memory_name.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Annotated, Optional from pydantic import BaseModel, ValidationError, Field @@ -31,7 +31,7 @@ def test_namestr_getitem(): def test_uncased_name_str(): class Example(BaseModel): - value: President + value: Annotated[str, President] # exact match assert Example(value="George Washington").value == "George Washington" @@ -42,7 +42,7 @@ class Example(BaseModel): def test_cased_name_str(): class Example(BaseModel): - value: CasedPrez + value: Annotated[str, CasedPrez] # exact match assert Example(value="George Washington").value == "George Washington" @@ -57,7 +57,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(default=None) + value: Annotated[Optional[str], NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/in_memory/test_in_memory_tags_example.py b/tests/in_memory/test_in_memory_tags_example.py index d2e0d4b..c660f87 100644 --- a/tests/in_memory/test_in_memory_tags_example.py +++ b/tests/in_memory/test_in_memory_tags_example.py @@ -10,12 +10,19 @@ https://github.com/quickwit-oss/tantivy-py/issues/20 https://docs.rs/tantivy/latest/tantivy/query/struct.FuzzyTermQuery.html """ -from typing import List +from typing import Annotated, List from pydantic import BaseModel from pytest import fixture -from fuzztypes import EntitySource, InMemory, flags +from fuzztypes import ( + EntitySource, + InMemory, + flags, + validate_entity, + validate_python, + Entity, +) @fixture(scope="session") @@ -32,45 +39,64 @@ def Tag(TagSource): # min_similarity is very low for demo # QRatio used because tags are single "words" (e.g. 
sqlinjection) - return InMemory( - TagSource, - notfound_mode="allow", - search_flag=flags.FuzzSearch, - min_similarity=50.0, - fuzz_scorer="QRatio", - ) + return Annotated[ + str, + InMemory( + TagSource, + notfound_mode="allow", + search_flag=flags.FuzzSearch, + min_similarity=50.0, + fuzz_scorer="QRatio", + ), + ] -def test_fuzzy_tags_priority(Tag): - # exact matches - # priority is topic prevalence, higher wins. - assert Tag["2d"].priority == 3 - assert Tag["3d"].priority == 14 +def test_get_entity_from_annotation(Tag): + entity = validate_entity(Tag, "2d") + assert isinstance(entity, Entity) + assert entity.priority == 3 + + entity = validate_entity(Tag, "3d") + assert isinstance(entity, Entity) + assert entity.priority == 14 + +def test_fuzzy_tags_priority(Tag): # since min_similarity is 50.0, it chooses higher priority - assert Tag("4d") == "3d" + assert validate_python(Tag, "4d") == "3d" # matches because 67% ratio > 50.0 minimum - assert Tag("27d") == "2d" + assert validate_python(Tag, "27d") == "2d" # less than 50% similarity is passed through (notfound_mode="allow") - assert Tag("17d") == "17d" + assert validate_python(Tag, "17d") == "17d" # different - assert Tag("18d") == "i18n" + assert validate_python(Tag, "18d") == "i18n" # todo: collect allowed tags and use for future fuzzy matching - # assert Tag("15d") == "17d" - assert Tag("15d") == "15d" + # assert validate_python(Tag, "15d") == "17d" + assert validate_python(Tag, "15d") == "15d" def test_fuzzy_scoring_edge_cases(Tag): - assert Tag("prompt_injection") == "promptinjection" - assert Tag("promptinjections") == "promptinjection" - assert Tag("prompt injections") == "promptinjection" - + assert validate_python(Tag, "prompt_injection") == "promptinjection" + assert validate_python(Tag, "promptinjections") == "promptinjection" + assert validate_python(Tag, "prompt injections") == "promptinjection" + + +def test_as_a_list_of_tags(TagSource): + Tag = Annotated[ + str, + InMemory( + TagSource, + 
notfound_mode="allow", + search_flag=flags.FuzzSearch, + min_similarity=50.0, + fuzz_scorer="QRatio", + ), + ] -def test_as_a_list_of_tags(Tag): class Post(BaseModel): text: str tags: List[Tag] diff --git a/tests/on_disk/test_on_disk_alias.py b/tests/on_disk/test_on_disk_alias.py index f64b1cc..e889d15 100644 --- a/tests/on_disk/test_on_disk_alias.py +++ b/tests/on_disk/test_on_disk_alias.py @@ -1,3 +1,5 @@ +from typing import Annotated + import pytest from pydantic import BaseModel, ValidationError @@ -37,7 +39,7 @@ def test_alias_cased_getitem(CasedMythicalFigure): def test_uncased_alias_str(MythicalFigure): class Example(BaseModel): - value: MythicalFigure + value: Annotated[str, MythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -49,7 +51,7 @@ class Example(BaseModel): def test_cased_alias_str(CasedMythicalFigure): class Example(BaseModel): - value: CasedMythicalFigure + value: Annotated[str, CasedMythicalFigure] # Exact match assert Example(value="Zeus").value == "Zeus" @@ -63,7 +65,7 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], ["a", "b"], ["d", "b"]] - A = OnDisk("DupeRec", source, tiebreaker_mode="raise") + A = OnDisk("DupeRec", source) assert A["a"].value == "a" try: diff --git a/tests/on_disk/test_on_disk_name.py b/tests/on_disk/test_on_disk_name.py index 414e47d..a45a3e9 100644 --- a/tests/on_disk/test_on_disk_name.py +++ b/tests/on_disk/test_on_disk_name.py @@ -1,8 +1,8 @@ -from typing import Optional +from typing import Annotated, Optional from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, OnDisk, flags +from fuzztypes import NamedEntity, OnDisk, flags, validate_entity names = ["George Washington", "John Adams", "Thomas Jefferson"] President = OnDisk( @@ -34,6 +34,7 @@ def test_namestr_getitem(): entity = NamedEntity(value="Thomas Jefferson") assert President["Thomas Jefferson"] == entity assert President["THOMAS JEFFERSON"] == entity + assert 
validate_entity(President, "Thomas Jefferson") == entity assert CasedPrez["Thomas Jefferson"] == entity try: @@ -48,7 +49,7 @@ def test_namestr_getitem(): def test_uncased_name_str(): class Example(BaseModel): - value: President + value: Annotated[str, President] # exact match assert Example(value="George Washington").value == "George Washington" @@ -59,7 +60,7 @@ class Example(BaseModel): def test_cased_name_str(): class Example(BaseModel): - value: CasedPrez + value: Annotated[str, CasedPrez] # exact match assert Example(value="George Washington").value == "George Washington" @@ -74,7 +75,7 @@ class Example(BaseModel): def test_nullable_name_str(): class Example(BaseModel): - value: Optional[NullPrez] = Field(default=None) + value: Annotated[Optional[str], NullPrez] = Field(default=None) assert Example().model_dump() == {"value": None} assert Example(value="The Rock").model_dump() == {"value": None} diff --git a/tests/test_language.py b/tests/test_language.py index ae5af53..d316d85 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from fuzztypes import Language, LanguageCode, LanguageName +from fuzztypes import Language, LanguageCode, LanguageName, validate_python from fuzztypes.language import load_languages, LanguageType, LanguageScope @@ -18,10 +18,11 @@ class Model(BaseModel): language_name: LanguageName # Test that Language resolves to the complete language object - model = Model(language="English", language_code="en", language_name="ENG") - assert model.language.scope == LanguageScope.INDIVIDUAL - assert model.language.type == LanguageType.LIVING - assert model.model_dump(exclude_defaults=True, mode="json") == { + data = dict(language="English", language_code="en", language_name="ENG") + obj = validate_python(Model, data) + assert obj.language.scope == LanguageScope.INDIVIDUAL + assert obj.language.type == LanguageType.LIVING + assert obj.model_dump(exclude_defaults=True, mode="json") == { 
"language": { "aliases": ["en", "eng"], "alpha_2": "en", @@ -37,17 +38,17 @@ class Model(BaseModel): def test_matching_edge_cases(): # 'En' is a proper name of a language - assert LanguageName("En") == "En" - assert LanguageCode("En") == "enc" + assert validate_python(LanguageName, "En") == "En" + assert validate_python(LanguageCode, "En") == "enc" # 'en' is the alpha2 code for English - assert LanguageName("en") == "English" - assert LanguageCode("en") == "en" + assert validate_python(LanguageName, "en") == "English" + assert validate_python(LanguageCode, "en") == "en" # Bangla is common name for Bengali - assert LanguageName("Bangla") == "Bengali" - assert LanguageCode("Bangla") == "bn" - assert Language("Bangla").model_dump( + assert validate_python(LanguageName, "Bangla") == "Bengali" + assert validate_python(LanguageCode, "Bangla") == "bn" + assert validate_python(Language, "Bangla").model_dump( exclude_defaults=True, mode="json" ) == { "aliases": ["bn", "ben", "Bangla"], diff --git a/tests/test_person.py b/tests/test_person.py index 02c1434..eaf0904 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -36,7 +36,7 @@ def test_mixed_capitalization_with_validate_python(): def test_different_nickname_format_oh_well(): - obj = MyModel(person="Arthur 'The Fonz' Fonzerelli") + obj = validate_python(MyModel, dict(person="Arthur 'The Fonz' Fonzerelli")) assert obj.person.first == "Arthur" assert obj.person.last == "Fonzerelli" assert obj.person.middle == "'the Fonz'" @@ -62,13 +62,15 @@ def test_json_serialization(): def test_value_error(): try: - assert MyModel(person=None).person is None + data: dict = {} + validate_python(MyModel, data) assert False, "Didn't fail as expected." except ValidationError: pass try: - assert MyModel(person=5) + data = dict(person=5) + validate_python(MyModel, data) assert False, "Didn't fail as expected." 
except ValueError: pass From dde6de6d493f9c94cc620a7193559126e08bac82 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Sun, 24 Mar 2024 06:03:42 -0500 Subject: [PATCH 12/15] Finish refactor. --- src/fuzztypes/in_memory.py | 1 + src/fuzztypes/lazy.py | 2 +- src/fuzztypes/match.py | 2 +- src/fuzztypes/person.py | 4 +--- src/fuzztypes/storage.py | 4 +--- src/fuzztypes/validation.py | 15 +++------------ tests/test_person.py | 20 ++++++++++++++++---- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index 0a956ee..c9d954b 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -200,6 +200,7 @@ def InMemory( source, case_sensitive=case_sensitive, encoder=encoder, + entity_type=entity_type, fuzz_scorer=fuzz_scorer, limit=limit, min_similarity=min_similarity, diff --git a/src/fuzztypes/lazy.py b/src/fuzztypes/lazy.py index 93542b6..a8d1516 100644 --- a/src/fuzztypes/lazy.py +++ b/src/fuzztypes/lazy.py @@ -1,7 +1,7 @@ import functools import importlib import os -from typing import Any, List, TypedDict, Callable, Optional, Union +from typing import Any, List, TypedDict, Callable, Optional from fuzztypes import const diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index e89664a..ad93fd3 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional, Iterator, Any, Union, Type, Generator +from typing import List, Tuple, Optional, Any, Union, Type from pydantic import BaseModel, Field diff --git a/src/fuzztypes/person.py b/src/fuzztypes/person.py index df08dfa..0c225b9 100644 --- a/src/fuzztypes/person.py +++ b/src/fuzztypes/person.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from fuzztypes import FuzzValidator, const, lazy +from fuzztypes import FuzzValidator, lazy FULL_NAME = "{title} {first} {middle} {last} {suffix} ({nickname})" SHORT_NAME = "{first} {last}" @@ -102,8 +102,6 @@ def to_person(key) -> 
Optional[PersonModel]: person = key elif isinstance(key, dict): person = PersonModel(**key) - elif key is None: - person = None else: raise ValueError(f"Unexpected key type {type(key)} for {key}.") diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py index a8b8ea7..7f684fc 100644 --- a/src/fuzztypes/storage.py +++ b/src/fuzztypes/storage.py @@ -99,9 +99,7 @@ def vect_dimensions(self): return self._vect_dimensions def encode(self, values: List[str]): - return self.encoder( - values, - ) + return self.encoder(values) # # fuzzy matching diff --git a/src/fuzztypes/validation.py b/src/fuzztypes/validation.py index 70f90cb..26216f9 100644 --- a/src/fuzztypes/validation.py +++ b/src/fuzztypes/validation.py @@ -1,18 +1,8 @@ import dataclasses -from itertools import chain import sys from functools import lru_cache -from typing import ( - Any, - Union, - Callable, - Dict, - cast, - Optional, - get_origin, - get_args, -) - +from itertools import chain +from typing import Any, Dict, Optional, Union, cast, get_args from pydantic import ( GetCoreSchemaHandler, @@ -21,6 +11,7 @@ json_schema, ) from pydantic_core import CoreSchema, PydanticCustomError, core_schema + from fuzztypes import Entity dataclass_kwargs: Dict[str, Any] diff --git a/tests/test_person.py b/tests/test_person.py index eaf0904..432def5 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -35,6 +35,10 @@ def test_mixed_capitalization_with_validate_python(): assert person.last == "MacLaine" +def test_null_person_ok(): + assert validate_python(Optional[Person], None) is None + + def test_different_nickname_format_oh_well(): obj = validate_python(MyModel, dict(person="Arthur 'The Fonz' Fonzerelli")) assert obj.person.first == "Arthur" @@ -44,20 +48,28 @@ def test_different_nickname_format_oh_well(): def test_json_serialization(): - json = '{"person": "Grace Hopper"}' + json = '{"person": "Grace Hopper", "optional": null}' obj = MyModel.model_validate_json(json) assert str(obj.person) == 
"Grace Hopper" + assert obj.optional is None - data = dict(person="grace hopper") + data = dict(person="grace hopper", optional="ava lovelace") obj = MyModel.model_validate(data) assert str(obj.person) == "Grace Hopper" + assert str(obj.optional) == "Ava Lovelace" json = obj.model_dump_json(exclude_defaults=True) - assert json == '{"person":{"first":"Grace","last":"Hopper"}}' + assert ( + json == '{"person":{"first":"Grace","last":"Hopper"},' + '"optional":{"first":"Ava","last":"Lovelace"}}' + ) obj = MyModel.model_validate_json(json) data = obj.model_dump(exclude_defaults=True) - assert data == dict(person=dict(first="Grace", last="Hopper")) + assert data == dict( + person=dict(first="Grace", last="Hopper"), + optional=dict(first="Ava", last="Lovelace"), + ) def test_value_error(): From 9e4bf0137548ca5d96b76dcee80d6b03af2f57a6 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Mon, 25 Mar 2024 09:09:46 -0400 Subject: [PATCH 13/15] Added CHANGELOG for 0.1.0 --- CHANGELOG | 21 +++++++++++++++++++++ tests/test_full_model.py | 23 ++++++++++++----------- 2 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 CHANGELOG diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..c153c23 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,21 @@ +## v0.1.0 (2023-03-25) + +#### Added + - FuzzValidator annotation type for creating custom fuzzy types + - Language, LanguageName, and LanguageCode annotation types + - ASCII, Integer, Date, and Datetime annotation types + - RegexValidator annotation type with examples + - validate_python, validate_json, and validate_entity utility functions + - fuzztypes.logger for logging + - fuzztypes.utils module for utility functions + - Full model usage example and tests + - Tests for new annotation types and utility functions + +#### Changed + - Refactored InMemory and OnDisk to use FuzzValidator + - Refactored Person to use FuzzValidator + - Renamed Regex to RegexValidator + +#### Removed + - abstract.py module and 
AbstractType/AbstractStorage classes + - function.py module and Function annotation type \ No newline at end of file diff --git a/tests/test_full_model.py b/tests/test_full_model.py index e0541a0..cb5fb23 100644 --- a/tests/test_full_model.py +++ b/tests/test_full_model.py @@ -43,17 +43,18 @@ class Fuzzy(BaseModel): def test_full_model(): # create an instance of class Fuzzy - obj = Fuzzy( - ascii="άνθρωπος", - email="John Doe ", - emoji="thought bubble", - handle="Ian Maurer (@imaurer)", - integer="fifty-five", # type: ignore[arg-type] - inventor="ada luvlace", # type: ignore[arg-type] - person="mr. arthur h. fonzarelli (fonzie)", # type: ignore[arg-type] - time="5am on Jan 1, 2025", # type: ignore[arg-type] - zipcode="(Zipcode: 12345-6789)", - ) + data = { + "ascii": "άνθρωπος", + "email": "John Doe ", + "emoji": "thought bubble", + "handle" : "Ian Maurer (@imaurer)", + "integer": "fifty-five", + "inventor": "ada luvlace", + "person": "mr. arthur h. fonzarelli (fonzie)", + "time": "5am on Jan 1, 2025", + "zipcode": "(Zipcode: 12345-6789)", + } + obj = Fuzzy(**data) # test the autocorrecting performed From 64ca172438a7254164b87389be8e9a02d84fda8c Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Mon, 25 Mar 2024 13:15:23 -0400 Subject: [PATCH 14/15] Added CHANGELOG for 0.1.0 --- CHANGELOG | 21 -- CHANGELOG.md | 21 ++ README.md | 256 +++++++++++++----- src/fuzztypes/__init__.py | 20 +- src/fuzztypes/const.py | 2 +- src/fuzztypes/emojis.py | 8 +- src/fuzztypes/in_memory.py | 6 +- src/fuzztypes/language.py | 8 +- src/fuzztypes/on_disk.py | 8 +- src/fuzztypes/regex.py | 13 +- src/fuzztypes/storage.py | 2 +- tests/in_memory/test_in_memory_alias.py | 12 +- tests/in_memory/test_in_memory_fuzz.py | 18 +- tests/in_memory/test_in_memory_name.py | 14 +- tests/in_memory/test_in_memory_similarity.py | 6 +- .../in_memory/test_in_memory_tags_example.py | 8 +- tests/on_disk/test_on_disk_alias.py | 14 +- tests/on_disk/test_on_disk_fuzz.py | 4 +- tests/on_disk/test_on_disk_name.py | 
10 +- tests/on_disk/test_on_disk_semantic.py | 10 +- tests/test_entity.py | 10 +- tests/test_language.py | 19 +- tests/{test_full_model.py => test_readme.py} | 109 ++++++-- 23 files changed, 413 insertions(+), 186 deletions(-) delete mode 100644 CHANGELOG create mode 100644 CHANGELOG.md rename tests/{test_full_model.py => test_readme.py} (67%) diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index c153c23..0000000 --- a/CHANGELOG +++ /dev/null @@ -1,21 +0,0 @@ -## v0.1.0 (2023-03-25) - -#### Added - - FuzzValidator annotation type for creating custom fuzzy types - - Language, LanguageName, and LanguageCode annotation types - - ASCII, Integer, Date, and Datetime annotation types - - RegexValidator annotation type with examples - - validate_python, validate_json, and validate_entity utility functions - - fuzztypes.logger for logging - - fuzztypes.utils module for utility functions - - Full model usage example and tests - - Tests for new annotation types and utility functions - -#### Changed - - Refactored InMemory and OnDisk to use FuzzValidator - - Refactored Person to use FuzzValidator - - Renamed Regex to RegexValidator - -#### Removed - - abstract.py module and AbstractType/AbstractStorage classes - - function.py module and Function annotation type \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d44369f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +## v0.1.0 (2024-03-25) + +The project's typing system was validated using mypy and refactored to follow +Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). 
+ +#### Added + - FuzzValidator annotation type created to simplify design + - validate_python, validate_json, and validate_entity functions added + - Added Language, LanguageName, and LanguageCode usable types + - fuzztypes.logger and fuzztypes.utils module for downloading iso codes + +#### Changed + - Renamed OnDisk to OnDiskValidator + - Renamed InMemory to InMemoryValidator + - Refactored InMemoryValidator and OnDiskValidator to use FuzzValidator + - Refactored Person to use FuzzValidator + - Renamed Regex to RegexValidator + +#### Removed + - abstract.py module and AbstractType class, simplified by FuzzValidator + - function.py module and Function annotation type, replaced by FuzzValidator \ No newline at end of file diff --git a/README.md b/README.md index 3262790..ccebb57 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,6 @@ Designed for simplicity, it provides powerful normalization capabilities (e.g. named entity linking) to ensure structured data is composed of "smart things" not "dumb strings". -*Note: FuzzTypes is currently experimental and there could be breaking -changes to its API over the next few weeks.* ## Getting Started @@ -36,16 +34,20 @@ FuzzTypes expands on the standard data conversions handled by Pydantic and provides a variety of autocorrecting annotation types. ```python +from datetime import datetime +from typing import Annotated + from pydantic import BaseModel + from fuzztypes import ( ASCII, Datetime, Email, Fuzzmoji, - InMemory, + InMemoryValidator, Integer, Person, - Regex, + RegexValidator, ZipCode, flags, ) @@ -53,11 +55,15 @@ from fuzztypes import ( # define a source, see EntitySource for using TSV, CSV, JSONL inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] -# define a named entity type in memory. use OnDisk for larger data sets. -Inventor = InMemory(inventors, search_flag=flags.FuzzSearch) +# define an in-memory validator with fuzz search enabled. 
+Inventor = Annotated[ + str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) +] # custom Regex type for finding twitter handles. -Handle = Regex(r'@\w{1,15}', examples=["@genomoncology"]) +Handle = Annotated[ + str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) +] # define a Pydantic class with 9 fuzzy type attributes class Fuzzy(BaseModel): @@ -105,7 +111,7 @@ assert obj.integer == 55 assert obj.inventor == "Ada Lovelace" # human name parser (title, first, middle, last, suffix, nickname) -assert str(obj.person) == 'Mr. Arthur Herbert Fonzarelli (fonzie)' +assert str(obj.person) == "Mr. Arthur H. Fonzarelli (fonzie)" assert obj.person.short_name == "Arthur Fonzarelli" assert obj.person.nickname == "fonzie" assert obj.person.last == "Fonzarelli" @@ -117,7 +123,27 @@ assert obj.time.isoformat() == "2025-01-01T05:00:00" assert obj.zipcode == "12345-6789" # print JSON on success -print(obj.model_dump_json(indent=4)) +assert obj.model_dump() == { + "ascii": "anthropos", + "email": "jdoe@example.com", + "emoji": "💭", + "handle": "@imaurer", + "integer": 55, + "inventor": "Ada Lovelace", + "person": { + "first": "Arthur", + "init_format": "{first} {middle} {last}", + "last": "Fonzarelli", + "middle": "H.", + "name_format": "{title} {first} {middle} {last} {suffix} " + "({nickname})", + "nickname": "fonzie", + "suffix": "", + "title": "Mr.", + }, + "time": datetime(2025, 1, 1, 5), + "zipcode": "12345-6789", +} ``` Types can also be used outside of Pydantic models to validate and normalize data: @@ -164,19 +190,18 @@ There is a read-only notebook that you can copy and edit to try out FuzzTypes: [https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing](https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing) -## Base Types +## Base Validators -Base types are the fundamental building blocks in FuzzTypes. 
They provide the core functionality and can be used to -create custom annotation types tailored to specific use cases. +Base validators are the building blocks of FuzzTypes that can be used for creating custom "usable types". -| Type | Description | -|------------|--------------------------------------------------------------------------------------------| -| `DateType` | Base type for fuzzy parsing date objects. | -| `Function` | Allows using any function that accepts one value and returns one value for transformation. | -| `InMemory` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. | -| `OnDisk` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | -| `Regex` | Allows matching values using a regular expression pattern. | -| `TimeType` | Base type for fuzzy parsing datetime objects (e.g., "tomorrow at 5am"). | +| Type | Description | +|---------------------|---------------------------------------------------------------------------------------------| +| `DateValidator` | Base date type, pass in arguments such as `date_order`, `strict` and `relative_base`. | +| `FuzzValidator` | Validator class that calls a provided function and handles core and json schema config. | +| `InMemoryValidator` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. | +| `OnDiskValidator` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | +| `RegexValidator` | Regular expression pattern matching base validator. | +| `DatetimeValidator` | Base datetime type, pass in arguments such as `date_order`, `timezone` and `relative_base`. | These base types offer flexibility and extensibility, enabling you to create custom annotation types that suit your specific data validation and normalization requirements. 
@@ -205,10 +230,9 @@ These usable types provide a wide range of commonly needed data validations and easier to work with various data formats and perform tasks like parsing, extraction, and matching. -## Configuring FuzzTypes +## InMemoryValidator and OnDiskValidator Configuration -FuzzTypes provides a set of configuration options that allow you to customize the behavior of the annotation types. -These options can be passed as arguments when creating an instance of a FuzzType. +The InMemory and OnDisk Validator objects work with lists of Entities. The following table describes the available configuration options: @@ -224,11 +248,6 @@ The following table describes the available configuration options: | `notfound_mode` | `Literal["raise", "none", "allow"]` | `"raise"` | The action to take when a matching entity is not found. Available options are "raise" (raises an exception), "none" (returns `None`), and "allow" (returns the input key as the value). | | `search_flag` | `flags.SearchFlag` | `flags.DefaultSearch` | The search strategy to use for finding matches. It is a combination of flags that determine which fields of the `NamedEntity` are considered for matching and whether fuzzy or semantic search is enabled. Available options are defined in the `flags` module. | | `tiebreaker_mode` | `Literal["raise", "lesser", "greater"]` | `"raise"` | The strategy to use for resolving ties when multiple matches have the same similarity score. Available options are "raise" (raises an exception), "lesser" (returns the match with the lower value), and "greater" (returns the match with the greater value). | -| `validator_mode` | `Literal["before"]` | `"before"` | The validation mode to use for Pydantic. Currently, only the "before" mode is fully tested and supported, which resolves the value before validation. | - -These configuration options provide flexibility in tailoring the behavior of FuzzTypes to suit your specific use case. 
-By adjusting these options, you can control aspects such as case sensitivity, device selection, encoding mechanism, -search strategy, similarity thresholds, and more. ## Lazy Dependencies @@ -252,22 +271,22 @@ pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz ``` -| Fuzz Type | Library | License | Purpose | -|------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| -| ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | -| ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | -| Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | -| Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | -| Fuzz | [rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | -| InMemory | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | -| InMemory | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | -| InMemory | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | -| Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | -| OnDisk | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | -| OnDisk | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | -| OnDisk | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | -| OnDisk | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text 
search (FTS) for LanceDB. | -| Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | +| Fuzz Type | Library | License | Purpose | +|-------------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| +| ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | +| ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | +| Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | +| Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | +| Fuzz | [rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | +| InMemoryValidator | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | +| InMemoryValidator | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | +| InMemoryValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | +| Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | +| OnDiskValidator | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | +| OnDiskValidator | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | +| OnDiskValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | +| OnDiskValidator | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text search (FTS) for LanceDB. 
| +| Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | ## Maintainer @@ -283,7 +302,7 @@ offerings. Additional capabilities will soon be added: -- Complete OnDisk [fuzzy string matching](https://github.com/quickwit-oss/tantivy-py/issues/20). +- Complete OnDiskValidator [fuzzy string matching](https://github.com/quickwit-oss/tantivy-py/issues/20). - Reranking models - Hybrid search (linear and reciprocal rank fusion using fuzzy and semantic) - Trie-based autocomplete and aho-corasick search @@ -355,6 +374,7 @@ loading entities from a callable function. Example: ```python +from pathlib import Path from fuzztypes import EntitySource, NamedEntity # Load entities from a CSV file @@ -370,20 +390,24 @@ def load_animals(): animal_source = EntitySource(load_animals) ``` -### InMemory Base Type +### InMemoryValidator Base Type -The `InMemory` base type enables matching entities in memory using +The `InMemoryValidator` base type enables matching entities in memory using exact, alias, fuzzy, or semantic search. It is suitable for small to medium-sized datasets that can fit in memory and provides fast matching capabilities. Example: ```python -from fuzztypes import InMemory, flags +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import InMemoryValidator, flags # Create a custom annotation type for matching fruits fruits = ["Apple", "Banana", "Orange"] -Fruit = InMemory(fruits, search_flag=flags.FuzzSearch) +Fruit = Annotated[ + str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) +] class MyModel(BaseModel): fruit: Fruit @@ -392,53 +416,86 @@ model = MyModel(fruit="appel") assert model.fruit == "Apple" ``` -### OnDisk Base Type +### OnDiskValidator Base Type -The `OnDisk` base type performs matching entities stored on disk +The `OnDiskValidator` base type performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. 
It leverages the LanceDB library for efficient storage and retrieval of entities. -`OnDisk` is recommended for large datasets that cannot fit in memory. +`OnDiskValidator` is recommended for large datasets that cannot fit in memory. Example: ```python -from fuzztypes import OnDisk, flags +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import OnDiskValidator -# Create a custom annotation type for matching countries -countries = ["United States", "United Kingdom", "Canada"] -Country = OnDisk("Country", countries, search_flag=flags.FuzzSearch) +# Create a custom annotation type for matching countries stored on disk +countries = [ + ("United States", "US"), + ("United Kingdom", "UK"), + ("Canada", "CA"), +] +Country = Annotated[str, OnDiskValidator("Country", countries)] class MyModel(BaseModel): country: Country -model = MyModel(country="USA") -assert model.country == "United States" +assert MyModel(country="Canada").country == "Canada" +assert MyModel(country="US").country == "United States" ``` ### DateType and TimeType -The `DateType` and `TimeType` base types provide fuzzy parsing +The `DateValidator` and `DatetimeValidator` base types provide fuzzy parsing capabilities for date and datetime objects, respectively. They allow you to define flexible date and time formats and perform parsing based on specified settings such as date order, timezone, and relative base. 
+Example: + +```python +from datetime import date, datetime +from pydantic import BaseModel +from typing import Annotated +from fuzztypes import DateValidator, DatetimeValidator + +MyDate = Annotated[date, DateValidator(date_order="MDY")] +MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] + +class MyModel(BaseModel): + date: MyDate + time: MyTime + +model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") +assert model.date.isoformat() == "2023-01-01" +assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" +``` + + +### FuzzValidator + +The `FuzzValidator` is the base of the fuzztypes typing system. +It can be used directly to wrap any python function. + Example: ```python -from fuzztypes import DateType, DatetimeType +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import FuzzValidator -# Create custom annotation types for parsing dates and times -Date = DateType(date_order="MDY") -Time = DatetimeType(timezone="UTC") +# Create a custom annotation type that converts a value to uppercase +UpperCase = Annotated[str, FuzzValidator(str.upper)] class MyModel(BaseModel): - date: Date - time: Time + name: UpperCase -model = MyModel(date="4/20/2023", time="10:30 PM") -print(model.date) # Output: datetime.date(2023, 4, 20) -print(model.time) # Output: datetime.datetime(2023, 4, 20, 22, 30, tzinfo=) +model = MyModel(name="john") +assert model.name == "JOHN" ``` + + ### Regex The `Regex` base type allows matching values using a regular @@ -447,15 +504,68 @@ validate and extract specific patterns from input values. 
Example: ```python -from fuzztypes import Regex +from typing import Annotated +from pydantic import BaseModel +from fuzztypes import RegexValidator # Create a custom annotation type for matching email addresses -Email = Regex(r"[\w\.-]+@[\w\.-]+\.\w+") +IPAddress = Annotated[ + str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") +] class MyModel(BaseModel): - email: Email + ip_address: IPAddress -model = MyModel(email="john.doe@example.com") -assert model.email == "john.doe@example.com" +model = MyModel(ip_address="My internet IP address is 192.168.127.12") +assert model.ip_address == "192.168.127.12" ``` +### Languages + +Languages are loaded from the [Debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes/) project. + +Languages are resolved using their preferred, common, inverted, bibliographic name, or 2 or 3 letter alpha code. + +Languages can be included as a string name (LanguageName), string code (LanguageCode) or full language object. + +The preferred code is the 2 letter version and will be used if available. Otherwise, the 3 letter alpha code is used. 
+ +Example: + +```python +from pydantic import BaseModel +from fuzztypes import ( + Language, + LanguageName, + LanguageCode, + LanguageScope, + LanguageType, + LanguageNamedEntity, + validate_python, +) +class Model(BaseModel): + language_code: LanguageCode + language_name: LanguageName + language: Language + +# Test that Language resolves to the complete language object +data = dict(language_code="en", language="English", language_name="ENG") +obj = validate_python(Model, data) +assert obj.language_code == "en" +assert obj.language_name == "English" +assert obj.language.scope == LanguageScope.INDIVIDUAL +assert obj.language.type == LanguageType.LIVING +assert isinstance(obj.language, LanguageNamedEntity) +assert obj.model_dump(exclude_defaults=True, mode="json") == { + "language": { + "aliases": ["en", "eng"], + "alpha_2": "en", + "alpha_3": "eng", + "scope": "I", + "type": "L", + "value": "English", + }, + "language_code": "en", + "language_name": "English", +} +``` \ No newline at end of file diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index 80c4426..cbf41ef 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -29,8 +29,8 @@ # Named Entity Storage from . 
import storage -from .in_memory import InMemory -from .on_disk import OnDisk +from .in_memory import InMemoryValidator +from .on_disk import OnDiskValidator # Base Non-Entity Types from .regex import RegexValidator @@ -40,7 +40,14 @@ from .date import Date, DateValidator, Datetime, DatetimeValidator from .emojis import Emoji, Fuzzmoji, Vibemoji from .integer import Integer -from .language import Language, LanguageName, LanguageCode +from .language import ( + Language, + LanguageCode, + LanguageName, + LanguageNamedEntity, + LanguageScope, + LanguageType, +) from .person import Person from .regex import Email, SSN, ZipCode @@ -54,15 +61,18 @@ "EntitySource", "Fuzzmoji", "FuzzValidator", - "InMemory", + "InMemoryValidator", "Integer", "Language", "LanguageCode", "LanguageName", + "LanguageNamedEntity", + "LanguageScope", + "LanguageType", "Match", "MatchResult", "NamedEntity", - "OnDisk", + "OnDiskValidator", "Person", "Record", "RegexValidator", diff --git a/src/fuzztypes/const.py b/src/fuzztypes/const.py index ad04e66..2728454 100644 --- a/src/fuzztypes/const.py +++ b/src/fuzztypes/const.py @@ -4,7 +4,7 @@ # Home directory of fuzztypes library. FuzzHome = "~/.local/fuzztypes/" FuzzHome = os.path.expanduser(os.environ.get("FUZZTYPES_HOME", FuzzHome)) -OnDiskPath = os.path.join(FuzzHome, "on_disk") +StoredValidatorPath = os.path.join(FuzzHome, "on_disk") DownloadsPath = os.path.join(FuzzHome, "downloads") # Default encoder to use when generating semantic embeddings. 
diff --git a/src/fuzztypes/emojis.py b/src/fuzztypes/emojis.py index 0246655..3e2c09a 100644 --- a/src/fuzztypes/emojis.py +++ b/src/fuzztypes/emojis.py @@ -2,7 +2,7 @@ from typing import Annotated, List from pydantic import TypeAdapter -from fuzztypes import NamedEntity, EntitySource, OnDisk, flags, lazy +from fuzztypes import NamedEntity, EntitySource, OnDiskValidator, flags, lazy def load_emoji_entities() -> List[NamedEntity]: @@ -23,7 +23,7 @@ def load_emoji_entities() -> List[NamedEntity]: Emoji = Annotated[ str, - OnDisk( + OnDiskValidator( "Emoji", EmojiSource, search_flag=flags.AliasSearch, @@ -33,7 +33,7 @@ def load_emoji_entities() -> List[NamedEntity]: Fuzzmoji = Annotated[ str, - OnDisk( + OnDiskValidator( "Fuzzmoji", EmojiSource, search_flag=flags.FuzzSearch, @@ -45,7 +45,7 @@ def load_emoji_entities() -> List[NamedEntity]: Vibemoji = Annotated[ str, - OnDisk( + OnDiskValidator( "Vibemoji", EmojiSource, search_flag=flags.SemanticSearch, diff --git a/src/fuzztypes/in_memory.py b/src/fuzztypes/in_memory.py index c9d954b..63acb3e 100644 --- a/src/fuzztypes/in_memory.py +++ b/src/fuzztypes/in_memory.py @@ -16,7 +16,7 @@ ) -class InMemoryStorage(storage.AbstractStorage): +class InMemoryValidatorStorage(storage.AbstractStorage): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -182,7 +182,7 @@ def find_knn(self, key: str) -> tuple: return k_nearest_indices, top_k_scores -def InMemory( +def InMemoryValidator( source: Iterable, *, case_sensitive: bool = False, @@ -196,7 +196,7 @@ def InMemory( search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", ): - in_memory = InMemoryStorage( + in_memory = InMemoryValidatorStorage( source, case_sensitive=case_sensitive, encoder=encoder, diff --git a/src/fuzztypes/language.py b/src/fuzztypes/language.py index c27c9bc..937bec5 100644 --- a/src/fuzztypes/language.py +++ b/src/fuzztypes/language.py @@ -4,7 +4,7 @@ from pydantic import TypeAdapter -from 
fuzztypes import EntitySource, NamedEntity, OnDisk, flags, utils +from fuzztypes import EntitySource, NamedEntity, OnDiskValidator, flags, utils class LanguageScope(Enum): @@ -84,7 +84,7 @@ def do_load() -> Iterable[NamedEntity]: LanguageName = Annotated[ str, - OnDisk( + OnDiskValidator( "Language", EntitySource(load_languages(LanguageNamedEntity)), entity_type=LanguageNamedEntity, @@ -95,7 +95,7 @@ def do_load() -> Iterable[NamedEntity]: LanguageCode = Annotated[ str, - OnDisk( + OnDiskValidator( "Language", EntitySource(load_languages(LanguageCodeNameEntity)), entity_type=LanguageCodeNameEntity, @@ -106,7 +106,7 @@ def do_load() -> Iterable[NamedEntity]: Language = Annotated[ LanguageNamedEntity, - OnDisk( + OnDiskValidator( "Language", EntitySource(load_languages(LanguageModelNamedEntity)), entity_type=LanguageModelNamedEntity, diff --git a/src/fuzztypes/on_disk.py b/src/fuzztypes/on_disk.py index b25b5af..46ee3a4 100644 --- a/src/fuzztypes/on_disk.py +++ b/src/fuzztypes/on_disk.py @@ -17,7 +17,7 @@ accelerators = {"cuda", "mps"} -class OnDiskStorage(storage.AbstractStorage): +class StoredValidatorStorage(storage.AbstractStorage): def __init__( self, name: str, @@ -34,7 +34,7 @@ def __init__( def conn(self) -> Any: if self._conn is None: lancedb = lazy.lazy_import("lancedb") - self._conn = lancedb.connect(const.OnDiskPath) + self._conn = lancedb.connect(const.StoredValidatorPath) return self._conn @property @@ -224,7 +224,7 @@ def run_query(self, key, where=None, vector=None) -> List[Match]: return match_list -def OnDisk( +def OnDiskValidator( identity: str, source: Iterable, *, @@ -240,7 +240,7 @@ def OnDisk( search_flag: flags.SearchFlag = flags.DefaultSearch, tiebreaker_mode: const.TiebreakerMode = "raise", ): - on_disk = OnDiskStorage( + on_disk = StoredValidatorStorage( identity, source, case_sensitive=case_sensitive, diff --git a/src/fuzztypes/regex.py b/src/fuzztypes/regex.py index 06226d4..a4821fa 100644 --- a/src/fuzztypes/regex.py +++ 
b/src/fuzztypes/regex.py @@ -10,9 +10,18 @@ def RegexValidator( ): regex = re.compile(pattern) - def do_regex(key: str) -> Optional[str]: + def do_regex(key: str) -> str: matches = regex.findall(key) - return matches[0] if len(matches) == 1 else None + if len(matches) == 1: + return matches[0] + elif len(matches) > 1: + raise ValueError( + f"Multiple matches found for pattern '{pattern}' in '{key}'" + ) + else: + raise ValueError( + f"No matches found for pattern '{pattern}' in '{key}'" + ) return FuzzValidator(do_regex, examples=examples) diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py index 7f684fc..3e3484e 100644 --- a/src/fuzztypes/storage.py +++ b/src/fuzztypes/storage.py @@ -11,9 +11,9 @@ def __init__( source: Iterable, *, case_sensitive: bool = False, + device: const.DeviceList = "cpu", encoder: Union[Callable, str, object] = None, entity_type: Type[NamedEntity] = NamedEntity, - device: const.DeviceList = "cpu", fuzz_scorer: str = "token_sort_ratio", limit: int = 10, min_similarity: float = 80.0, diff --git a/tests/in_memory/test_in_memory_alias.py b/tests/in_memory/test_in_memory_alias.py index 1f3947d..6aa884c 100644 --- a/tests/in_memory/test_in_memory_alias.py +++ b/tests/in_memory/test_in_memory_alias.py @@ -2,17 +2,17 @@ from typing import Annotated from pydantic import BaseModel, ValidationError -from fuzztypes import InMemory, flags +from fuzztypes import InMemoryValidator, flags @pytest.fixture(scope="session") def MythicalFigure(MythSource): - return InMemory(MythSource, search_flag=flags.AliasSearch) + return InMemoryValidator(MythSource, search_flag=flags.AliasSearch) @pytest.fixture(scope="session") def CasedMythicalFigure(MythSource): - return InMemory( + return InMemoryValidator( MythSource, search_flag=flags.AliasSearch, case_sensitive=True, @@ -63,7 +63,7 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], ["a", "b"], ["d", "b"]] - A = InMemory(source) + A = InMemoryValidator(source) assert 
A["a"].value == "a" try: @@ -76,8 +76,8 @@ def test_duplicate_records(): "100.0], b => d [100.0]]'" ) - A = InMemory(source, tiebreaker_mode="lesser") + A = InMemoryValidator(source, tiebreaker_mode="lesser") assert A["b"].value == "a" - A = InMemory(source, tiebreaker_mode="greater") + A = InMemoryValidator(source, tiebreaker_mode="greater") assert A["b"].value == "d" diff --git a/tests/in_memory/test_in_memory_fuzz.py b/tests/in_memory/test_in_memory_fuzz.py index c0247f5..bae3277 100644 --- a/tests/in_memory/test_in_memory_fuzz.py +++ b/tests/in_memory/test_in_memory_fuzz.py @@ -1,11 +1,11 @@ from typing import Annotated, Optional from pydantic import BaseModel, ValidationError -from fuzztypes import NamedEntity, InMemory, flags, validate_python +from fuzztypes import NamedEntity, InMemoryValidator, flags, validate_python FruitStr = Annotated[ Optional[str], - InMemory( + InMemoryValidator( ["Apple", "Banana"], search_flag=flags.FuzzSearch, ), @@ -13,7 +13,7 @@ DirectionStr = Annotated[ Optional[str], - InMemory( + InMemoryValidator( [ ("Left", "L"), ("Right", "R"), @@ -24,7 +24,7 @@ ] LooseStr = Annotated[ Optional[str], - InMemory( + InMemoryValidator( ["A B C", "X Y Z"], min_similarity=10.0, limit=1, @@ -33,7 +33,7 @@ ] StrictStr = Annotated[ str, - InMemory( + InMemoryValidator( ["A B C", "X Y Z"], min_similarity=95.0, limit=1, @@ -126,7 +126,7 @@ def test_with_priority(): assert sorted(entities)[1].value == "WP1" # validate that priority wins - WithPriority = InMemory( + WithPriority = InMemoryValidator( entities, min_similarity=65.0, search_flag=flags.FuzzSearch, @@ -136,7 +136,7 @@ def test_with_priority(): def test_without_tiebreaker(): entities = ["NT1", "NT2", "NT3"] - WithoutPriority = InMemory( + WithoutPriority = InMemoryValidator( entities, min_similarity=65.0, search_flag=flags.FuzzSearch, @@ -149,7 +149,7 @@ def test_without_tiebreaker(): def test_with_lesser_tiebreaker(): entities = ["NT1", "NT2", "NT3"] - LesserTiebreak = InMemory( + 
LesserTiebreak = InMemoryValidator( entities, min_similarity=65, tiebreaker_mode="lesser", @@ -160,7 +160,7 @@ def test_with_lesser_tiebreaker(): def test_with_greater_tiebreaker(): entities = ["NT1", "NT2", "NT3", "XX5"] - GreaterTiebreak = InMemory( + GreaterTiebreak = InMemoryValidator( entities, min_similarity=0, tiebreaker_mode="greater", diff --git a/tests/in_memory/test_in_memory_name.py b/tests/in_memory/test_in_memory_name.py index bbaa964..7a3daf8 100644 --- a/tests/in_memory/test_in_memory_name.py +++ b/tests/in_memory/test_in_memory_name.py @@ -2,13 +2,17 @@ from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, InMemory, flags +from fuzztypes import NamedEntity, InMemoryValidator, flags names = ["George Washington", "John Adams", "Thomas Jefferson"] -President = InMemory(names, search_flag=flags.NameSearch) -CasedPrez = InMemory(names, case_sensitive=True, search_flag=flags.NameSearch) -NullPrez = InMemory(names, notfound_mode="none", search_flag=flags.NameSearch) -AllowPrez = InMemory( +President = InMemoryValidator(names, search_flag=flags.NameSearch) +CasedPrez = InMemoryValidator( + names, case_sensitive=True, search_flag=flags.NameSearch +) +NullPrez = InMemoryValidator( + names, notfound_mode="none", search_flag=flags.NameSearch +) +AllowPrez = InMemoryValidator( names, notfound_mode="allow", search_flag=flags.NameSearch ) diff --git a/tests/in_memory/test_in_memory_similarity.py b/tests/in_memory/test_in_memory_similarity.py index 9ad8ea2..e96671c 100644 --- a/tests/in_memory/test_in_memory_similarity.py +++ b/tests/in_memory/test_in_memory_similarity.py @@ -1,13 +1,15 @@ import pytest from fuzztypes import flags -from fuzztypes.in_memory import InMemoryStorage +from fuzztypes.in_memory import InMemoryValidatorStorage from fuzztypes.lazy import create_reranker @pytest.fixture(scope="session") def EmotionMemoryStorage(EmotionSource): - storage = InMemoryStorage(EmotionSource, search_flag=flags.SemanticSearch) + 
storage = InMemoryValidatorStorage( + EmotionSource, search_flag=flags.SemanticSearch + ) storage.prepare() return storage diff --git a/tests/in_memory/test_in_memory_tags_example.py b/tests/in_memory/test_in_memory_tags_example.py index c660f87..9721494 100644 --- a/tests/in_memory/test_in_memory_tags_example.py +++ b/tests/in_memory/test_in_memory_tags_example.py @@ -5,7 +5,7 @@ Collected tags from his website here: https://simonwillison.net/tags/ -Future Goal: Move to OnDisk implementation with NotFound=Allow where the +Future Goal: Move to OnDiskValidator implementation with NotFound=Allow where the tags are added to the database incrementally for future fuzzy matching. https://github.com/quickwit-oss/tantivy-py/issues/20 https://docs.rs/tantivy/latest/tantivy/query/struct.FuzzyTermQuery.html @@ -17,7 +17,7 @@ from fuzztypes import ( EntitySource, - InMemory, + InMemoryValidator, flags, validate_entity, validate_python, @@ -41,7 +41,7 @@ def Tag(TagSource): return Annotated[ str, - InMemory( + InMemoryValidator( TagSource, notfound_mode="allow", search_flag=flags.FuzzSearch, @@ -88,7 +88,7 @@ def test_fuzzy_scoring_edge_cases(Tag): def test_as_a_list_of_tags(TagSource): Tag = Annotated[ str, - InMemory( + InMemoryValidator( TagSource, notfound_mode="allow", search_flag=flags.FuzzSearch, diff --git a/tests/on_disk/test_on_disk_alias.py b/tests/on_disk/test_on_disk_alias.py index e889d15..3289cc5 100644 --- a/tests/on_disk/test_on_disk_alias.py +++ b/tests/on_disk/test_on_disk_alias.py @@ -3,17 +3,19 @@ import pytest from pydantic import BaseModel, ValidationError -from fuzztypes import OnDisk, flags +from fuzztypes import OnDiskValidator, flags @pytest.fixture(scope="session") def MythicalFigure(MythSource): - return OnDisk("MythicalFigure", MythSource, search_flag=flags.AliasSearch) + return OnDiskValidator( + "MythicalFigure", MythSource, search_flag=flags.AliasSearch + ) @pytest.fixture(scope="session") def CasedMythicalFigure(MythSource): - return OnDisk( + 
return OnDiskValidator( "CasedMythicalFigure", MythSource, search_flag=flags.AliasSearch, @@ -65,7 +67,7 @@ class Example(BaseModel): def test_duplicate_records(): source = [["c", "b"], ["a", "b"], ["d", "b"]] - A = OnDisk("DupeRec", source) + A = OnDiskValidator("DupeRec", source) assert A["a"].value == "a" try: @@ -78,8 +80,8 @@ def test_duplicate_records(): "100.0], b => d [100.0]]'" ) - A = OnDisk("DupeRec", source, tiebreaker_mode="lesser") + A = OnDiskValidator("DupeRec", source, tiebreaker_mode="lesser") assert A["b"].value == "a" - A = OnDisk("DupeRec", source, tiebreaker_mode="greater") + A = OnDiskValidator("DupeRec", source, tiebreaker_mode="greater") assert A["b"].value == "d" diff --git a/tests/on_disk/test_on_disk_fuzz.py b/tests/on_disk/test_on_disk_fuzz.py index 8895506..ba5b0ed 100644 --- a/tests/on_disk/test_on_disk_fuzz.py +++ b/tests/on_disk/test_on_disk_fuzz.py @@ -16,7 +16,9 @@ def test_tantivy(): schema = schema_builder.build() # create the index - path = os.path.join(const.OnDiskPath, "Fuzzmoji.lance/_indices/tantivy") + path = os.path.join( + const.StoredValidatorPath, "Fuzzmoji.lance/_indices/tantivy" + ) index = tantivy.Index(schema, path=path) searcher = index.searcher() diff --git a/tests/on_disk/test_on_disk_name.py b/tests/on_disk/test_on_disk_name.py index a45a3e9..7578e3f 100644 --- a/tests/on_disk/test_on_disk_name.py +++ b/tests/on_disk/test_on_disk_name.py @@ -2,27 +2,27 @@ from pydantic import BaseModel, ValidationError, Field -from fuzztypes import NamedEntity, OnDisk, flags, validate_entity +from fuzztypes import NamedEntity, OnDiskValidator, flags, validate_entity names = ["George Washington", "John Adams", "Thomas Jefferson"] -President = OnDisk( +President = OnDiskValidator( "President", names, search_flag=flags.NameSearch, ) -CasedPrez = OnDisk( +CasedPrez = OnDiskValidator( "CasedPrez", names, case_sensitive=True, search_flag=flags.NameSearch, ) -NullPrez = OnDisk( +NullPrez = OnDiskValidator( "NullPrez", names, 
notfound_mode="none", search_flag=flags.NameSearch, ) -AllowPrez = OnDisk( +AllowPrez = OnDiskValidator( "AllowPrez", names, notfound_mode="allow", diff --git a/tests/on_disk/test_on_disk_semantic.py b/tests/on_disk/test_on_disk_semantic.py index 3e05f4a..f4e8e4b 100644 --- a/tests/on_disk/test_on_disk_semantic.py +++ b/tests/on_disk/test_on_disk_semantic.py @@ -5,21 +5,21 @@ @pytest.fixture(scope="session") -def EmotionOnDiskStorage(EmotionSource): - storage = on_disk.OnDiskStorage( +def EmotionStoredValidatorStorage(EmotionSource): + storage = on_disk.StoredValidatorStorage( "Emotions", EmotionSource, search_flag=flags.SemanticSearch ) storage.prepare(force_drop_table=True) return storage -def test_check_storage_directly(EmotionOnDiskStorage): - matches = EmotionOnDiskStorage.get("happiness") +def test_check_storage_directly(EmotionStoredValidatorStorage): + matches = EmotionStoredValidatorStorage.get("happiness") assert len(matches) == 1 assert matches[0].entity.value == "Happiness" assert matches[0].score == 100.0 - matches = EmotionOnDiskStorage.get("scared") + matches = EmotionStoredValidatorStorage.get("scared") assert len(matches) == 10 assert matches[0].entity.value == "Fear" assert matches[0].score == pytest.approx(91.23) diff --git a/tests/test_entity.py b/tests/test_entity.py index 352a043..3fc6ed8 100644 --- a/tests/test_entity.py +++ b/tests/test_entity.py @@ -1,4 +1,4 @@ -from fuzztypes import NamedEntity, InMemory, EntitySource +from fuzztypes import NamedEntity, InMemoryValidator, EntitySource def test_entity_conv(): @@ -47,7 +47,7 @@ def test_meta_edge_cases(): def test_csv_load(EmojiSource): - Emoji = InMemory(EmojiSource) + Emoji = InMemoryValidator(EmojiSource) assert Emoji["happy"].value == "happy" assert Emoji["🎉"].value == "party" assert Emoji["party"].rank < Emoji["celebrate"].rank @@ -56,13 +56,13 @@ def test_csv_load(EmojiSource): def test_jsonl_load_animal(AnimalSource): assert AnimalSource[0].value == "Dog" - AnimalStr = 
InMemory(AnimalSource) + AnimalStr = InMemoryValidator(AnimalSource) assert AnimalStr["dog"] == AnimalSource[0] assert AnimalStr["Bird of prey"].value == "Eagle" def test_jsonl_label_source(FruitSource): - FruitStr = InMemory( + FruitStr = InMemoryValidator( FruitSource, case_sensitive=True, notfound_mode="none", @@ -72,7 +72,7 @@ def test_jsonl_label_source(FruitSource): def test_tsv_load(MythSource): - Myth = InMemory(MythSource) + Myth = InMemoryValidator(MythSource) assert Myth["Pallas"].value == "Athena" assert Myth["Jupiter"].value == "Zeus" diff --git a/tests/test_language.py b/tests/test_language.py index d316d85..ec3f56f 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -1,7 +1,15 @@ from pydantic import BaseModel -from fuzztypes import Language, LanguageCode, LanguageName, validate_python -from fuzztypes.language import load_languages, LanguageType, LanguageScope +from fuzztypes import ( + Language, + LanguageCode, + LanguageName, + validate_python, + LanguageNamedEntity, + LanguageScope, + LanguageType, +) +from fuzztypes.language import load_languages def test_load_languages(): @@ -13,15 +21,18 @@ def test_load_languages(): def test_language_model_resolution(): class Model(BaseModel): - language: Language language_code: LanguageCode language_name: LanguageName + language: Language # Test that Language resolves to the complete language object - data = dict(language="English", language_code="en", language_name="ENG") + data = dict(language_code="en", language="English", language_name="ENG") obj = validate_python(Model, data) + assert obj.language_code == "en" + assert obj.language_name == "English" assert obj.language.scope == LanguageScope.INDIVIDUAL assert obj.language.type == LanguageType.LIVING + assert isinstance(obj.language, LanguageNamedEntity) assert obj.model_dump(exclude_defaults=True, mode="json") == { "language": { "aliases": ["en", "eng"], diff --git a/tests/test_full_model.py b/tests/test_readme.py similarity index 67% 
rename from tests/test_full_model.py rename to tests/test_readme.py index cb5fb23..445dd77 100644 --- a/tests/test_full_model.py +++ b/tests/test_readme.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import date, datetime from typing import Annotated from pydantic import BaseModel @@ -8,7 +8,7 @@ Datetime, Email, Fuzzmoji, - InMemory, + InMemoryValidator, Integer, Person, RegexValidator, @@ -16,11 +16,14 @@ flags, ) + # define a source, see EntitySource for using TSV, CSV, JSONL inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] -# define a named entity type in memory. use OnDisk for larger data sets. -Inventor = Annotated[str, InMemory(inventors, search_flag=flags.FuzzSearch)] +# define an in-memory validator with fuzz search enabled. +Inventor = Annotated[ + str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) +] # custom Regex type for finding twitter handles. Handle = Annotated[ @@ -43,18 +46,17 @@ class Fuzzy(BaseModel): def test_full_model(): # create an instance of class Fuzzy - data = { - "ascii": "άνθρωπος", - "email": "John Doe ", - "emoji": "thought bubble", - "handle" : "Ian Maurer (@imaurer)", - "integer": "fifty-five", - "inventor": "ada luvlace", - "person": "mr. arthur h. fonzarelli (fonzie)", - "time": "5am on Jan 1, 2025", - "zipcode": "(Zipcode: 12345-6789)", - } - obj = Fuzzy(**data) + obj = Fuzzy( + ascii="άνθρωπος", + email="John Doe ", + emoji="thought bubble", + handle="Ian Maurer (@imaurer)", + integer="fifty-five", # type: ignore[arg-type] + inventor="ada luvlace", + person="mr. arthur h.
fonzarelli (fonzie)", # type: ignore[arg-type] + time="5am on Jan 1, 2025", # type: ignore[arg-type] + zipcode="(Zipcode: 12345-6789)", + ) # test the autocorrecting performed @@ -202,3 +204,78 @@ def test_json_schema(): "type": "object", } assert data == expected_data + + +def test_in_memory_validator(): + # Create a custom annotation type for matching fruits in memory + fruits = ["Apple", "Banana", "Orange"] + Fruit = Annotated[ + str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) + ] + + class MyModel(BaseModel): + fruit: Fruit + + model = MyModel(fruit="appel") + assert model.fruit == "Apple" + + +def test_on_disk_validator(): + from fuzztypes import OnDiskValidator + + # Create a custom annotation type for matching countries stored on disk + countries = [ + ("United States", "US"), + ("United Kingdom", "UK"), + ("Canada", "CA"), + ] + Country = Annotated[str, OnDiskValidator("Country", countries)] + + class MyModel(BaseModel): + country: Country + + assert MyModel(country="Canada").country == "Canada" + assert MyModel(country="US").country == "United States" + + +def test_date_validators(): + from fuzztypes import DateValidator, DatetimeValidator + + MyDate = Annotated[date, DateValidator(date_order="MDY")] + MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] + + class MyModel(BaseModel): + date: MyDate + time: MyTime + + model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") # type: ignore + assert model.date.isoformat() == "2023-01-01" + assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" + + +def test_fuzz_validator(): + from fuzztypes import FuzzValidator + + # Create a custom annotation type that converts a value to uppercase + UpperCase = Annotated[str, FuzzValidator(str.upper)] + + class MyModel(BaseModel): + name: UpperCase + + model = MyModel(name="john") + assert model.name == "JOHN" + + +def test_regex_validator(): + from fuzztypes import RegexValidator + + # Create a custom annotation type for matching IP
addresses + IPAddress = Annotated[ + str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") + ] + + class MyModel(BaseModel): + ip_address: IPAddress + + model = MyModel(ip_address="My internet IP address is 192.168.127.12") + assert model.ip_address == "192.168.127.12" From 7d3f3679fa8336b684afe4a78673d124e1347084 Mon Sep 17 00:00:00 2001 From: Ian Maurer Date: Mon, 25 Mar 2024 13:56:11 -0400 Subject: [PATCH 15/15] Simplified error format to "did you mean" format. --- CHANGELOG.md | 3 ++- src/fuzztypes/__init__.py | 2 +- src/fuzztypes/match.py | 9 --------- src/fuzztypes/storage.py | 8 +++++--- tests/in_memory/test_in_memory_alias.py | 8 ++++---- tests/in_memory/test_in_memory_fuzz.py | 14 ++------------ tests/on_disk/test_on_disk_alias.py | 8 ++++---- 7 files changed, 18 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d44369f..3457774 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,10 +11,11 @@ Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). 
#### Changed - Renamed OnDisk to OnDiskValidator - - Renamed InMemory to MemoryValidator + - Renamed InMemory to InMemoryValidator - Refactored InMemoryValidator and OnDiskValidator to use FuzzValidator - Refactored Person to use FuzzValidator - Renamed Regex to RegexValidator + - Changed error message to more common "did you mean" message format #### Removed - abstract.py module and AbstractType class, simplified by FuzzValidator diff --git a/src/fuzztypes/__init__.py b/src/fuzztypes/__init__.py index cbf41ef..05f5004 100644 --- a/src/fuzztypes/__init__.py +++ b/src/fuzztypes/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.0.2" +__version__ = "0.1.0" # logging import logging diff --git a/src/fuzztypes/match.py b/src/fuzztypes/match.py index ad93fd3..9312019 100644 --- a/src/fuzztypes/match.py +++ b/src/fuzztypes/match.py @@ -23,12 +23,6 @@ def rank_value(self) -> Tuple[Tuple[float, int], Any]: def __lt__(self, other: "Match"): return self.rank_value < other.rank_value - def __str__(self): - if self.is_alias: - return f"{self.key} => {self.entity.value} [{self.score:.1f}]" - else: - return f"{self.entity.value} [{self.score:.1f}]" - class MatchResult(BaseModel): matches: List[Match] = Field(default_factory=list) @@ -43,9 +37,6 @@ def __len__(self): def __getitem__(self, item): return self.matches[item] - def __str__(self): - return ", ".join(map(str, self.matches)) - @property def entity(self): return self.choice is not None and self.choice.entity diff --git a/src/fuzztypes/storage.py b/src/fuzztypes/storage.py index 3e3484e..b9f3e8b 100644 --- a/src/fuzztypes/storage.py +++ b/src/fuzztypes/storage.py @@ -62,11 +62,13 @@ def __getitem__(self, key: str) -> Optional[NamedEntity]: if self.notfound_mode == "none": return None - msg = "key ({key}) could not be resolved" + msg = '"{key}" could not be resolved' ctx: Dict[str, Any] = dict(key=key) if match_list: - ctx["near"] = [str(m) for m in match_list] - msg += f", closest non-matches = {match_list}" + near = 
[f'"{match.entity.value}"' for match in match_list.matches] + if len(near) > 1: + near[-1] = "or " + near[-1] + msg += f", did you mean {', '.join(near)}?" raise PydanticCustomError("key_not_found", msg, ctx) def prepare(self): diff --git a/tests/in_memory/test_in_memory_alias.py b/tests/in_memory/test_in_memory_alias.py index 6aa884c..56f20b8 100644 --- a/tests/in_memory/test_in_memory_alias.py +++ b/tests/in_memory/test_in_memory_alias.py @@ -70,10 +70,10 @@ def test_duplicate_records(): assert A["b"].value == "a" assert False, "Didn't raise exception!" except KeyError as e: - assert str(e) == ( - "'Key Error: b [key (b) could not be resolved, " - "closest non-matches = b => c [100.0], b => a [" - "100.0], b => d [100.0]]'" + msg = str(e.args[0]) + assert ( + msg == "Key Error: b " + '["b" could not be resolved, did you mean "c", "a", or "d"?]' ) A = InMemoryValidator(source, tiebreaker_mode="lesser") diff --git a/tests/in_memory/test_in_memory_fuzz.py b/tests/in_memory/test_in_memory_fuzz.py index bae3277..bbbbced 100644 --- a/tests/in_memory/test_in_memory_fuzz.py +++ b/tests/in_memory/test_in_memory_fuzz.py @@ -93,20 +93,10 @@ def test_min_score(): except ValidationError as e: assert e.errors(include_url=False) == [ { - "ctx": { - "key": "B K L", - "near": [ - "('matches', [Match(key='a b c', entity=NamedEntity(" - "value='A B C', label=None, meta=None, " - "priority=None, aliases=[]), is_alias=False, " - "score=40.0, term=None)])", - "('choice', None)", - ], - }, + "ctx": {"key": "B K L"}, "input": "B K L", "loc": ("strict",), - "msg": "key (B K L) could not be resolved, closest " - "non-matches = A B C [40.0]", + "msg": '"B K L" could not be resolved, did you mean "A B C"?', "type": "key_not_found", } ] diff --git a/tests/on_disk/test_on_disk_alias.py b/tests/on_disk/test_on_disk_alias.py index 3289cc5..1349060 100644 --- a/tests/on_disk/test_on_disk_alias.py +++ b/tests/on_disk/test_on_disk_alias.py @@ -74,10 +74,10 @@ def test_duplicate_records(): assert 
A["b"].value == "a" assert False, "Didn't raise exception!" except KeyError as e: - assert str(e) == ( - "'Key Error: b [key (b) could not be resolved, " - "closest non-matches = b => c [100.0], b => a [" - "100.0], b => d [100.0]]'" + assert ( + str(e) + == '\'Key Error: b ' + '["b" could not be resolved, did you mean "c", "a", or "d"?]\'' ) A = OnDiskValidator("DupeRec", source, tiebreaker_mode="lesser")