diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index d2a0370..74c5e83 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,8 +29,8 @@ """ import re import sys -from collections.abc import Iterable, Iterator, Mapping, Set -from typing import Any, TypeVar +from collections.abc import Callable, Iterable, Iterator, Mapping, Set +from typing import Any, TypeVar, overload if sys.version_info >= (3, 11): from typing import Self @@ -62,8 +62,14 @@ class SetManager(Set): ''' + _on_change: Callable[[], None] | None + def __init__(self, elements: Iterable[str]) -> None: self.elements = set(elements) + # Optional invalidation hook, wired by an owning Constants so that + # in-place add()/remove() can clear its cached suffixes_prefixes_titles + # union. None when the manager is used standalone. + self._on_change = None def __call__(self) -> Set[str]: return self.elements @@ -93,6 +99,8 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None: if isinstance(s, bytes): s = s.decode(encoding) self.elements.add(lc(s)) + if self._on_change: + self._on_change() def add(self, *strings: str) -> Self: """ @@ -109,10 +117,13 @@ def remove(self, *strings: str) -> Self: Remove the lower case and no-period version of the string arguments from the set. Returns ``self`` for chaining. """ + changed = False for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) - + changed = True + if changed and self._on_change: + self._on_change() return self @@ -164,6 +175,46 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) +class _CachedUnionMember: + """Descriptor for the four ``SetManager`` attributes whose union ``Constants`` + caches in ``_pst`` (``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``, + ``titles``). + + Assigning a new manager — or mutating one in place via ``add()`` / ``remove()`` + — invalidates that cache. Keeping the behavior on a descriptor scopes it to + exactly these attributes, beside their declarations, rather than spreading it + across a catch-all ``__setattr__`` and a separate attribute-name list. + """ + + _attr: str + + def __set_name__(self, owner: type, name: str) -> None: + self._attr = '_' + name + + @overload + def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ... + @overload + def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ... + + def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember': + if obj is None: + return self + return getattr(obj, self._attr) + + def __set__(self, obj: 'Constants', value: SetManager) -> None: + if not isinstance(value, SetManager): + raise TypeError( + f"Expected a SetManager instance, got {type(value).__name__!r}. " + "Wrap your iterable: SetManager(['mr', 'ms'])" + ) + previous = getattr(obj, self._attr, None) + if isinstance(previous, SetManager): + previous._on_change = None # detach the replaced manager so it no longer invalidates + value._on_change = obj._invalidate_pst + obj._invalidate_pst() + setattr(obj, self._attr, value) + + class Constants: """ An instance of this class hold all of the configuration constants for the parser. @@ -188,15 +239,14 @@ class Constants: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - prefixes: SetManager - suffix_acronyms: SetManager - suffix_not_acronyms: SetManager - titles: SetManager + prefixes = _CachedUnionMember() + suffix_acronyms = _CachedUnionMember() + suffix_not_acronyms = _CachedUnionMember() + titles = _CachedUnionMember() first_name_titles: SetManager conjunctions: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" @@ -272,6 +322,9 @@ def __init__(self, capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS, regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES ) -> None: + # These four descriptor assignments call _CachedUnionMember.__set__, which + # calls _invalidate_pst() and establishes self._pst. They must come before + # any read of suffixes_prefixes_titles. self.prefixes = SetManager(prefixes) self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) @@ -280,11 +333,13 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) + + def _invalidate_pst(self) -> None: self._pst = None @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: + if self._pst is None: self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst @@ -309,16 +364,35 @@ def __setstate__(self, state: Mapping[str, Any]) -> None: if isinstance(getattr(type(self), name, None), property): continue setattr(self, name, value) + # Verify each descriptor-backed attr was restored. Without this, a missing + # key surfaces later as AttributeError: 'Constants' object has no attribute + # '_prefixes' — the private mangled name, not the public one, making it + # very hard to diagnose. + for attr in (n for n, v in vars(type(self)).items() if isinstance(v, _CachedUnionMember)): + if not hasattr(self, '_' + attr): + raise ValueError( + f"Pickle state is missing required field {attr!r}. " + "The state blob may be truncated or from an incompatible version." + ) def __getstate__(self) -> Mapping[str, Any]: - # Pickle only the instance's own configuration: the collections built in - # __init__ plus any instance-level scalar overrides. Class-level scalar - # defaults are restored by the class itself, and underscore-prefixed - # names such as the ``_pst`` cache are private and rebuilt on demand. - # Filtering on ``self.__dict__`` also excludes the computed - # ``suffixes_prefixes_titles`` property automatically, since properties - # live on the class and never appear in an instance's ``__dict__``. - return {k: v for k, v in self.__dict__.items() if not k.startswith('_')} + # Pickle the instance's own configuration: the collections built in + # __init__ plus any instance-level scalar overrides. + # _CachedUnionMember descriptors store their values with a leading + # underscore (e.g. `_prefixes` for `prefixes`) so that the descriptor's + # __set__ owns assignment. We map those back to the public names so + # __setstate__ can restore them through the descriptor, re-wiring the + # invalidation callbacks. All other underscore-prefixed names (_pst, etc.) + # are private/cache and are intentionally excluded. + state: dict[str, Any] = {} + for name, val in self.__dict__.items(): + if name.startswith('_'): + public = name[1:] + if isinstance(getattr(type(self), public, None), _CachedUnionMember): + state[public] = val + else: + state[name] = val + return state #: A module-level instance of the :py:class:`Constants()` class. diff --git a/tests/base.py b/tests/base.py index 6b09ea1..fef4293 100644 --- a/tests/base.py +++ b/tests/base.py @@ -25,7 +25,7 @@ def m(self, actual: T, expected: T, hn: HumanName) -> None: hn, ) except UnicodeDecodeError: - assert actual == expected_ + assert actual == expected_, f"actual={actual!r} != expected={expected_!r}" def assertEqual(self, first: object, second: object, msg: object = None) -> None: assert first == second, msg diff --git a/tests/conftest.py b/tests/conftest.py index 17950a4..a35d515 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -60,6 +60,3 @@ def empty_attribute_default(request: pytest.FixtureRequest) -> Iterator[str | No setattr(CONSTANTS, attr, value) for attr, value in collection_snapshot.items(): setattr(CONSTANTS, attr, value) - # Invalidate the lazily-built suffixes/prefixes/titles cache so it is - # recomputed from the restored collections rather than a mutated one. - CONSTANTS._pst = None diff --git a/tests/test_constants.py b/tests/test_constants.py index 0e8ddbd..96c4fa8 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -232,3 +232,121 @@ def test_unpickle_legacy_state_with_property_key(self) -> None: # The real customization is recovered and the property key is ignored. self.assertIn('legacytitle', restored.titles) + + def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: + """suffixes_prefixes_titles must include titles added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: + """suffixes_prefixes_titles must include prefixes added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in titles and is then removed.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + c.titles.remove('emerita') + self.assertNotIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in prefixes and is then removed.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + c.prefixes.remove('xpfx') + self.assertNotIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must include suffix acronyms added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_pickle_roundtrip_rewires_invalidation_callbacks(self) -> None: + """Mutations on a deserialized Constants must still invalidate the cache.""" + c = Constants() + # Safe: round-tripping a Constants the test just built, not untrusted data. + restored = pickle.loads(pickle.dumps(c)) + _ = restored.suffixes_prefixes_titles # prime the cache + restored.titles.add('posttitle') + self.assertIn('posttitle', restored.suffixes_prefixes_titles) + + def test_is_rootname_consistent_with_is_title(self) -> None: + """is_rootname must return False for words recognised by is_title.""" + hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable + hn.C.titles.add('emerita') + self.assertFalse(hn.is_rootname('emerita')) + + def test_is_rootname_consistent_with_is_prefix(self) -> None: + """is_rootname must return False for words recognised by is_prefix.""" + hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable + hn.C.prefixes.add('xpfx') + self.assertFalse(hn.is_rootname('xpfx')) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a suffix acronym removed after the cache is primed.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a non-acronym suffix removed after the cache is primed.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_not_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_with_encoding(self) -> None: + """add_with_encoding must invalidate the cache like add()/remove() do.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') + self.assertIn('béck', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_replaced_manager(self) -> None: + """Replacing a whole SetManager must invalidate the cache and wire the new manager. + + Covers the config-teardown path where a fresh SetManager is assigned + directly (e.g. ``setattr(CONSTANTS, 'titles', SetManager(...))``). + """ + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles = SetManager(['brandnewtitle']) + # The replacement is reflected immediately... + self.assertIn('brandnewtitle', c.suffixes_prefixes_titles) + # ...and the new manager's own mutations invalidate the cache too, + # proving the on_change callback was re-wired to the replacement. + _ = c.suffixes_prefixes_titles + c.titles.add('secondtitle') + self.assertIn('secondtitle', c.suffixes_prefixes_titles) + + def test_replaced_manager_no_longer_invalidates_cache(self) -> None: + """A SetManager detached by reassignment must not invalidate the new cache.""" + c = Constants() + replaced = c.titles + c.titles = SetManager(['brandnewtitle']) + primed = c.suffixes_prefixes_titles + # Mutating the orphaned manager must leave the live cache untouched. + replaced.add('ghost') + self.assertIs(c.suffixes_prefixes_titles, primed) + self.assertNotIn('ghost', c.suffixes_prefixes_titles)