Skip to content
108 changes: 91 additions & 17 deletions nameparser/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
"""
import re
import sys
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import Any, TypeVar
from collections.abc import Callable, Iterable, Iterator, Mapping, Set
from typing import Any, TypeVar, overload

if sys.version_info >= (3, 11):
from typing import Self
Expand Down Expand Up @@ -62,8 +62,14 @@ class SetManager(Set):

'''

_on_change: Callable[[], None] | None

def __init__(self, elements: Iterable[str]) -> None:
self.elements = set(elements)
# Optional invalidation hook, wired by an owning Constants so that
# in-place add()/remove() can clear its cached suffixes_prefixes_titles
# union. None when the manager is used standalone.
self._on_change = None

def __call__(self) -> Set[str]:
return self.elements
Expand Down Expand Up @@ -93,6 +99,8 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None:
if isinstance(s, bytes):
s = s.decode(encoding)
self.elements.add(lc(s))
if self._on_change:
self._on_change()

def add(self, *strings: str) -> Self:
"""
Expand All @@ -109,10 +117,13 @@ def remove(self, *strings: str) -> Self:
Remove the lower case and no-period version of the string arguments from the set.
Returns ``self`` for chaining.
"""
changed = False
for s in strings:
if (lower := lc(s)) in self.elements:
self.elements.remove(lower)

changed = True
if changed and self._on_change:
self._on_change()
return self


Expand Down Expand Up @@ -164,6 +175,46 @@ def __getattr__(self, attr: str) -> re.Pattern[str]:
return self.get(attr, EMPTY_REGEX)


class _CachedUnionMember:
"""Descriptor for the four ``SetManager`` attributes whose union ``Constants``
caches in ``_pst`` (``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``,
``titles``).

Assigning a new manager — or mutating one in place via ``add()`` / ``remove()``
— invalidates that cache. Keeping the behavior on a descriptor scopes it to
exactly these attributes, beside their declarations, rather than spreading it
across a catch-all ``__setattr__`` and a separate attribute-name list.
"""

_attr: str

def __set_name__(self, owner: type, name: str) -> None:
self._attr = '_' + name

@overload
def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ...
@overload
def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ...

def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember':
if obj is None:
return self
return getattr(obj, self._attr)

def __set__(self, obj: 'Constants', value: SetManager) -> None:
if not isinstance(value, SetManager):
raise TypeError(
f"Expected a SetManager instance, got {type(value).__name__!r}. "
"Wrap your iterable: SetManager(['mr', 'ms'])"
)
previous = getattr(obj, self._attr, None)
if isinstance(previous, SetManager):
previous._on_change = None # detach the replaced manager so it no longer invalidates
value._on_change = obj._invalidate_pst
obj._invalidate_pst()
setattr(obj, self._attr, value)


class Constants:
"""
An instance of this class hold all of the configuration constants for the parser.
Expand All @@ -188,15 +239,14 @@ class Constants:
:py:attr:`regexes` wrapped with :py:class:`TupleManager`.
"""

prefixes: SetManager
suffix_acronyms: SetManager
suffix_not_acronyms: SetManager
titles: SetManager
prefixes = _CachedUnionMember()
suffix_acronyms = _CachedUnionMember()
suffix_not_acronyms = _CachedUnionMember()
titles = _CachedUnionMember()
first_name_titles: SetManager
conjunctions: SetManager
capitalization_exceptions: TupleManager[str]
regexes: RegexTupleManager

_pst: Set[str] | None

string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
Expand Down Expand Up @@ -272,6 +322,9 @@ def __init__(self,
capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS,
regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES
) -> None:
# These four descriptor assignments call _CachedUnionMember.__set__, which
# calls _invalidate_pst() and establishes self._pst. They must come before
# any read of suffixes_prefixes_titles.
self.prefixes = SetManager(prefixes)
self.suffix_acronyms = SetManager(suffix_acronyms)
self.suffix_not_acronyms = SetManager(suffix_not_acronyms)
Expand All @@ -280,11 +333,13 @@ def __init__(self,
self.conjunctions = SetManager(conjunctions)
self.capitalization_exceptions = TupleManager(capitalization_exceptions)
self.regexes = RegexTupleManager(regexes)

def _invalidate_pst(self) -> None:
self._pst = None

@property
def suffixes_prefixes_titles(self) -> Set[str]:
if not self._pst:
if self._pst is None:
self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles
return self._pst

Expand All @@ -309,16 +364,35 @@ def __setstate__(self, state: Mapping[str, Any]) -> None:
if isinstance(getattr(type(self), name, None), property):
continue
setattr(self, name, value)
# Verify each descriptor-backed attr was restored. Without this, a missing
# key surfaces later as AttributeError: 'Constants' object has no attribute
# '_prefixes' — the private mangled name, not the public one, making it
# very hard to diagnose.
for attr in (n for n, v in vars(type(self)).items() if isinstance(v, _CachedUnionMember)):
if not hasattr(self, '_' + attr):
raise ValueError(
f"Pickle state is missing required field {attr!r}. "
"The state blob may be truncated or from an incompatible version."
)

def __getstate__(self) -> Mapping[str, Any]:
# Pickle only the instance's own configuration: the collections built in
# __init__ plus any instance-level scalar overrides. Class-level scalar
# defaults are restored by the class itself, and underscore-prefixed
# names such as the ``_pst`` cache are private and rebuilt on demand.
# Filtering on ``self.__dict__`` also excludes the computed
# ``suffixes_prefixes_titles`` property automatically, since properties
# live on the class and never appear in an instance's ``__dict__``.
return {k: v for k, v in self.__dict__.items() if not k.startswith('_')}
# Pickle the instance's own configuration: the collections built in
# __init__ plus any instance-level scalar overrides.
# _CachedUnionMember descriptors store their values with a leading
# underscore (e.g. `_prefixes` for `prefixes`) so that the descriptor's
# __set__ owns assignment. We map those back to the public names so
# __setstate__ can restore them through the descriptor, re-wiring the
# invalidation callbacks. All other underscore-prefixed names (_pst, etc.)
# are private/cache and are intentionally excluded.
state: dict[str, Any] = {}
for name, val in self.__dict__.items():
if name.startswith('_'):
public = name[1:]
if isinstance(getattr(type(self), public, None), _CachedUnionMember):
state[public] = val
else:
state[name] = val
return state


#: A module-level instance of the :py:class:`Constants()` class.
Expand Down
2 changes: 1 addition & 1 deletion tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def m(self, actual: T, expected: T, hn: HumanName) -> None:
hn,
)
except UnicodeDecodeError:
assert actual == expected_
assert actual == expected_, f"actual={actual!r} != expected={expected_!r}"

def assertEqual(self, first: object, second: object, msg: object = None) -> None:
assert first == second, msg
Expand Down
3 changes: 0 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,3 @@ def empty_attribute_default(request: pytest.FixtureRequest) -> Iterator[str | No
setattr(CONSTANTS, attr, value)
for attr, value in collection_snapshot.items():
setattr(CONSTANTS, attr, value)
# Invalidate the lazily-built suffixes/prefixes/titles cache so it is
# recomputed from the restored collections rather than a mutated one.
CONSTANTS._pst = None
118 changes: 118 additions & 0 deletions tests/test_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,121 @@ def test_unpickle_legacy_state_with_property_key(self) -> None:

# The real customization is recovered and the property key is ignored.
self.assertIn('legacytitle', restored.titles)

def test_suffixes_prefixes_titles_reflects_add_title(self) -> None:
"""suffixes_prefixes_titles must include titles added after construction."""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised
c.titles.add('emerita')
self.assertIn('emerita', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None:
"""suffixes_prefixes_titles must include prefixes added after construction."""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised
c.prefixes.add('xpfx')
self.assertIn('xpfx', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None:
"""suffixes_prefixes_titles must not include a word that was only in titles and is then removed."""
c = Constants()
c.titles.add('emerita')
self.assertIn('emerita', c.suffixes_prefixes_titles)
c.titles.remove('emerita')
self.assertNotIn('emerita', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None:
"""suffixes_prefixes_titles must not include a word that was only in prefixes and is then removed."""
c = Constants()
c.prefixes.add('xpfx')
self.assertIn('xpfx', c.suffixes_prefixes_titles)
c.prefixes.remove('xpfx')
self.assertNotIn('xpfx', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None:
"""suffixes_prefixes_titles must include suffix acronyms added after construction."""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised
c.suffix_acronyms.add('xsfx')
self.assertIn('xsfx', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None:
"""suffixes_prefixes_titles must include non-acronym suffixes added after construction."""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised
c.suffix_not_acronyms.add('xsfx')
self.assertIn('xsfx', c.suffixes_prefixes_titles)

def test_pickle_roundtrip_rewires_invalidation_callbacks(self) -> None:
"""Mutations on a deserialized Constants must still invalidate the cache."""
c = Constants()
# Safe: round-tripping a Constants the test just built, not untrusted data.
restored = pickle.loads(pickle.dumps(c))
_ = restored.suffixes_prefixes_titles # prime the cache
restored.titles.add('posttitle')
self.assertIn('posttitle', restored.suffixes_prefixes_titles)

def test_is_rootname_consistent_with_is_title(self) -> None:
"""is_rootname must return False for words recognised by is_title."""
hn = HumanName("", constants=None)
_ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable
hn.C.titles.add('emerita')
self.assertFalse(hn.is_rootname('emerita'))

def test_is_rootname_consistent_with_is_prefix(self) -> None:
"""is_rootname must return False for words recognised by is_prefix."""
hn = HumanName("", constants=None)
_ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable
hn.C.prefixes.add('xpfx')
self.assertFalse(hn.is_rootname('xpfx'))

def test_suffixes_prefixes_titles_reflects_remove_suffix_acronym(self) -> None:
"""suffixes_prefixes_titles must reflect a suffix acronym removed after the cache is primed."""
c = Constants()
c.suffix_acronyms.add('xsfx')
self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache
c.suffix_acronyms.remove('xsfx')
self.assertNotIn('xsfx', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_remove_suffix_not_acronym(self) -> None:
"""suffixes_prefixes_titles must reflect a non-acronym suffix removed after the cache is primed."""
c = Constants()
c.suffix_not_acronyms.add('xsfx')
self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache
c.suffix_not_acronyms.remove('xsfx')
self.assertNotIn('xsfx', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_add_with_encoding(self) -> None:
"""add_with_encoding must invalidate the cache like add()/remove() do."""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache
c.titles.add_with_encoding(b'b\351ck', encoding='latin_1')
self.assertIn('béck', c.suffixes_prefixes_titles)

def test_suffixes_prefixes_titles_reflects_replaced_manager(self) -> None:
"""Replacing a whole SetManager must invalidate the cache and wire the new manager.

Covers the config-teardown path where a fresh SetManager is assigned
directly (e.g. ``setattr(CONSTANTS, 'titles', SetManager(...))``).
"""
c = Constants()
_ = c.suffixes_prefixes_titles # prime the cache
c.titles = SetManager(['brandnewtitle'])
# The replacement is reflected immediately...
self.assertIn('brandnewtitle', c.suffixes_prefixes_titles)
# ...and the new manager's own mutations invalidate the cache too,
# proving the on_change callback was re-wired to the replacement.
_ = c.suffixes_prefixes_titles
c.titles.add('secondtitle')
self.assertIn('secondtitle', c.suffixes_prefixes_titles)

def test_replaced_manager_no_longer_invalidates_cache(self) -> None:
"""A SetManager detached by reassignment must not invalidate the new cache."""
c = Constants()
replaced = c.titles
c.titles = SetManager(['brandnewtitle'])
primed = c.suffixes_prefixes_titles
# Mutating the orphaned manager must leave the live cache untouched.
replaced.add('ghost')
self.assertIs(c.suffixes_prefixes_titles, primed)
self.assertNotIn('ghost', c.suffixes_prefixes_titles)