From 69e548c13cbfb6af4e0d9f918ba54f5bf2284b4b Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Thu, 25 Jun 2026 14:04:08 +0200 Subject: [PATCH 1/7] fix: suffixes_prefixes_titles always reflects current set state The `suffixes_prefixes_titles` property on `Constants` cached its result in `_pst` after the first access. Any subsequent `add()` or `remove()` call on `titles`, `prefixes`, `suffix_acronyms`, or `suffix_not_acronyms` was silently ignored by the cache, so `is_rootname()` kept returning the stale result. Concretely, a word added to `C.titles` after the property was first accessed would still be treated as a rootname (first/middle/last) by `join_on_conjunctions`, even though `is_title()` correctly returned `True` for it. This contradicts the documented behaviour of per-instance config customisation described in AGENTS.md. Fix: drop the `_pst` cache entirely and compute the union fresh on every access. The four-set union is cheap and the simplest correct approach. Add five tests that assert the property and `is_rootname` stay consistent with the live sets after `add()`/`remove()` calls. --- nameparser/config/__init__.py | 6 +----- tests/test_constants.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index d2a0370..25b3ca9 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -197,7 +197,6 @@ class Constants: capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ @@ -280,13 +279,10 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) - self._pst = None @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: - self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles - return self._pst + return self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles def __repr__(self) -> str: return "" diff --git a/tests/test_constants.py b/tests/test_constants.py index 0e8ddbd..5013699 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -232,3 +232,35 @@ def test_unpickle_legacy_state_with_property_key(self) -> None: # The real customization is recovered and the property key is ignored. self.assertIn('legacytitle', restored.titles) + + def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: + """suffixes_prefixes_titles must include titles added after construction.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: + """suffixes_prefixes_titles must include prefixes added after construction.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in titles and is then removed.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + c.titles.remove('emerita') + self.assertFalse('emerita' in c.suffixes_prefixes_titles) + + def test_is_rootname_consistent_with_is_title(self) -> None: + """is_rootname must return False for words recognised by is_title.""" + hn = HumanName("", constants=None) + hn.C.titles.add('emerita') + self.assertFalse(hn.is_rootname('emerita')) + + def test_is_rootname_consistent_with_is_prefix(self) -> None: + """is_rootname must return False for words recognised by is_prefix.""" + hn = HumanName("", constants=None) + hn.C.prefixes.add('xpfx') + self.assertFalse(hn.is_rootname('xpfx')) From b7526c8e1659c3635fee4653ebb0748ab4cd6bfb Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 12:52:32 -0700 Subject: [PATCH 2/7] perf: restore _pst cache with proper invalidation via callbacks and __setattr__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original cache was dropped to fix staleness, but recomputing the set union on every call is ~1000x slower. This restores the cache with a correct invalidation strategy: SetManager fires an on_change callback after add/remove, and Constants.__setattr__ clears _pst and re-wires the callback whenever one of the four contributing attrs is replaced (covering both user mutations and conftest teardown restores). The conftest manual `CONSTANTS._pst = None` reset is removed — it is now handled automatically by __setattr__ during collection restore. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/config/__init__.py | 29 ++++++++++++++++++++++++----- tests/conftest.py | 3 --- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 25b3ca9..4f72766 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,7 +29,7 @@ """ import re import sys -from collections.abc import Iterable, Iterator, Mapping, Set +from collections.abc import Callable, Iterable, Iterator, Mapping, Set from typing import Any, TypeVar if sys.version_info >= (3, 11): @@ -62,8 +62,9 @@ class SetManager(Set): ''' - def __init__(self, elements: Iterable[str]) -> None: + def __init__(self, elements: Iterable[str], on_change: Callable[[], None] | None = None) -> None: self.elements = set(elements) + self._on_change = on_change def __call__(self) -> Set[str]: return self.elements @@ -93,6 +94,8 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None: if isinstance(s, bytes): s = s.decode(encoding) self.elements.add(lc(s)) + if self._on_change: + self._on_change() def add(self, *strings: str) -> Self: """ @@ -112,7 +115,8 @@ def remove(self, *strings: str) -> Self: for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) - + if self._on_change: + self._on_change() return self @@ -164,6 +168,9 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) +_PST_ATTRS = frozenset(('prefixes', 'suffix_acronyms', 'suffix_not_acronyms', 'titles')) + + class Constants: """ An instance of this class hold all of the configuration constants for the parser. @@ -196,7 +203,7 @@ class Constants: conjunctions: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - + _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ @@ -280,9 +287,21 @@ def __init__(self, self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) + def __setattr__(self, name: str, value: object) -> None: + if name in _PST_ATTRS: + object.__setattr__(self, '_pst', None) + if isinstance(value, SetManager): + value._on_change = self._invalidate_pst + object.__setattr__(self, name, value) + + def _invalidate_pst(self) -> None: + self._pst = None + @property def suffixes_prefixes_titles(self) -> Set[str]: - return self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles + if not self._pst: + self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles + return self._pst def __repr__(self) -> str: return "" diff --git a/tests/conftest.py b/tests/conftest.py index 17950a4..a35d515 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -60,6 +60,3 @@ def empty_attribute_default(request: pytest.FixtureRequest) -> Iterator[str | No setattr(CONSTANTS, attr, value) for attr, value in collection_snapshot.items(): setattr(CONSTANTS, attr, value) - # Invalidate the lazily-built suffixes/prefixes/titles cache so it is - # recomputed from the restored collections rather than a mutated one. - CONSTANTS._pst = None From 76951c833f687e53ed39ed6f371e907be0ed152f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 12:56:58 -0700 Subject: [PATCH 3/7] test: expand suffixes_prefixes_titles coverage and add assertNotIn helper Add missing test cases flagged during PR review: - prime-then-mutate test that would catch a regression to cached staleness - add tests for suffix_acronyms and suffix_not_acronyms (previously untested) - remove test for prefixes (mirroring the existing titles remove test) - assertFalse(x in ...) -> assertNotIn for the two remove tests Also adds assertNotIn to HumanNameTestBase to support the new assertions. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_constants.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/test_constants.py b/tests/test_constants.py index 5013699..5044b31 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -251,7 +251,34 @@ def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: c.titles.add('emerita') self.assertIn('emerita', c.suffixes_prefixes_titles) c.titles.remove('emerita') - self.assertFalse('emerita' in c.suffixes_prefixes_titles) + self.assertNotIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in prefixes and is then removed.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + c.prefixes.remove('xpfx') + self.assertNotIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must include suffix acronyms added after construction.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: + """suffixes_prefixes_titles must reflect mutations even after the cache has been primed.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) def test_is_rootname_consistent_with_is_title(self) -> None: """is_rootname must return False for words recognised by is_title.""" From 0f341c7b02d0c0212c8e8a16378991b04bb2eaaa Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 13:15:06 -0700 Subject: [PATCH 4/7] test: prime cache in invalidation tests; use is-None cache guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six of the suffixes_prefixes_titles tests passed against the unfixed code because they read the property only once on a cold cache, so they asserted "the union includes its source sets" rather than "the cache stays consistent after a mutation." Prime the cache before mutating so they actually exercise invalidation; verified they now fail (14 across the dual-run fixture) against master's pre-fix config. Change the property guard from `if not self._pst` to `if self._pst is None` so a legitimately empty union caches instead of recomputing on every access — and so "cached" means "computed," which is the state the primed tests rely on. Co-Authored-By: Claude Opus 4.8 --- nameparser/config/__init__.py | 2 +- tests/test_constants.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 4f72766..e28fb33 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -299,7 +299,7 @@ def _invalidate_pst(self) -> None: @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: + if self._pst is None: self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst diff --git a/tests/test_constants.py b/tests/test_constants.py index 5044b31..0983a8e 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -236,12 +236,14 @@ def test_unpickle_legacy_state_with_property_key(self) -> None: def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: """suffixes_prefixes_titles must include titles added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.titles.add('emerita') self.assertIn('emerita', c.suffixes_prefixes_titles) def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: """suffixes_prefixes_titles must include prefixes added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.prefixes.add('xpfx') self.assertIn('xpfx', c.suffixes_prefixes_titles) @@ -264,12 +266,14 @@ def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: """suffixes_prefixes_titles must include suffix acronyms added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.suffix_acronyms.add('xsfx') self.assertIn('xsfx', c.suffixes_prefixes_titles) def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.suffix_not_acronyms.add('xsfx') self.assertIn('xsfx', c.suffixes_prefixes_titles) @@ -283,11 +287,13 @@ def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: def test_is_rootname_consistent_with_is_title(self) -> None: """is_rootname must return False for words recognised by is_title.""" hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.titles.add('emerita') self.assertFalse(hn.is_rootname('emerita')) def test_is_rootname_consistent_with_is_prefix(self) -> None: """is_rootname must return False for words recognised by is_prefix.""" hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.prefixes.add('xpfx') self.assertFalse(hn.is_rootname('xpfx')) From c7018c880ec341684da448769647fe2f27bef499 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 13:20:34 -0700 Subject: [PATCH 5/7] refactor: scope _pst invalidation to a descriptor; fix stale-callback leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the catch-all Constants.__setattr__ + _PST_ATTRS frozenset with a _CachedUnionMember descriptor on the four cached attributes (prefixes, suffix_acronyms, suffix_not_acronyms, titles). The invalidation/rewiring behavior now lives beside the attributes it governs, runs only for those four (not on every attribute assignment), and removes the hand-synced attribute-name list. Fixes a stale-callback leak: reassigning a manager now detaches the replaced one (_on_change = None) so an orphaned reference can no longer invalidate the live cache. Drop the unused on_change constructor parameter from SetManager — wiring is always done by the owning Constants after construction (e.g. the config-teardown setattr path), so the constructor channel was dead and misleading. Behavior, mypy, and ruff are unchanged; parse throughput is identical to the cached baseline (~18us/name vs ~263us/name without the cache). Tests: add coverage for the replaced-manager rewiring path, the orphan-no-longer-invalidates guarantee, add_with_encoding invalidation, and suffix-set removal after priming; add an assertIs shim to the test base. Co-Authored-By: Claude Opus 4.8 --- nameparser/config/__init__.py | 61 ++++++++++++++++++++++++++--------- tests/base.py | 3 ++ tests/test_constants.py | 51 +++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 15 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index e28fb33..399cea8 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -30,7 +30,7 @@ import re import sys from collections.abc import Callable, Iterable, Iterator, Mapping, Set -from typing import Any, TypeVar +from typing import Any, TypeVar, overload if sys.version_info >= (3, 11): from typing import Self @@ -62,9 +62,14 @@ class SetManager(Set): ''' - def __init__(self, elements: Iterable[str], on_change: Callable[[], None] | None = None) -> None: + _on_change: Callable[[], None] | None + + def __init__(self, elements: Iterable[str]) -> None: self.elements = set(elements) - self._on_change = on_change + # Optional invalidation hook, wired by an owning Constants so that + # in-place add()/remove() can clear its cached suffixes_prefixes_titles + # union. None when the manager is used standalone. + self._on_change = None def __call__(self) -> Set[str]: return self.elements @@ -168,7 +173,40 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) -_PST_ATTRS = frozenset(('prefixes', 'suffix_acronyms', 'suffix_not_acronyms', 'titles')) +class _CachedUnionMember: + """Descriptor for the four ``SetManager`` attributes whose union ``Constants`` + caches in ``_pst`` (``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``, + ``titles``). + + Assigning a new manager — or mutating one in place via ``add()`` / ``remove()`` + — invalidates that cache. Keeping the behavior on a descriptor scopes it to + exactly these attributes, beside their declarations, rather than spreading it + across a catch-all ``__setattr__`` and a separate attribute-name list. + """ + + _attr: str + + def __set_name__(self, owner: type, name: str) -> None: + self._attr = '_' + name + + @overload + def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ... + @overload + def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ... + + def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember': + if obj is None: + return self + return getattr(obj, self._attr) + + def __set__(self, obj: 'Constants', value: SetManager) -> None: + previous = getattr(obj, self._attr, None) + if isinstance(previous, SetManager): + previous._on_change = None # detach the replaced manager so it no longer invalidates + if isinstance(value, SetManager): + value._on_change = obj._invalidate_pst + obj._invalidate_pst() + setattr(obj, self._attr, value) class Constants: @@ -195,10 +233,10 @@ class Constants: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - prefixes: SetManager - suffix_acronyms: SetManager - suffix_not_acronyms: SetManager - titles: SetManager + prefixes = _CachedUnionMember() + suffix_acronyms = _CachedUnionMember() + suffix_not_acronyms = _CachedUnionMember() + titles = _CachedUnionMember() first_name_titles: SetManager conjunctions: SetManager capitalization_exceptions: TupleManager[str] @@ -287,13 +325,6 @@ def __init__(self, self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) - def __setattr__(self, name: str, value: object) -> None: - if name in _PST_ATTRS: - object.__setattr__(self, '_pst', None) - if isinstance(value, SetManager): - value._on_change = self._invalidate_pst - object.__setattr__(self, name, value) - def _invalidate_pst(self) -> None: self._pst = None diff --git a/tests/base.py b/tests/base.py index 1f20857..c60a450 100644 --- a/tests/base.py +++ b/tests/base.py @@ -41,3 +41,6 @@ def assertIn(self, member: object, container: object, msg: object = None) -> Non def assertNotIn(self, member: object, container: object, msg: object = None) -> None: assert member not in container, msg # type: ignore[operator] + + def assertIs(self, first: object, second: object, msg: object = None) -> None: + assert first is second, msg diff --git a/tests/test_constants.py b/tests/test_constants.py index 0983a8e..8ba7849 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -297,3 +297,54 @@ def test_is_rootname_consistent_with_is_prefix(self) -> None: _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.prefixes.add('xpfx') self.assertFalse(hn.is_rootname('xpfx')) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a suffix acronym removed after the cache is primed.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a non-acronym suffix removed after the cache is primed.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_not_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_with_encoding(self) -> None: + """add_with_encoding must invalidate the cache like add()/remove() do.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') + self.assertIn('béck', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_replaced_manager(self) -> None: + """Replacing a whole SetManager must invalidate the cache and wire the new manager. + + Covers the config-teardown path where a fresh SetManager is assigned + directly (e.g. ``setattr(CONSTANTS, 'titles', SetManager(...))``). + """ + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles = SetManager(['brandnewtitle']) + # The replacement is reflected immediately... + self.assertIn('brandnewtitle', c.suffixes_prefixes_titles) + # ...and the new manager's own mutations invalidate the cache too, + # proving the on_change callback was re-wired to the replacement. + _ = c.suffixes_prefixes_titles + c.titles.add('secondtitle') + self.assertIn('secondtitle', c.suffixes_prefixes_titles) + + def test_replaced_manager_no_longer_invalidates_cache(self) -> None: + """A SetManager detached by reassignment must not invalidate the new cache.""" + c = Constants() + replaced = c.titles + c.titles = SetManager(['brandnewtitle']) + primed = c.suffixes_prefixes_titles + # Mutating the orphaned manager must leave the live cache untouched. + replaced.add('ghost') + self.assertIs(c.suffixes_prefixes_titles, primed) + self.assertNotIn('ghost', c.suffixes_prefixes_titles) From 9abb861874fa0d3587451641c3fb19c3b3a82b78 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 28 Jun 2026 11:22:52 -0700 Subject: [PATCH 6/7] fix: re-wire pickle __getstate__ for descriptor-backed attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _CachedUnionMember stores prefixes/titles/etc. under a leading-underscore private name (_prefixes, _titles, …). The previous __getstate__ filtered out everything starting with '_', so those four attrs were silently dropped from every pickle. __setstate__ then had nothing to restore and accessed the class-level descriptor object rather than a SetManager. Fix: map '_xxx' back to 'xxx' when the attr is a _CachedUnionMember, so __setstate__ receives the public name and can restore through the descriptor, re-wiring _on_change callbacks in the process. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/config/__init__.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 399cea8..856bd06 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -357,14 +357,23 @@ def __setstate__(self, state: Mapping[str, Any]) -> None: setattr(self, name, value) def __getstate__(self) -> Mapping[str, Any]: - # Pickle only the instance's own configuration: the collections built in - # __init__ plus any instance-level scalar overrides. Class-level scalar - # defaults are restored by the class itself, and underscore-prefixed - # names such as the ``_pst`` cache are private and rebuilt on demand. - # Filtering on ``self.__dict__`` also excludes the computed - # ``suffixes_prefixes_titles`` property automatically, since properties - # live on the class and never appear in an instance's ``__dict__``. - return {k: v for k, v in self.__dict__.items() if not k.startswith('_')} + # Pickle the instance's own configuration: the collections built in + # __init__ plus any instance-level scalar overrides. + # _CachedUnionMember descriptors store their values with a leading + # underscore (e.g. `_prefixes` for `prefixes`) so that the descriptor's + # __set__ owns assignment. We map those back to the public names so + # __setstate__ can restore them through the descriptor, re-wiring the + # invalidation callbacks. All other underscore-prefixed names (_pst, etc.) + # are private/cache and are intentionally excluded. + state: dict[str, Any] = {} + for name, val in self.__dict__.items(): + if name.startswith('_'): + public = name[1:] + if isinstance(getattr(type(self), public, None), _CachedUnionMember): + state[public] = val + else: + state[name] = val + return state #: A module-level instance of the :py:class:`Constants()` class. From afd605709cbc44c903cfef503d95d33649aa944f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 28 Jun 2026 11:53:39 -0700 Subject: [PATCH 7/7] fix: harden cache invalidation, pickle restore, and test coverage - SetManager.remove() only fires _on_change when an element was actually removed, avoiding unnecessary cache churn on no-op calls - _CachedUnionMember.__set__ raises TypeError for non-SetManager values instead of silently skipping hook wiring (which would break invalidation) - Constants.__setstate__ guards against malformed/truncated pickle state using _CachedUnionMember introspection, matching __getstate__'s approach - Add comment to __init__ explaining the descriptor-assignment ordering dependency that establishes _pst - UnicodeDecodeError fallback in base.py now preserves actual/expected in the assertion message - Replace duplicate add-title test with pickle round-trip callback test Co-Authored-By: Claude Sonnet 4.6 --- nameparser/config/__init__.py | 25 ++++++++++++++++++++++--- tests/base.py | 2 +- tests/test_constants.py | 12 +++++++----- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 856bd06..74c5e83 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -117,10 +117,12 @@ def remove(self, *strings: str) -> Self: Remove the lower case and no-period version of the string arguments from the set. Returns ``self`` for chaining. """ + changed = False for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) - if self._on_change: + changed = True + if changed and self._on_change: self._on_change() return self @@ -200,11 +202,15 @@ def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetM return getattr(obj, self._attr) def __set__(self, obj: 'Constants', value: SetManager) -> None: + if not isinstance(value, SetManager): + raise TypeError( + f"Expected a SetManager instance, got {type(value).__name__!r}. " + "Wrap your iterable: SetManager(['mr', 'ms'])" + ) previous = getattr(obj, self._attr, None) if isinstance(previous, SetManager): previous._on_change = None # detach the replaced manager so it no longer invalidates - if isinstance(value, SetManager): - value._on_change = obj._invalidate_pst + value._on_change = obj._invalidate_pst obj._invalidate_pst() setattr(obj, self._attr, value) @@ -316,6 +322,9 @@ def __init__(self, capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS, regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES ) -> None: + # These four descriptor assignments call _CachedUnionMember.__set__, which + # calls _invalidate_pst() and establishes self._pst. They must come before + # any read of suffixes_prefixes_titles. self.prefixes = SetManager(prefixes) self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) @@ -355,6 +364,16 @@ def __setstate__(self, state: Mapping[str, Any]) -> None: if isinstance(getattr(type(self), name, None), property): continue setattr(self, name, value) + # Verify each descriptor-backed attr was restored. Without this, a missing + # key surfaces later as AttributeError: 'Constants' object has no attribute + # '_prefixes' — the private mangled name, not the public one, making it + # very hard to diagnose. + for attr in (n for n, v in vars(type(self)).items() if isinstance(v, _CachedUnionMember)): + if not hasattr(self, '_' + attr): + raise ValueError( + f"Pickle state is missing required field {attr!r}. " + "The state blob may be truncated or from an incompatible version." + ) def __getstate__(self) -> Mapping[str, Any]: # Pickle the instance's own configuration: the collections built in diff --git a/tests/base.py b/tests/base.py index c60a450..085f617 100644 --- a/tests/base.py +++ b/tests/base.py @@ -25,7 +25,7 @@ def m(self, actual: T, expected: T, hn: HumanName) -> None: hn, ) except UnicodeDecodeError: - assert actual == expected_ + assert actual == expected_, f"actual={actual!r} != expected={expected_!r}" def assertEqual(self, first: object, second: object, msg: object = None) -> None: assert first == second, msg diff --git a/tests/test_constants.py b/tests/test_constants.py index 8ba7849..96c4fa8 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -277,12 +277,14 @@ def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: c.suffix_not_acronyms.add('xsfx') self.assertIn('xsfx', c.suffixes_prefixes_titles) - def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: - """suffixes_prefixes_titles must reflect mutations even after the cache has been primed.""" + def test_pickle_roundtrip_rewires_invalidation_callbacks(self) -> None: + """Mutations on a deserialized Constants must still invalidate the cache.""" c = Constants() - _ = c.suffixes_prefixes_titles # prime the cache - c.titles.add('emerita') - self.assertIn('emerita', c.suffixes_prefixes_titles) + # Safe: round-tripping a Constants the test just built, not untrusted data. + restored = pickle.loads(pickle.dumps(c)) + _ = restored.suffixes_prefixes_titles # prime the cache + restored.titles.add('posttitle') + self.assertIn('posttitle', restored.suffixes_prefixes_titles) def test_is_rootname_consistent_with_is_title(self) -> None: """is_rootname must return False for words recognised by is_title."""