From 698dbc38dc25dbf45f52a1464988ea2d84884f7f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 15:41:14 -0700 Subject: [PATCH 1/9] feat: add last_base / last_prefixes derived properties (#130, #132) Adds _split_last(), last_base, last_prefixes, last_base_list, last_prefixes_list, family, and family_prefixes to HumanName. The guard prevents stripping when every last-name token is a prefix. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/parser.py | 74 ++++++++++++++++++++++++++++++++++++++++++ tests/test_prefixes.py | 68 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) diff --git a/nameparser/parser.py b/nameparser/parser.py index 7cc4f0b..9784100 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -430,6 +430,80 @@ def given_names(self) -> str: """ return " ".join(self.given_names_list) or self.C.empty_attribute_default + def _split_last(self) -> tuple[list[str], list[str]]: + """Return (prefix_particles, base_words) split from the last name. + + >>> HumanName("Vincent van Gogh")._split_last() + (['van'], ['Gogh']) + """ + words = " ".join(self.last_list).split() + i = 0 + while i < len(words) and self.is_prefix(words[i]): + i += 1 + if i == len(words): + # Every word is a prefix (e.g. surname "Do" which is also a + # prefix word). A family name can't consist only of particles, + # so don't strip — treat the whole last name as the base. + return [], words + return words[:i], words[i:] + + @property + def last_prefixes_list(self) -> list[str]: + """ + List of leading prefix particles in the last name (the *tussenvoegsel*). + + >>> HumanName("Juan de la Vega").last_prefixes_list + ['de', 'la'] + """ + return self._split_last()[0] + + @property + def last_base_list(self) -> list[str]: + """ + List of last-name words after stripping leading prefix particles. 
+ + >>> HumanName("Vincent van Gogh").last_base_list + ['Gogh'] + """ + return self._split_last()[1] + + @property + def last_base(self) -> str: + """ + The last name with leading prefix particles removed (the core surname). + For ``"van Gogh"`` this is ``"Gogh"``; for ``"Smith"`` it is ``"Smith"``. + ``last`` is always unchanged. + + >>> HumanName("Vincent van Gogh").last_base + 'Gogh' + >>> HumanName("John Smith").last_base + 'Smith' + """ + return " ".join(self.last_base_list) or self.C.empty_attribute_default + + @property + def last_prefixes(self) -> str: + """ + The leading prefix particle(s) of the last name (the *tussenvoegsel*). + Returns ``""`` (or ``empty_attribute_default``) when there are none. + + >>> HumanName("Vincent van Gogh").last_prefixes + 'van' + >>> HumanName("Juan de la Vega").last_prefixes + 'de la' + """ + return " ".join(self.last_prefixes_list) or self.C.empty_attribute_default + + @property + def family(self) -> str: + """Alias for :py:attr:`last_base`.""" + return self.last_base + + @property + def family_prefixes(self) -> str: + """Alias for :py:attr:`last_prefixes`.""" + return self.last_prefixes + # setter methods def _set_list(self, attr: str, value: str | list[str] | None) -> None: diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index 0d18449..2f47fdb 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -162,3 +162,71 @@ def test_comma_three_conjunctions(self) -> None: self.m(hn.title, "Dr.", hn) self.m(hn.middle, "Q. 
Xavier", hn) self.m(hn.suffix, "III", hn) + +class LastNamePrefixSplitTestCase(HumanNameTestBase): + + def test_van_gogh_last_base(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.last_base, "Gogh", hn) + + def test_van_gogh_last_prefixes(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.last_prefixes, "van", hn) + + def test_van_gogh_last_base_list(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.last_base_list, ["Gogh"], hn) + + def test_van_gogh_last_prefixes_list(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.last_prefixes_list, ["van"], hn) + + def test_von_bergen_wessels(self) -> None: + hn = HumanName("pennie von bergen wessels") + self.m(hn.last_base, "bergen wessels", hn) + self.m(hn.last_prefixes, "von", hn) + + def test_de_la_vega_multiword_prefix(self) -> None: + hn = HumanName("Juan de la Vega") + self.m(hn.last_base, "Vega", hn) + self.m(hn.last_prefixes, "de la", hn) + self.m(hn.last_prefixes_list, ["de", "la"], hn) + + def test_no_prefix(self) -> None: + hn = HumanName("John Smith") + self.m(hn.last_base, "Smith", hn) + self.m(hn.last_prefixes, "", hn) + self.assertEqual(hn.last_prefixes_list, []) + + def test_do_guard_surname_equals_prefix_word(self) -> None: + # "Do" is in PREFIXES; without the guard last_base would be empty + hn = HumanName("Anh Do") + self.m(hn.last_base, "Do", hn) + self.m(hn.last_prefixes, "", hn) + + def test_all_particles_guard(self) -> None: + # Artificial case: last name whose every word is a prefix — must not strip + hn = HumanName("Smith van der") + # last="van der"; both words are prefixes — guard fires, base = full last + self.m(hn.last_base, hn.last, hn) + self.m(hn.last_prefixes, "", hn) + + def test_alias_family_equals_last_base(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.family, hn.last_base, hn) + + def test_alias_family_prefixes_equals_last_prefixes(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.family_prefixes, 
hn.last_prefixes, hn) + + def test_da_silva_title_plus_prefix(self) -> None: + hn = HumanName("Dra. Andréia da Silva") + self.m(hn.last_base, "Silva", hn) + self.m(hn.last_prefixes, "da", hn) + + def test_empty_name(self) -> None: + hn = HumanName() + self.m(hn.last_base, "", hn) + self.m(hn.last_prefixes, "", hn) + self.assertEqual(hn.last_base_list, []) + self.assertEqual(hn.last_prefixes_list, []) From 2ee350d9b0f0c7061ea783d9336e3d6b44b6db21 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 15:44:27 -0700 Subject: [PATCH 2/9] test: clarify assertEqual usage for empty list assertions --- tests/test_prefixes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index 2f47fdb..44c036d 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -163,6 +163,7 @@ def test_comma_three_conjunctions(self) -> None: self.m(hn.middle, "Q. Xavier", hn) self.m(hn.suffix, "III", hn) + class LastNamePrefixSplitTestCase(HumanNameTestBase): def test_van_gogh_last_base(self) -> None: @@ -196,6 +197,7 @@ def test_no_prefix(self) -> None: hn = HumanName("John Smith") self.m(hn.last_base, "Smith", hn) self.m(hn.last_prefixes, "", hn) + # self.m() coerces [] via `expected or empty_attribute_default`; use assertEqual for empty lists self.assertEqual(hn.last_prefixes_list, []) def test_do_guard_surname_equals_prefix_word(self) -> None: @@ -228,5 +230,6 @@ def test_empty_name(self) -> None: hn = HumanName() self.m(hn.last_base, "", hn) self.m(hn.last_prefixes, "", hn) + # self.m() coerces [] via `expected or empty_attribute_default`; use assertEqual for empty lists self.assertEqual(hn.last_base_list, []) self.assertEqual(hn.last_prefixes_list, []) From e76eb4228a75e69ccc3093fb15f2cbfc62bcee53 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 15:44:55 -0700 Subject: [PATCH 3/9] docs: add last_base / last_prefixes to usage.rst quickstart --- docs/usage.rst | 6 ++++++ 1 file changed, 6 
insertions(+) diff --git a/docs/usage.rst b/docs/usage.rst index f744dea..281a803 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -21,6 +21,12 @@ Requires Python 3.10+. 'Q. Xavier' >>> name.last 'de la Vega' + >>> name.last_base + 'Vega' + >>> name.last_prefixes + 'de la' + >>> name.family + 'Vega' >>> name.suffix 'III' >>> name.surnames From 00ddc946bac722708428ed1b5c8141284c7ab6be Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 15:45:23 -0700 Subject: [PATCH 4/9] docs: document PREFIXES customization for last_base / last_prefixes --- docs/customize.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/customize.rst b/docs/customize.rst index 87881c0..293d578 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -63,6 +63,26 @@ Other editable attributes * :py:obj:`~nameparser.config.Constants.initials_separator` - string placed between consecutive initials within the same name group (after the delimiter). Defaults to ``" "``, so ``"A. K."``; set to ``""`` for compact ``"A.K."``. +Splitting last-name prefix particles +------------------------------------- + +The :py:attr:`~nameparser.parser.HumanName.last_base` and +:py:attr:`~nameparser.parser.HumanName.last_prefixes` properties split the last +name at the boundary between leading prefix particles and the core surname. They +use the same ``PREFIXES`` set, so adding a particle makes the split pick it up +automatically:: + + >>> from nameparser import HumanName + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.prefixes.add('ter') + >>> HumanName("Jan ter Horst").last_base + 'Horst' + >>> HumanName("Jan ter Horst").last_prefixes + 'ter' + >>> CONSTANTS.prefixes.remove('ter') + +Note the ``remove`` call at the end — ``customize.rst`` examples share global +``CONSTANTS``, so mutations must be reversed to avoid affecting later examples. 
Parser Customization Examples ----------------------------- From f1422526c341bb92e5384bc478556d38a20c7175 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 15:45:27 -0700 Subject: [PATCH 5/9] docs: add last_base / last_prefixes to release log --- docs/release_log.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_log.rst b/docs/release_log.rst index 4720d32..ed5a5f4 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -18,6 +18,7 @@ Release Log - Fix spurious leading space in surnames and empty token in suffix list after ``capitalize()`` with an empty middle or suffix (#164) - Fix extra whitespace before punctuation in ``str()`` output when a ``string_format`` field is empty (closes #139) - Fix ``'apn aprn'`` split into separate ``suffix_acronyms`` entries so each is recognized independently (closes #155) + - Add ``last_base``, ``last_prefixes`` (and ``_list`` variants) plus ``family`` / ``family_prefixes`` aliases for splitting last-name prefix particles (tussenvoegsels) from the core surname (#130, #132) * 1.2.1 - June 19, 2026 - Fix ``initials()`` interpolating the literal ``None`` for empty name parts when ``empty_attribute_default = None`` (e.g. ``"J. 
None D."``); empty parts now render as an empty string and a fully-empty result returns ``empty_attribute_default`` - Add ``python -m nameparser "Name String"`` command-line helper that prints a parsed name From b9adbe443c190b71aa1cc42bcee5bd0a8f750111 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 18:45:44 -0700 Subject: [PATCH 6/9] test/docs: address PR review feedback on last_base / last_prefixes - Add list-form assertions to test_von_bergen_wessels (multi-word base) - Add test_case_insensitive_prefix_detection to pin is_prefix lowercasing - Clarify _split_last docstring: document the all-particles guard invariant, add Anh Do doctest, soften inline comment to heuristic framing - Add guard-behavior notes to last_base_list, last_prefixes_list, last_base, and last_prefixes docstrings so the non-obvious empty-prefix case is surfaced Co-Authored-By: Claude Sonnet 4.6 --- nameparser/parser.py | 24 +++++++++++++++++++----- tests/test_prefixes.py | 7 +++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 9784100..b064e15 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -433,17 +433,24 @@ def given_names(self) -> str: def _split_last(self) -> tuple[list[str], list[str]]: """Return (prefix_particles, base_words) split from the last name. + Unless the last name is empty, base_words is never empty: if every word + matches a prefix particle, the guard fires and all words are returned + as the base with an empty prefix list (heuristic: a family name is + assumed not to consist entirely of particles). + >>> HumanName("Vincent van Gogh")._split_last() (['van'], ['Gogh']) + >>> HumanName("Anh Do")._split_last() + ([], ['Do']) """ words = " ".join(self.last_list).split() i = 0 while i < len(words) and self.is_prefix(words[i]): i += 1 if i == len(words): - # Every word is a prefix (e.g. surname "Do" which is also a - # prefix word). 
A family name can't consist only of particles, - # so don't strip — treat the whole last name as the base. + # Heuristic: assume a family name isn't entirely composed of + # particles (e.g. surname "Do" which also appears in PREFIXES). + # Don't strip — treat the whole last name as the base. return [], words return words[:i], words[i:] @@ -451,6 +458,8 @@ def _split_last(self) -> tuple[list[str], list[str]]: def last_prefixes_list(self) -> list[str]: """ List of leading prefix particles in the last name (the *tussenvoegsel*). + Returns ``[]`` when there are none, including the case where every word + in the last name matches a prefix — see :py:meth:`_split_last`. >>> HumanName("Juan de la Vega").last_prefixes_list ['de', 'la'] @@ -461,6 +470,8 @@ def last_prefixes_list(self) -> list[str]: def last_base_list(self) -> list[str]: """ List of last-name words after stripping leading prefix particles. + Empty only for an empty last name: when every word matches a prefix, the + full last name is kept unstripped — see :py:meth:`_split_last`. >>> HumanName("Vincent van Gogh").last_base_list ['Gogh'] @@ -472,7 +483,8 @@ def last_base(self) -> str: """ The last name with leading prefix particles removed (the core surname). For ``"van Gogh"`` this is ``"Gogh"``; for ``"Smith"`` it is ``"Smith"``. - ``last`` is always unchanged. + ``last`` is always unchanged. When every word in the last name matches a + prefix particle, no stripping occurs and the full last name is returned. >>> HumanName("Vincent van Gogh").last_base 'Gogh' @@ -485,7 +497,9 @@ def last_prefixes(self) -> str: """ The leading prefix particle(s) of the last name (the *tussenvoegsel*). - Returns ``""`` (or ``empty_attribute_default``) when there are none. + Returns ``""`` (or ``empty_attribute_default``) when there are none, + including when every word in the last name matches a prefix particle + (the all-particles guard; see :py:meth:`_split_last`). 
>>> HumanName("Vincent van Gogh").last_prefixes 'van' diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index 44c036d..8172ba4 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -186,6 +186,8 @@ def test_von_bergen_wessels(self) -> None: hn = HumanName("pennie von bergen wessels") self.m(hn.last_base, "bergen wessels", hn) self.m(hn.last_prefixes, "von", hn) + self.m(hn.last_base_list, ["bergen", "wessels"], hn) + self.m(hn.last_prefixes_list, ["von"], hn) def test_de_la_vega_multiword_prefix(self) -> None: hn = HumanName("Juan de la Vega") @@ -233,3 +235,8 @@ def test_empty_name(self) -> None: # self.m() coerces [] via `expected or empty_attribute_default`; use assertEqual for empty lists self.assertEqual(hn.last_base_list, []) self.assertEqual(hn.last_prefixes_list, []) + + def test_case_insensitive_prefix_detection(self) -> None: + hn = HumanName("VINCENT VAN GOGH") + self.m(hn.last_prefixes, "VAN", hn) + self.m(hn.last_base, "GOGH", hn) From a40e81e42dfa4793ba973763e7d0e5aacd1a8b36 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 18:53:37 -0700 Subject: [PATCH 7/9] docs: add last_base sorting example to customize.rst Co-Authored-By: Claude Sonnet 4.6 --- docs/customize.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/customize.rst b/docs/customize.rst index 293d578..b0fee20 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -84,6 +84,23 @@ automatically:: Note the ``remove`` call at the end — ``customize.rst`` examples share global ``CONSTANTS``, so mutations must be reversed to avoid affecting later examples. 
+Because ``last_base`` is a plain string property, sorting a list of names by +core surname (ignoring prefix particles like *van*, *de la*) is just a key +function:: + + names = [ + HumanName("Vincent van Gogh"), + HumanName("Juan de la Vega"), + HumanName("John Smith"), + ] + sorted_names = sorted(names, key=lambda n: n.last_base.lower()) + # sort keys: 'gogh', 'smith', 'vega' → van Gogh, Smith, de la Vega + +To sort by first name when two people share the same ``last_base``, add it as +a secondary key:: + + sorted_names = sorted(names, key=lambda n: (n.last_base.lower(), n.first.lower())) + Parser Customization Examples ----------------------------- From d21f9b70a406e4a9b17d3e33d8f63a52e3d4e909 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 19:14:13 -0700 Subject: [PATCH 8/9] feat: add prefix particles from Wikipedia list of family name affixes Adds 'af', 'av', 'bar', 'bat', 'bint', 'den', 'ter', 'zu', and "'t" sourced from https://en.wikipedia.org/wiki/List_of_family_name_affixes (also referenced in PR #132). Excludes 'ben' due to ambiguity with the common English given name; excludes single-letter particles and fused prefixes (mc, fitz) as too ambiguous for general use. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/config/prefixes.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 9e0e772..1d36ce1 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -8,14 +8,20 @@ #: join with all following name pieces until the suffix "MD", resulting in the #: correct parsing of the last name "von bergen wessels". 
PREFIXES = set([ + "'t", 'abu', + 'af', 'al', + 'av', + 'bar', + 'bat', 'bin', + 'bint', 'bon', 'da', 'dal', 'de', - 'de\'', + "de'", 'degli', 'dei', 'del', @@ -24,6 +30,7 @@ 'delle', 'delli', 'dello', + 'den', 'der', 'di', 'dí', @@ -39,9 +46,11 @@ 'santa', 'st', 'ste', + 'ter', 'van', 'vander', 'vel', - 'von', 'vom', + 'von', + 'zu', ]) From 1cf134fef3006cb9a8b011dcc94d4720a893f61f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 19:18:40 -0700 Subject: [PATCH 9/9] docs: update customize.rst example now that 'ter' is a built-in prefix Replace 'ter' (now in PREFIXES) with 'op' as the custom-particle example; also demonstrates chaining with the newly added 'den' prefix. Co-Authored-By: Claude Sonnet 4.6 --- docs/customize.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index b0fee20..2a14314 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -74,12 +74,12 @@ automatically:: >>> from nameparser import HumanName >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.prefixes.add('ter') - >>> HumanName("Jan ter Horst").last_base - 'Horst' - >>> HumanName("Jan ter Horst").last_prefixes - 'ter' - >>> CONSTANTS.prefixes.remove('ter') + >>> CONSTANTS.prefixes.add('op') + >>> HumanName("Jan op den Berg").last_base + 'Berg' + >>> HumanName("Jan op den Berg").last_prefixes + 'op den' + >>> CONSTANTS.prefixes.remove('op') Note the ``remove`` call at the end — ``customize.rst`` examples share global ``CONSTANTS``, so mutations must be reversed to avoid affecting later examples.