diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 4f1e4f2..4eaf15b 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -231,7 +231,7 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) - self.regexes = TupleManager(regexes) + self.regexes = TupleManager([tpl[:2] for tpl in REGEXES]) self._pst = None @property diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index bd4b320..9be2f1e 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -18,20 +18,39 @@ '[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) -REGEXES = set([ +REGEXES = [ ("spaces", re.compile(r"\s+", re.U)), ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'(?<!\w)\'([^\s]*?)\'(?!\w)', re.U)), - ("double_quotes", re.compile(r'\"(.*?)\"', re.U)), - ("parenthesis", re.compile(r'\((.*?)\)', re.U)), + ("double_apostrophe_ASCII", re.compile(r"(?!\w)''(\w[^']*?)''(?!\w)", re.U), 'nickname'), + ("smart_quote", re.compile(r"(?!\w)“(\w[^”]*?)”(?!\w)", re.U), 'nickname'), + ("smart_single_quote", re.compile(r"(?!\w)‘(\w[^’]*?)’(?!\w)", re.U), 'nickname'), + ("grave_accent", re.compile(r'(?!\w)`(\w[^`]*?)`(?!\w)', re.U), 'nickname'), + ("grave_acute", re.compile(r'(?!\w)`(\w[^´]*?)´(?!\w)', re.U), 'nickname'), + ("apostrophe_ASCII", re.compile(r"(?!\w)'(\w[^']*?)'(?!\w)", re.U), 'nickname'), + ("quote_ASCII", re.compile(r'(?!\w)"(\w[^"]*?)"(?!\w)', re.U), 'nickname'), + ("parenthesis", re.compile(r'(?!\w)\((\w[^)]*?)\)(?!\w)', re.U), 'nickname'), ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)), ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)), ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), 
("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), -]) + ("nn_sep_safe", re.compile(r'[^ ,]', re.U)), + ("paren_suffix", re.compile(r'(?!\w)(\((?:ret|vet)\.?\))(?!\w)', re.I | re.U)), +] """ All regular expressions used by the parser are precompiled and stored in the config. + +REGEX tuple positions are: + [0] - name of the pattern, used in code as named attribute + [1] - compiled pattern + [2] - (optional) label/tag of the pattern, used in code for + filtering patterns + +All nickname patterns should follow this pattern: + (?!\w)leading_delim([^trailing_delim]*?)trailing_delim(?!\w) + +Nicknames are assumed to be delimited by non-word characters. + """ diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 9765b92..7af82b8 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -6,6 +6,7 @@ 'esq', 'esquire', 'jr', + 'jr.', 'jnr', 'junior', 'sr', @@ -25,6 +26,7 @@ """ SUFFIX_ACRONYMS = set([ '(ret)', + '(ret.)', '(vet)', '8-vsb', 'aas', diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 3d5892f..3462af8 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -166,6 +166,7 @@ 'chef', 'chemist', 'chief', + 'chief justice', 'chieftain', 'choreographer', 'civil', @@ -339,6 +340,7 @@ 'judicial', 'junior', 'jurist', + 'justice', 'keyboardist', 'kingdom', 'knowledge', diff --git a/nameparser/parser.py b/nameparser/parser.py index bd79057..eb319b3 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -12,6 +12,7 @@ from nameparser.config import CONSTANTS from nameparser.config import Constants from nameparser.config import DEFAULT_ENCODING +from nameparser.config.regexes import REGEXES ENCODING = 'utf-8' @@ -70,7 +71,7 @@ class HumanName(object): _members = ['title','first','middle','last','suffix','nickname'] unparsable = True _full_name = '' - + def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, string_format=None): self.C = 
constants @@ -79,7 +80,17 @@ def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, self.encoding = encoding self.string_format = string_format or self.C.string_format + self._nickname_regexes = [tpl[1] + for tpl in REGEXES + if isinstance(tpl[-1], str) + and 'nickname' in tpl[-1] + ] # full_name setter triggers the parse + #======================================================== + #IMPORTANT NOTE: + # The following statement must be the last one in the + # __init__ function + #======================================================== self.full_name = full_name def __iter__(self): @@ -243,7 +254,11 @@ def nickname(self): The person's nicknames. Any text found inside of quotes (``""``) or parenthesis (``()``) """ - return " ".join(self.nickname_list) or self.C.empty_attribute_default + if len(self.nickname_list) <= 1: + f_string = '{0}' + else: + f_string = '"{0}"' + return ", ".join([f_string.format(nn) for nn in self.nickname_list]) or self.C.empty_attribute_default @property def surnames_list(self): @@ -387,18 +402,24 @@ def pre_process(self): This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a - subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. + subclass. Runs + :py:func:`fix_phd` + :py:func:`parse_parenthesized_suffixes` + :py:func:`parse_nicknames` + :py:func:`squash_emoji`. """ self.fix_phd() + self.parse_parenthesized_suffixes() self.parse_nicknames() self.squash_emoji() def post_process(self): """ This happens at the end of the :py:func:`parse_full_name` after - all other processing has taken place. Runs :py:func:`handle_firstnames` - and :py:func:`handle_capitalization`. + all other processing has taken place. 
Runs + :py:func:`handle_firstnames` + :py:func:`handle_capitalization` """ self.handle_firstnames() self.handle_capitalization() @@ -412,25 +433,49 @@ def fix_phd(self): def parse_nicknames(self): """ - The content of parenthesis or quotes in the name will be added to the + The content of defined nickname regex patterns in the name will be added to the nicknames list. This happens before any other processing of the name. - - Single quotes cannot span white space characters and must border - white space to allow for quotes in names like O'Connor and Kawai'ae'a. - Double quotes and parenthesis can span white space. - - Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; - `quoted_word`, `double_quotes` and `parenthesis`. - """ - - re_quoted_word = self.C.regexes.quoted_word - re_double_quotes = self.C.regexes.double_quotes - re_parenthesis = self.C.regexes.parenthesis - - for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub('', self._full_name) + + Some basic rules for nickname processing: + * Nicknames must begin with a word character. + * Nickname patterns should include an outer (not processed) + delimiter that excludes word characters. 
+ + Loops through :py:data:`~nameparser.config.regexes.REGEXES` with + label/tag like "nickname" + """ + #ToDo: + # * create a list of matches + # * sort the list by span + # * check inter-match strings for commas + # * remove the commas if safe to remove + # safe = character(s) between matches are ONLY + # spaces and commas + # * iterate the matches, collecting the nicknames + # and removing the matches from self._full_name + nn_matches = [] + nn_sep = self.C.regexes.nn_sep_safe + _fn = self._full_name + for _re in self._nickname_regexes: + if _re.search(_fn): + nn_matches.extend( _re.finditer(_fn) ) + #remove matches from string + for _match in _re.finditer(_fn): + _fn = (' ' * (_match.end() - _match.start())).join([_fn[:_match.start()], _fn[_match.end():]]) + + if len(nn_matches) == 0: + return #"empty matches" + + nn_matches.sort(key=lambda x: x.span()) + + #remove any inter-match commas, if safe to do so + for low, high in zip(nn_matches[0:-1], nn_matches[1:]): + if nn_sep.search(self._full_name[low.span()[1]:high.span()[0]]) is None: + self._full_name = ' '.join([self._full_name[:low.span()[1]], self._full_name[high.span()[0]:] ]) + + for nn_match in nn_matches: + self.nickname_list.append( nn_match.group(1) ) + self._full_name = nn_match.re.sub(' ', self._full_name, 1) def squash_emoji(self): """ @@ -452,6 +497,18 @@ def handle_firstnames(self): and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last + def parse_parenthesized_suffixes(self): + """ + Extract any parenthesized suffixes: (ret. | ret | vet. 
| vet) + """ + _re = self.C.regexes.paren_suffix + if _re.search(self._full_name): + for _match in _re.finditer(self._full_name): + self.suffix_list.append(_match.group(1)) + + self._full_name = _re.sub(' ', self._full_name) + + def parse_full_name(self): """ diff --git a/tests.py b/tests.py index 5f976b8..b19a0cc 100644 --- a/tests.py +++ b/tests.py @@ -27,6 +27,7 @@ from nameparser import HumanName from nameparser.util import u from nameparser.config import Constants +import re log = logging.getLogger('HumanName') @@ -1491,7 +1492,36 @@ def test_nickname_and_last_name_with_title(self): self.m(hn.last, "Edmonds", hn) self.m(hn.nickname, "Rick", hn) + def test_append_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.append(new_rgx) + self.assertEqual(hn._nickname_regexes[-1], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, ":close", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "_openBen", hn) + def test_prepend_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.insert(0, new_rgx) + self.assertEqual(hn._nickname_regexes[0], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Ben", hn) + + def test_multiple_nicknames(self): + hn = HumanName('Chief Justice John (JR), "No Glove, No Love" Glover Roberts, Jr.') + self.m(hn.title, 'Chief Justice', hn) + self.m(hn.first, "John", hn) + self.m(hn.middle, "Glover", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "Jr.", hn) + self.m(hn.nickname, '"JR", "No Glove, No Love"', hn) # class MaidenNameTestCase(HumanNameTestBase): # @@ -1766,6 +1796,14 @@ def test_suffix_with_periods_with_lastname_comma(self): self.m(hn.last, "Doe", hn) 
self.m(hn.suffix, "Msc.Ed.", hn) + def test_suffix_parenthesized_with_nickname(self): + hn = HumanName("Gen Dwight David (Ike) Eisenhower (ret.) KG") + self.m(hn.title, "Gen", hn) + self.m(hn.first, "Dwight", hn) + self.m(hn.middle, "David", hn) + self.m(hn.last, "Eisenhower", hn) + self.m(hn.suffix, "(ret.), KG", hn) + self.m(hn.nickname, "Ike", hn) class TitleTestCase(HumanNameTestBase):