import re
from collections.abc import Iterable, Iterator
from operator import itemgetter
from itertools import groupby
from typing import overload
from nameparser.util import HumanNameAttributeT, lc
from nameparser.util import log
from nameparser.config import CONSTANTS
from nameparser.config import Constants
from nameparser.config import DEFAULT_ENCODING
def group_contiguous_integers(data: Iterable[int]) -> list[tuple[int, int]]:
    """
    Locate the runs of consecutive integers in ``data``.

    Returns one ``(first, last)`` tuple for every run of two or more values
    that each increase by exactly one; isolated values are left out.
    """
    def offset(indexed: tuple[int, int]) -> int:
        # position minus value stays constant across a run of consecutive
        # integers, so it works as the grouping key
        position, value = indexed
        return position - value

    runs = (
        [value for _, value in members]
        for _, members in groupby(enumerate(data), offset)
    )
    return [(run[0], run[-1]) for run in runs if len(run) > 1]
[docs]
class HumanName:
"""
Parse a person's name into individual components.
Instantiation assigns to ``full_name``, and assignment to
:py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the
name, these instance attributes are available. Alternatively, you can pass
any of the instance attributes to the constructor method and skip the parsing
process. If any of the instance attributes are passed to the constructor
as keywords, :py:func:`parse_full_name` will not be performed.
**HumanName Instance Attributes**
* :py:attr:`title`
* :py:attr:`first`
* :py:attr:`middle`
* :py:attr:`last`
* :py:attr:`suffix`
* :py:attr:`nickname`
* :py:attr:`surnames`
* :py:attr:`given_names`
:param str full_name: The name string to be parsed.
:param constants constants:
a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for
`per-instance config <customize.html>`_.
:param str encoding: string representing the encoding of your input
:param str string_format: python string formatting
:param str initials_format: python initials string formatting
:param str initials_delimiter: string delimiter for initials
:param str initials_separator: string separator between consecutive initials
:param str suffix_delimiter: additional delimiter to split post-comma parts
before suffix detection, e.g. ``" - "`` for ``"RN - CRNA"``
:param str first: first name
:param str middle: middle name
:param str last: last name
:param str title: The title or prenominal
:param str suffix: The suffix or postnominal
:param str nickname: Nicknames
"""
# Class-level default configuration; ``__init__`` rebinds ``C`` on every instance.
C = CONSTANTS
"""
A reference to the configuration for this instance, which may or may not be
a reference to the shared, module-wide instance at
:py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser
<customize.html>`_.
"""
original: str | bytes = ''
"""
The original string, untouched by the parser.
"""
# Cursor into ``_members`` advanced by ``__next__``; reset to 0 when iteration
# is exhausted.
_count = 0
# Names of the name-component attributes, in the order they are iterated,
# sliced by ``__getitem__`` and emitted by ``as_dict``.
_members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname']
# Reset to True at the start of ``parse_full_name``; cleared to False by that
# method's final check or when components are passed straight to the
# constructor. ``__repr__`` renders "[ Unparsable ]" while it is True.
unparsable = True
# Working copy of the input (decoded when bytes were given); the
# pre-processing steps rewrite it before the comma split.
_full_name = ''
# Piece lists backing the ``title``/``first``/``middle``/``last``/``suffix``/
# ``nickname`` properties; (re)created by ``parse_full_name`` and ``_set_list``.
title_list: list[str]
first_list: list[str]
middle_list: list[str]
last_list: list[str]
suffix_list: list[str]
nickname_list: list[str]
# True when the most recently parsed string split into more than one
# comma-separated part.
_had_comma: bool
[docs]
def __init__(
    self,
    full_name: str | bytes = "",
    constants: Constants | None = CONSTANTS,
    encoding: str = DEFAULT_ENCODING,
    string_format: str | None = None,
    initials_format: str | None = None,
    initials_delimiter: str | None = None,
    initials_separator: str | None = None,
    suffix_delimiter: str | None = None,
    first: str | list[str] | None = None,
    middle: str | list[str] | None = None,
    last: str | list[str] | None = None,
    title: str | list[str] | None = None,
    suffix: str | list[str] | None = None,
    nickname: str | list[str] | None = None,
) -> None:
    """
    Build a name by parsing ``full_name`` or from explicit components.

    :param full_name: the name string (or encoded bytes) to parse.
    :param constants: configuration to use. Any
        :py:class:`~nameparser.config.Constants` instance, including an
        instance of a subclass, is used as given; anything else (notably
        ``None``) gives this instance its own fresh ``Constants()``.
    :param str encoding: codec used to decode ``full_name`` when it is bytes.
    :param string_format: overrides ``constants.string_format`` when not ``None``.
    :param initials_format: overrides ``constants.initials_format`` when not ``None``.
    :param initials_delimiter: overrides ``constants.initials_delimiter`` when not ``None``.
    :param initials_separator: overrides ``constants.initials_separator`` when not ``None``.
    :param suffix_delimiter: overrides ``constants.suffix_delimiter`` when not ``None``.
    :param first, middle, last, title, suffix, nickname: explicit name
        components. If any of them is truthy, all six are assigned directly,
        ``full_name`` is ignored and no parse is performed.
    """
    # isinstance() rather than an exact type comparison, so a caller-supplied
    # Constants subclass instance is honoured instead of silently replaced.
    self.C = constants if isinstance(constants, Constants) else Constants()
    self.encoding = encoding

    # Per-instance formatting options fall back to the configuration values.
    self.string_format = string_format if string_format is not None else self.C.string_format
    self.initials_format = initials_format if initials_format is not None else self.C.initials_format
    self.initials_delimiter = initials_delimiter if initials_delimiter is not None else self.C.initials_delimiter
    self.initials_separator = initials_separator if initials_separator is not None else self.C.initials_separator
    self.suffix_delimiter = suffix_delimiter if suffix_delimiter is not None else self.C.suffix_delimiter
    self._had_comma = False

    components = {
        'first': first,
        'middle': middle,
        'last': last,
        'title': title,
        'suffix': suffix,
        'nickname': nickname,
    }
    if any(components.values()):
        # Each assignment goes through the property setter, which fills the
        # matching ``*_list`` attribute.
        for attr, value in components.items():
            setattr(self, attr, value)
        self.unparsable = False
    else:
        # full_name setter triggers the parse
        self.full_name = full_name
def __getstate__(self) -> dict:
    """
    Return a copy of the instance ``__dict__`` for pickling.

    When the instance uses the shared module-level configuration, ``C`` is
    stored as ``None`` so the shared object itself is not serialized.
    """
    snapshot = dict(self.__dict__)
    uses_shared_config = snapshot.get('C') is CONSTANTS
    if uses_shared_config:
        # sentinel: the shared singleton is restored on load
        snapshot['C'] = None
    return snapshot
def __setstate__(self, state: dict) -> None:
    """
    Restore the instance from a state dictionary.

    A missing or ``None`` ``C`` entry is replaced, in ``state`` itself, by
    the shared module-level configuration before the state is applied.
    """
    needs_shared_config = state.get('C') is None
    if needs_shared_config:
        state['C'] = CONSTANTS
    vars(self).update(state)
def __iter__(self) -> Iterator[str]:
return self
def __len__(self) -> int:
l = 0
for x in self:
l += 1
return l
[docs]
def __eq__(self, other: object) -> bool:
"""
HumanName instances are equal to other objects whose
lower case unicode representation is the same.
"""
return str(self).lower() == str(other).lower()
def __ne__(self, other: object) -> bool:
return not str(self).lower() == str(other).lower()
@overload
def __getitem__(self, key: slice) -> list[str]: ...
@overload
def __getitem__(self, key: str) -> str: ...
def __getitem__(self, key: slice | str) -> str | list[str]:
if isinstance(key, slice):
return [getattr(self, x) for x in self._members[key]]
else:
return getattr(self, key)
def __setitem__(self, key: str, value: str) -> None:
if key in self._members:
self._set_list(key, value)
else:
raise KeyError("Not a valid HumanName attribute", key)
def __next__(self) -> str:
if self._count >= len(self._members):
self._count = 0
raise StopIteration
else:
c = self._count
self._count = c + 1
return getattr(self, self._members[c]) or next(self)
def __str__(self) -> str:
if self.string_format is not None:
# string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
_s = self.string_format.format(**self.as_dict())
# remove trailing punctuation from missing nicknames
_s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "")
_s = self.C.regexes.space_before_comma.sub(',', _s)
return self.collapse_whitespace(_s).strip(', ')
return " ".join(self)
def __hash__(self) -> int:
return hash(str(self))
def __repr__(self) -> str:
if self.unparsable:
_string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, }
else:
_string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % {
'class': self.__class__.__name__,
'title': self.title or '',
'first': self.first or '',
'middle': self.middle or '',
'last': self.last or '',
'suffix': self.suffix or '',
'nickname': self.nickname or '',
}
return _string
[docs]
def as_dict(self, include_empty: bool = True) -> dict[str, str]:
"""
Return the parsed name as a dictionary of its attributes.
:param bool include_empty: Include keys in the dictionary for empty name attributes.
:rtype: dict
.. doctest::
>>> name = HumanName("Bob Dole")
>>> name.as_dict()
{'title': '', 'first': 'Bob', 'middle': '', 'last': 'Dole', 'suffix': '', 'nickname': ''}
>>> name.as_dict(False)
{'first': 'Bob', 'last': 'Dole'}
"""
d = {}
for m in self._members:
if include_empty:
d[m] = getattr(self, m)
else:
val = getattr(self, m)
if val:
d[m] = val
return d
def __process_initial__(self, name_part: str, firstname: bool = False) -> str:
"""
Name parts may include prefixes or conjunctions. This function filters these from the name unless it is
a first name, since first names cannot be conjunctions or prefixes.
"""
parts = name_part.split(" ")
initials = []
if len(parts) and isinstance(parts, list):
for part in parts:
if not (self.is_prefix(part) or self.is_conjunction(part)) or firstname:
initials.append(part[0])
if len(initials) > 0:
return self.initials_separator.join(initials)
else:
return self.C.empty_attribute_default
[docs]
def initials_list(self) -> list[str]:
"""
Returns the initials as a list
.. doctest::
>>> name = HumanName("Sir Bob Andrew Dole")
>>> name.initials_list()
['B', 'A', 'D']
>>> name = HumanName("J. Doe")
>>> name.initials_list()
['J', 'D']
"""
first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]
return first_initials_list + middle_initials_list + last_initials_list
[docs]
def initials(self) -> str:
"""
Return formatted initials for the name, controlled by
``initials_format``, ``initials_delimiter``, and ``initials_separator``.
``initials_delimiter`` is appended after each individual initial.
``initials_separator`` is placed between consecutive initials within
a name group (first, middle, or last). Both can be set as
``Constants`` attributes or as ``HumanName`` constructor kwargs.
.. doctest::
>>> name = HumanName("Sir Bob Andrew Dole")
>>> name.initials()
'B. A. D.'
>>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}")
>>> name.initials()
'B. A.'
>>> name = HumanName("Doe, John A.", initials_delimiter="", initials_separator="")
>>> name.initials()
'J A D'
"""
first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]
# Empty parts must render as '' (not empty_attribute_default, which may be
# None) so str.format does not interpolate the literal "None" into the
# output. A fully-empty result falls back to empty_attribute_default,
# matching the other attribute accessors (e.g. ``first``).
initials_dict = {
"first": (self.initials_delimiter + self.initials_separator).join(first_initials_list) + self.initials_delimiter
if len(first_initials_list) else "",
"middle": (self.initials_delimiter + self.initials_separator).join(middle_initials_list) + self.initials_delimiter
if len(middle_initials_list) else "",
"last": (self.initials_delimiter + self.initials_separator).join(last_initials_list) + self.initials_delimiter
if len(last_initials_list) else ""
}
_s = self.initials_format.format(**initials_dict)
return self.collapse_whitespace(_s) or self.C.empty_attribute_default
@property
def has_own_config(self) -> bool:
    """
    Whether this instance uses a configuration object other than the shared
    module-level one.
    """
    shares_module_config = self.C is CONSTANTS
    return not shares_module_config
# attributes
@property
def title(self) -> str:
"""
The person's titles. Any string of consecutive pieces in
:py:mod:`~nameparser.config.titles` or
:py:mod:`~nameparser.config.conjunctions`
at the beginning of :py:attr:`full_name`.
"""
return " ".join(self.title_list) or self.C.empty_attribute_default
@title.setter
def title(self, value: str | list[str] | None) -> None:
self._set_list('title', value)
@property
def first(self) -> str:
"""
The person's first name. The first name piece after any known
:py:attr:`title` pieces parsed from :py:attr:`full_name`.
"""
return " ".join(self.first_list) or self.C.empty_attribute_default
@first.setter
def first(self, value: str | list[str] | None) -> None:
self._set_list('first', value)
@property
def middle(self) -> str:
"""
The person's middle names. All name pieces after the first name and
before the last name parsed from :py:attr:`full_name`.
"""
return " ".join(self.middle_list) or self.C.empty_attribute_default
@middle.setter
def middle(self, value: str | list[str] | None) -> None:
self._set_list('middle', value)
@property
def last(self) -> str:
"""
The person's last name. The last name piece parsed from
:py:attr:`full_name`.
"""
return " ".join(self.last_list) or self.C.empty_attribute_default
@last.setter
def last(self, value: str | list[str] | None) -> None:
self._set_list('last', value)
@property
def suffix(self) -> str:
"""
The persons's suffixes. Pieces at the end of the name that are found in
:py:mod:`~nameparser.config.suffixes`, or pieces that are at the end
of comma separated formats, e.g.
"Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed
from :py:attr:`full_name`.
"""
return ", ".join(self.suffix_list) or self.C.empty_attribute_default
@suffix.setter
def suffix(self, value: str | list[str] | None) -> None:
self._set_list('suffix', value)
@property
def nickname(self) -> str:
"""
The person's nicknames. Any text found inside of quotes (``""``) or
parenthesis (``()``)
"""
return " ".join(self.nickname_list) or self.C.empty_attribute_default
@nickname.setter
def nickname(self, value: str | list[str] | None) -> None:
self._set_list('nickname', value)
@property
def surnames_list(self) -> list[str]:
"""
List of middle names followed by last name.
"""
return self.middle_list + self.last_list
@property
def surnames(self) -> str:
"""
A string of all middle names followed by the last name.
"""
return " ".join(self.surnames_list) or self.C.empty_attribute_default
@property
def given_names_list(self) -> list[str]:
"""
List of first name followed by middle names.
"""
return self.first_list + self.middle_list
@property
def given_names(self) -> str:
"""
A string of the first name followed by all middle names.
"""
return " ".join(self.given_names_list) or self.C.empty_attribute_default
def _split_last(self) -> tuple[list[str], list[str]]:
"""Return (prefix_particles, base_words) split from the last name.
The base_words list is never empty: if every word in the last name
matches a prefix particle, the guard fires and all words are returned
as the base with an empty prefix list (heuristic: a family name is
assumed not to consist entirely of particles).
>>> HumanName("Vincent van Gogh")._split_last()
(['van'], ['Gogh'])
>>> HumanName("Anh Do")._split_last()
([], ['Do'])
"""
words = " ".join(self.last_list).split()
i = 0
while i < len(words) and self.is_prefix(words[i]):
i += 1
if i == len(words):
# Heuristic: assume a family name isn't entirely composed of
# particles (e.g. surname "Do" which also appears in PREFIXES).
# Don't strip — treat the whole last name as the base.
return [], words
return words[:i], words[i:]
@property
def last_prefixes_list(self) -> list[str]:
"""
List of leading prefix particles in the last name (the *tussenvoegsel*).
Returns ``[]`` when there are none, including the case where every word
in the last name matches a prefix — see :py:meth:`_split_last`.
>>> HumanName("Juan de la Vega").last_prefixes_list
['de', 'la']
"""
return self._split_last()[0]
@property
def last_base_list(self) -> list[str]:
"""
List of last-name words after stripping leading prefix particles.
Never empty: when every word matches a prefix, no stripping occurs and
the full last name is returned — see :py:meth:`_split_last`.
>>> HumanName("Vincent van Gogh").last_base_list
['Gogh']
"""
return self._split_last()[1]
@property
def last_base(self) -> str:
"""
The last name with leading prefix particles removed (the core surname).
For ``"van Gogh"`` this is ``"Gogh"``; for ``"Smith"`` it is ``"Smith"``.
``last`` is always unchanged. When every word in the last name matches a
prefix particle, no stripping occurs and the full last name is returned.
>>> HumanName("Vincent van Gogh").last_base
'Gogh'
>>> HumanName("John Smith").last_base
'Smith'
"""
return " ".join(self.last_base_list) or self.C.empty_attribute_default
@property
def last_prefixes(self) -> str:
"""
The leading prefix particle(s) of the last name (the *tussenvoegsel*).
Returns ``""`` (or ``empty_attribute_default``) when there are none,
including when every word in the last name matches a prefix particle
(the all-particles guard; see :py:meth:`_split_last`).
>>> HumanName("Vincent van Gogh").last_prefixes
'van'
>>> HumanName("Juan de la Vega").last_prefixes
'de la'
"""
return " ".join(self.last_prefixes_list) or self.C.empty_attribute_default
@property
def family(self) -> str:
"""Alias for :py:attr:`last_base`."""
return self.last_base
@property
def family_prefixes(self) -> str:
"""Alias for :py:attr:`last_prefixes`."""
return self.last_prefixes
# setter methods
def _set_list(self, attr: str, value: str | list[str] | None) -> None:
if isinstance(value, list):
val = value
elif isinstance(value, (str, bytes)):
val = [value]
elif value is None:
val = []
else:
raise TypeError(
"Can only assign strings, lists or None to name attributes."
" Got {}".format(type(value)))
setattr(self, attr+"_list", self.parse_pieces(val))
# Parse helpers
[docs]
def is_title(self, value: str) -> bool:
    """True when the ``lc()``-normalized value is in the configured titles set (:py:data:`~nameparser.config.titles.TITLES`)."""
    normalized = lc(value)
    return normalized in self.C.titles
[docs]
def is_leading_title(self, piece: str) -> bool:
"""
True if ``piece`` is a known title, or an unrecognized multi-letter
word ending in a single trailing period (e.g. ``"Major."``). The
``{2,}`` in the ``period_abbreviation`` regex, not a separate
``is_an_initial()`` check, is what excludes single-letter initials
like ``"J."``. Only meaningful for pieces in the title position
(before the first name is set) — a period-abbreviation appearing
later in the name is left as a middle name. Does not mutate
``C.titles``, so the periodless form (``"Major"``) is never affected
in later parses.
"""
return self.is_title(piece) or bool(self.C.regexes.period_abbreviation.match(piece))
[docs]
def is_conjunction(self, piece: str) -> bool:
"""Is in the conjunctions set and not :py:func:`is_an_initial()`."""
if isinstance(piece, list):
for item in piece:
if self.is_conjunction(item):
return True
else:
return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)
[docs]
def is_prefix(self, piece: str | list[str]) -> bool:
    """
    Lowercased, leading/trailing-periods-stripped version of piece is in the
    :py:data:`~nameparser.config.prefixes.PREFIXES` set.

    :param piece: a single piece, or a list of pieces, in which case the
        result is True if any item is a prefix.
    :return: always a real ``bool``; a list without any prefix yields
        ``False`` rather than falling off the end and returning ``None``.
    """
    if isinstance(piece, list):
        return any(self.is_prefix(item) for item in piece)
    return lc(piece) in self.C.prefixes
[docs]
def is_first_name_prefix(self, piece: str) -> bool:
    """True when the ``lc()``-normalized piece is in :py:attr:`~nameparser.config.Constants.first_name_prefixes`."""
    normalized = lc(piece)
    return normalized in self.C.first_name_prefixes
def _join_first_name_prefix(self, pieces: list[str], reserve_last: bool) -> list[str]:
"""Join a first-name prefix to its following piece.
Finds the first non-title piece; if it is in ``first_name_prefixes``,
merges it with the next piece — unless ``reserve_last`` is True and no
further piece would remain for the last name.
"""
fi = next((i for i, p in enumerate(pieces) if not self.is_title(p)), None)
if fi is None:
return pieces
if not self.is_first_name_prefix(pieces[fi]):
return pieces
next_i = fi + 1
if next_i >= len(pieces):
return pieces
if reserve_last:
# Count non-suffix pieces from next_i onward; need ≥2 so the join
# target and at least one last-name piece both exist.
non_suffix_remaining = sum(
1 for p in pieces[next_i:] if not self.is_suffix(p)
)
if non_suffix_remaining <= 1:
return pieces
pieces[fi] = pieces[fi] + " " + pieces[next_i]
del pieces[next_i]
return pieces
[docs]
def is_roman_numeral(self, value: str) -> bool:
"""
Matches the ``roman_numeral`` regular expression in
:py:data:`~nameparser.config.regexes.REGEXES`.
"""
return bool(self.C.regexes.roman_numeral.match(value))
[docs]
def is_suffix(self, piece: str | list[str]) -> bool:
    """
    Is in the suffixes set and not :py:func:`is_an_initial()`.

    Some suffixes may be acronyms (M.B.A) while some are not (Jr.),
    so we remove the periods from `piece` when testing against
    `C.suffix_acronyms`.

    :param piece: a single piece, or a list of pieces, in which case the
        result is True if any item is a suffix.
    :return: always a real ``bool``; a list without any suffix yields
        ``False`` rather than falling off the end and returning ``None``.
    """
    if isinstance(piece, list):
        return any(self.is_suffix(item) for item in piece)
    normalized = lc(piece)
    # suffixes may have periods inside them like "M.D."
    is_known = (normalized.replace('.', '') in self.C.suffix_acronyms
                or normalized in self.C.suffix_not_acronyms)
    return is_known and not self.is_an_initial(piece)
[docs]
def are_suffixes(self, pieces: Iterable[str]) -> bool:
"""Return True if all pieces are suffixes."""
for piece in pieces:
if not self.is_suffix(piece):
return False
return True
[docs]
def are_suffixes_after_comma(self, pieces: Iterable[str]) -> bool:
    """Like :py:func:`are_suffixes`, but any piece found in
    ``suffix_not_acronyms`` is accepted outright, without going through
    ``is_suffix()``.

    Used when detecting the suffix-comma format (e.g. "John Ingram, V"),
    where the post-comma position is unambiguous. This covers all
    ``suffix_not_acronyms`` members (i, ii, iii, iv, v, jr, sr, etc.),
    case-insensitively, including single-letter entries that ``is_suffix()``
    would otherwise reject via ``is_an_initial()``.
    """
    return all(
        lc(piece) in self.C.suffix_not_acronyms or self.is_suffix(piece)
        for piece in pieces
    )
[docs]
def is_suffix_at_lastname_comma_end(self, piece: str, nxt: str | None, parts: list[str]) -> bool:
    """Decide whether ``piece`` is a ``suffix_not_acronyms`` member that
    should count as a suffix at the end of ``parts[1]`` (the post-comma
    segment) of a lastname-comma name. ``parts`` is the full comma-split of
    the name string.

    All three conditions must hold:

    - ``nxt is None``: piece is the last token in the post-comma segment
    - ``len(parts) == 2``: no ``parts[2]`` suffix segment exists
    - ``lc(piece) in suffix_not_acronyms``

    When ``parts[2]`` exists the caller already declared an explicit suffix
    via comma (e.g. 'Doe, Rev. John V, Jr.'), which makes the trailing token
    more likely a middle initial; requiring ``len(parts) == 2`` excludes
    that case.

    Used as an OR alternative to ``is_suffix()`` for pieces that
    ``is_suffix()`` would reject via ``is_an_initial()``.
    """
    if nxt is not None:
        return False
    if len(parts) != 2:
        return False
    return lc(piece) in self.C.suffix_not_acronyms
[docs]
def is_rootname(self, piece: str) -> bool:
    """
    True for a plain first, middle or last name: the piece is not a known
    title, suffix or prefix and is not an initial.
    """
    if lc(piece) in self.C.suffixes_prefixes_titles:
        return False
    return not self.is_an_initial(piece)
[docs]
def is_an_initial(self, value: str) -> bool:
"""
Words with a single period at the end, or a single uppercase letter.
Matches the ``initial`` regular expression in
:py:data:`~nameparser.config.regexes.REGEXES`.
"""
return bool(self.C.regexes.initial.match(value))
[docs]
def is_patronymic(self, piece: str) -> bool:
"""
Return True if ``piece`` ends with a recognised East-Slavic patronymic
suffix, checked against both Latin-script and Cyrillic patterns in
``self.C.regexes``. Latin suffixes: ``-ovich``, ``-ovna``, ``-evich``,
``-evna``, ``-ichna``, and the irregular forms ``-ilyich``, ``-kuzmich``,
``-lukich``, ``-fomich``, ``-fokich``. Cyrillic equivalents are matched
by a separate pattern.
"""
return bool(
self.C.regexes.patronymic.search(piece)
or self.C.regexes.patronymic_cyrillic.search(piece)
)
# full_name parser
@property
def full_name(self) -> str:
"""The string output of the HumanName instance."""
return self.__str__()
@full_name.setter
def full_name(self, value: str | bytes) -> None:
self.original = value
if isinstance(value, bytes):
self._full_name = value.decode(self.encoding)
else:
self._full_name = value
self.parse_full_name()
def collapse_whitespace(self, string: str) -> str:
# collapse multiple spaces into single space
string = self.C.regexes.spaces.sub(" ", string.strip())
if string.endswith(","):
string = string[:-1]
return string
[docs]
def pre_process(self) -> None:
"""
This method happens at the beginning of the :py:func:`parse_full_name`
before any other processing of the string aside from unicode
normalization, so it's a good place to do any custom handling in a
subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`.
"""
self.fix_phd()
self.parse_nicknames()
self.squash_emoji()
[docs]
def handle_patronymic_name_order(self) -> None:
"""
When patronymic_name_order is enabled, detect Russian formal order
(Surname GivenName Patronymic) and rotate to Western order.
Fires only for no-comma, single-token first/middle/last where the last
token is a patronymic and the middle token is not. Title, suffix, and
nickname parts do not affect this guard — reordering proceeds regardless
of whether they are present.
"""
if (
not self._had_comma
and len(self.first_list) == 1
and len(self.middle_list) == 1
and len(self.last_list) == 1
and self.is_patronymic(self.last_list[0])
and not self.is_patronymic(self.middle_list[0])
):
self.first_list, self.middle_list, self.last_list = (
self.middle_list,
self.last_list,
self.first_list,
)
[docs]
def handle_middle_name_as_last(self) -> None:
"""
When middle_name_as_last is enabled, fold middle_list into last_list
(prepended, preserving order) and clear middle_list. No-op when
middle_list is already empty.
"""
self.last_list = self.middle_list + self.last_list
self.middle_list = []
[docs]
def post_process(self) -> None:
"""
This happens at the end of the :py:func:`parse_full_name` after
all other processing has taken place. Runs :py:func:`handle_firstnames`
and :py:func:`handle_capitalization`.
"""
self.handle_firstnames()
if self.C.patronymic_name_order:
self.handle_patronymic_name_order()
if self.C.middle_name_as_last:
self.handle_middle_name_as_last()
self.handle_capitalization()
def fix_phd(self) -> None:
_re = self.C.regexes.phd
if match := _re.search(self._full_name):
self.suffix_list.extend(match.groups())
self._full_name = _re.sub('', self._full_name)
[docs]
def parse_nicknames(self) -> None:
    """
    Move the content of parenthesis or quotes in the name to the nicknames
    list, unless that content is suffix-shaped -- an unambiguous
    suffix_not_acronyms/suffix_acronyms member, or content ending in a
    period -- in which case it is left in place (undelimited) for normal
    downstream suffix/title/word parsing instead. This happens before any
    other processing of the name.

    Single quotes cannot span white space characters and must border
    white space to allow for quotes in names like O'Connor and Kawai'ae'a.
    Double quotes and parenthesis can span white space.

    Loops through the built-in `quoted_word`, `double_quotes` and
    `parenthesis` patterns in :py:attr:`~nameparser.config.Constants.regexes`,
    followed by any patterns added to
    :py:attr:`~nameparser.config.Constants.extra_nickname_delimiters` --
    see the "Adding Custom Nickname Delimiters" section of the
    customization docs.
    """
    def replace_delimited(found: 're.Match[str]') -> str:
        # A pattern without a capturing group (e.g. a custom override regex
        # like EMPTY_REGEX) has no group(1); use the whole match, which is
        # what findall() returns for group-less patterns.
        inner = found.group(1) if found.lastindex else found.group(0)
        lowered = lc(inner)
        # Not delegated to self.is_suffix(inner): that also rejects
        # single-letter initials via is_an_initial(), which is irrelevant
        # here, and the suffix_acronyms_ambiguous exclusion has to apply to
        # the acronym branch only. As in is_suffix(), acronym membership is
        # tested with *all* periods removed ("M.D", "Ph.D"), while
        # suffix_not_acronyms is tested against the lc() form, which only
        # loses leading/trailing periods.
        without_periods = lowered.replace('.', '')
        suffix_shaped = lowered in self.C.suffix_not_acronyms
        if not suffix_shaped:
            suffix_shaped = (
                without_periods in self.C.suffix_acronyms
                and without_periods not in self.C.suffix_acronyms_ambiguous
            )
        if not (suffix_shaped or inner.endswith('.')):
            self.nickname_list.append(inner)
            return ''
        # Hand back the bare content, without its delimiters, so that
        # downstream word-splitting/suffix-matching sees it as if it had
        # never been wrapped: is_suffix()/lc() strip periods only, so a
        # literal "(Ret)" would never match suffix_not_acronyms ("ret").
        return inner

    # One handler for every delimiter: suffix-shaped content is rare in
    # quotes but not impossible, and the logic does not depend on the
    # delimiter. The three built-ins are read from self.C.regexes at call
    # time (not copied), so overriding e.g. self.C.regexes.parenthesis keeps
    # working; extra_nickname_delimiters comes last so callers can add
    # delimiter patterns at runtime without overriding parse_nicknames()
    # itself -- see issue #112.
    patterns = (
        self.C.regexes.quoted_word,
        self.C.regexes.double_quotes,
        self.C.regexes.parenthesis,
        *self.C.extra_nickname_delimiters.values(),
    )
    for pattern in patterns:
        self._full_name = pattern.sub(replace_delimited, self._full_name)
[docs]
def squash_emoji(self) -> None:
"""
Remove emoji from the input string.
"""
re_emoji = self.C.regexes.emoji
if re_emoji and re_emoji.search(self._full_name):
self._full_name = re_emoji.sub('', self._full_name)
[docs]
def handle_firstnames(self) -> None:
    """
    If there are only two parts and one is a title, assume the other is a
    last name instead of a first name, e.g. Mr. Johnson. The exception is a
    title listed in ``first_name_titles`` (like "Sir"): a single name
    following such a title is always a first name.
    """
    if not self.title:
        return
    if len(self) != 2:
        return
    if lc(self.title) in self.C.first_name_titles:
        return
    # swap through the property setters so both piece lists are rebuilt
    self.last, self.first = self.first, self.last
def parse_full_name(self) -> None:
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas:

    * no comma: ``title first middle middle middle last suffix``
    * suffix comma: ``title first middle last [suffix], suffix [, suffix]``
    * lastname comma: ``last [suffix], title first middles[,] suffix [,suffix]``

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    ``unparsable`` is left ``True`` (and the original input is logged) when
    no name attribute ended up with a value; otherwise it is cleared.
    :py:func:`post_process` runs in both cases.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]
    self._had_comma = len(parts) > 1

    if self.suffix_delimiter and len(parts) > 1:
        # Split every post-comma part on the extra delimiter as well,
        # dropping the empty fragments.
        expanded = [parts[0]]
        for part in parts[1:]:
            expanded.extend([p for p in (p.strip() for p in part.split(self.suffix_delimiter)) if p])
        parts = expanded

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        pieces = self._join_first_name_prefix(pieces, reserve_last=True)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            try:
                nxt = pieces[i + 1]
            except IndexError:
                nxt = None

            # title must have a next piece, unless it's just a title
            if not self.first \
                    and (nxt or p_len == 1) \
                    and self.is_leading_title(piece):
                self.title_list.append(piece)
                continue
            if not self.first:
                if p_len == 1 and self.nickname:
                    self.last_list.append(piece)
                    continue
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i+1:]) or \
                    (
                        # if the next piece is the last piece and a roman
                        # numeral but this piece is not an initial
                        nxt is not None and
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)
                    ):
                self.last_list.append(piece)
                self.suffix_list += pieces[i+1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one piece
        # in the first part. (Suffixes will never appear after last names
        # only, and allows potential first names to be in suffixes, e.g.
        # "Johnson, Bart"

        post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1)

        if self.are_suffixes_after_comma(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", str(pieces))
            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if not self.first \
                        and (nxt or len(pieces) == 1) \
                        and self.is_leading_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i+1:]):
                    self.last_list.append(piece)
                    self.suffix_list = pieces[i+1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]

            log.debug("post-comma pieces: %s", str(post_comma_pieces))
            post_comma_pieces = self._join_first_name_prefix(post_comma_pieces, reserve_last=False)

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(post_comma_pieces):
                try:
                    nxt = post_comma_pieces[i + 1]
                except IndexError:
                    nxt = None

                if not self.first \
                        and (nxt or len(post_comma_pieces) == 1) \
                        and self.is_leading_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece) or self.is_suffix_at_lastname_comma_end(piece, nxt, parts):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)
            try:
                if parts[2]:
                    self.suffix_list += parts[2:]
            except IndexError:
                pass

    # len() counts the non-empty name attributes. It can never be negative,
    # so the previous ``< 0`` test made this branch unreachable and every
    # input -- even an empty string -- was reported as parsable.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
[docs]
def parse_pieces(self, parts: Iterable[str], additional_parts_count: int = 0) -> list[str]:
    """
    Turn comma-separated name parts into a flat list of name pieces.

    Every part is split on single spaces and each resulting token has its
    surrounding spaces and commas stripped. Any token matched by the
    ``period_not_at_end`` regex (e.g. ``"Lt.Gov."``) is then split on
    periods: if at least one chunk is a title the whole token is added to
    ``self.C.titles``; otherwise, if at least one chunk is a suffix, the
    token is added to ``self.C.suffix_not_acronyms``. Registering the
    token this way lets it be recognised when it is looked up later.
    The tokens are finally passed through :py:func:`join_on_conjunctions`.

    :param list parts: name part strings from the comma split
    :param int additional_parts_count:
        if the comma format contains other parts, we need to know
        how many there are to decide if things should be considered a
        conjunction.
    :return: pieces split on spaces and joined on conjunctions
    :rtype: list
    :raises TypeError: if a part is neither ``str`` nor ``bytes``
    """
    tokens: list[str] = []
    for part in parts:
        if not isinstance(part, (str, bytes)):
            raise TypeError("Name parts must be strings. "
                            " Got {}".format(type(part)))
        tokens.extend(chunk.strip(' ,') for chunk in part.split(' '))

    # Tokens such as "Lt.Gov." are several titles or suffixes written
    # together without spaces; teach the constants about the combined form
    # so that it is parsed correctly later on.
    for token in tokens:
        inner_period = self.C.regexes.period_not_at_end
        if not inner_period or not inner_period.match(token):
            continue
        fragments = token.split(".")
        title_hits = [frag for frag in fragments if self.is_title(frag)]
        suffix_hits = [frag for frag in fragments if self.is_suffix(frag)]
        if title_hits:
            self.C.titles.add(token)
        elif suffix_hits:
            self.C.suffix_not_acronyms.add(token)

    return self.join_on_conjunctions(tokens, additional_parts_count)
[docs]
def join_on_conjunctions(self, pieces: list[str], additional_parts_count: int = 0) -> list[str]:
    """
    Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

        ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                        ['Mr. and Mrs.', 'John', 'Doe']

        ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                        ['The Secretary of State', 'Hillary', 'Clinton']

    When joining titles, saves newly formed piece to the instance's titles
    constant so they will be parsed correctly later. E.g. after parsing the
    example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
    be present in the titles constant set. Likewise, runs of adjacent
    conjunctions are added to the conjunctions constant, and a piece formed
    by joining a conjunction to a prefix is added to the prefixes constant.

    After the conjunctions are handled, each prefix is joined to the pieces
    that follow it, up to the next prefix or suffix.

    The ``pieces`` list passed in is modified in place while conjunctions
    are merged; the returned list may be that same object or a new list.

    :param list pieces: name pieces strings after split on spaces
    :param int additional_parts_count:
        number added to both the piece count and the rootname count used
        by the length checks below
    :return: new list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list
    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces
    rootname_pieces = [p for p in pieces if self.is_rootname(p)]
    total_length = len(rootname_pieces) + additional_parts_count
    # find all the conjunctions, join any conjunctions that are next to each
    # other, then join those newly joined conjunctions and any single
    # conjunctions to the piece before and after it
    conj_index = [i for i, piece in enumerate(pieces)
                  if self.is_conjunction(piece)]
    # (first, last) index pairs for every run of two or more adjacent
    # conjunctions
    contiguous_conj_i = group_contiguous_integers(conj_index)
    delete_i: list[int] = []
    for cont_i in contiguous_conj_i:
        # collapse the run into its first slot and remember the other
        # slots so they can be deleted afterwards
        new_piece = " ".join(pieces[cont_i[0]: cont_i[1]+1])
        delete_i += list(range(cont_i[0]+1, cont_i[1]+1))
        pieces[cont_i[0]] = new_piece
        # add newly joined conjunctions to constants to be found later
        self.C.conjunctions.add(new_piece)
    for i in reversed(delete_i):
        # delete pieces in reverse order or the index changes on each delete
        del pieces[i]
    if len(pieces) == 1:
        # if there's only one piece left, nothing left to do
        return pieces
    # refresh conjunction index locations
    conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
    # conj_index is updated in place inside this loop so that the indices
    # still to be visited keep pointing at the right pieces after removals
    for i in conj_index:
        if len(pieces[i]) == 1 and total_length < 4 and pieces[i].isalpha():
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue
        if i == 0:
            # leading conjunction: there is no previous piece, so join it
            # to the following piece only
            new_piece = " ".join(pieces[i:i+2])
            if self.is_title(pieces[i+1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            if self.is_prefix(pieces[i+1]):
                # when joining to a prefix, make new_piece a prefix too, so
                # e.g. "von" + "und" bridges into "von und" and can still
                # chain onto a following prefix/lastname (see "von und zu")
                self.C.prefixes.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i+1)
            # subtract 1 from the index of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val-1
        else:
            # join the previous piece, the conjunction and the next piece
            # (the slice is simply shorter when there is no next piece)
            new_piece = " ".join(pieces[i-1:i+2])
            if self.is_title(pieces[i-1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            if self.is_prefix(pieces[i-1]):
                # when joining to a prefix, make new_piece a prefix too, so
                # e.g. "von" + "und" bridges into "von und" and can still
                # chain onto a following prefix/lastname (see "von und zu")
                self.C.prefixes.add(new_piece)
            pieces[i-1] = new_piece
            # remove the conjunction itself ...
            pieces.pop(i)
            rm_count = 2
            try:
                # ... and the piece that followed it, if there was one
                pieces.pop(i)
            except IndexError:
                rm_count = 1
            # subtract the number of removed pieces from the index
            # of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val - rm_count
    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = list(filter(self.is_prefix, pieces))
    if prefixes:
        for prefix in prefixes:
            try:
                i = pieces.index(prefix)
            except ValueError:
                # If the prefix is no longer in pieces, it's because it has been
                # combined with the prefix that appears right before (or before that when
                # chained together) in the last loop, so the index of that newly created
                # piece is the same as in the last loop, i==i still, and we want to join
                # it to the next piece.
                pass
            new_piece = ''
            # join everything after the prefix until the next prefix or suffix
            try:
                if i == 0 and total_length >= 1:
                    # A prefix in first position is left alone and assumed
                    # to be a first name.
                    # NOTE(review): the earlier comment here said "more than
                    # 1 rootnames", but the check passes whenever
                    # total_length is at least 1 -- confirm which is intended.
                    continue
                next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
                j = pieces.index(next_prefix, i + 1)
                if j == i + 1:
                    # if there are two prefixes in sequence, join to the following piece
                    j += 1
                new_piece = ' '.join(pieces[i:j])
                pieces = pieces[:i] + [new_piece] + pieces[j:]
            except StopIteration:
                try:
                    # if there are no more prefixes, look for a suffix to stop at
                    stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
                    # search from i + 1: filter() finds the value of stop_at
                    # in pieces[i+1:] but pieces.index() without a start
                    # argument searches from 0, so an earlier occurrence of
                    # the same token (e.g. a suffix token that also appears
                    # before the prefix) would be matched instead.
                    j = pieces.index(stop_at, i + 1)
                    new_piece = ' '.join(pieces[i:j])
                    pieces = pieces[:i] + [new_piece] + pieces[j:]
                except StopIteration:
                    # if there were no suffixes, nothing to stop at so join all
                    # remaining pieces
                    new_piece = ' '.join(pieces[i:])
                    pieces = pieces[:i] + [new_piece]
    log.debug("pieces: %s", pieces)
    return pieces
# Capitalization Support
def cap_word(self, word: str, attribute: HumanNameAttributeT) -> str:
    """
    Return ``word`` with the capitalization appropriate for a name part.

    Rules are tried in this order:

    1. A prefix inside the ``'last'`` or ``'middle'`` attribute, or any
       conjunction, is returned in lower case.
    2. A word found in ``self.C.capitalization_exceptions`` (looked up by
       its ``lc()`` form, then by that form with periods removed) is
       replaced by the configured value.
    3. A word matching the ``mac`` regex gets both captured groups
       capitalized.
    4. Anything else is returned via ``str.capitalize()``.

    :param str word: a single word taken from a name piece
    :param attribute: name of the attribute the word belongs to
    :return: the re-capitalized word
    :rtype: str
    """
    lowercase_prefix = self.is_prefix(word) and attribute in ('last', 'middle')
    if lowercase_prefix or self.is_conjunction(word):
        return word.lower()

    overrides = self.C.capitalization_exceptions
    normalized = lc(word)
    # try the normalized word first, then the same key without any periods
    for key in (normalized, normalized.replace('.', '')):
        if key in overrides:
            return overrides[key]

    mac_pattern = self.C.regexes.mac
    if not mac_pattern.match(word):
        return word.capitalize()

    def recase_groups(match: re.Match) -> str:
        head, tail = match.group(1, 2)
        return head.capitalize() + tail.capitalize()

    return mac_pattern.sub(recase_groups, word)
def cap_piece(self, piece: str, attribute: HumanNameAttributeT) -> str:
    """
    Re-capitalize every word of one name piece.

    Each match of the ``word`` regex inside ``piece`` is replaced by the
    result of :py:func:`cap_word` for that match.

    :param str piece: the name piece to process; a falsy value yields ``""``
    :param attribute: name of the attribute the piece belongs to, passed
        through to :py:func:`cap_word`
    :return: the piece with each word re-capitalized
    :rtype: str
    """
    word_pattern = self.C.regexes.word
    return word_pattern.sub(
        lambda match: self.cap_word(match.group(0), attribute),
        piece,
    ) if piece else ""
[docs]
def capitalize(self, force: bool | None = None) -> None:
"""
The HumanName class can try to guess the correct capitalization of name
entered in all upper or lower case. By default, it will not adjust the
case of names entered in mixed case. To run capitalization on all names
pass the parameter `force=True`.
:param bool force: Forces capitalization of mixed case strings. This
parameter overrides rules set within
:py:class:`~nameparser.config.CONSTANTS`.
**Usage**
.. doctest:: capitalize
>>> name = HumanName('bob v. de la macdole-eisenhower phd')
>>> name.capitalize()
>>> str(name)
'Bob V. de la MacDole-Eisenhower Ph.D.'
>>> # Don't touch good names
>>> name = HumanName('Shirley Maclaine')
>>> name.capitalize()
>>> str(name)
'Shirley Maclaine'
>>> name.capitalize(force=True)
>>> str(name)
'Shirley MacLaine'
"""
name = str(self)
force = self.C.force_mixed_case_capitalization \
if force is None else force
if not force and not (name == name.upper() or name == name.lower()):
return
self.title_list = self.cap_piece(self.title, 'title').split()
self.first_list = self.cap_piece(self.first, 'first').split()
self.middle_list = self.cap_piece(self.middle, 'middle').split()
self.last_list = self.cap_piece(self.last, 'last').split()
# suffix is stored comma-separated ("Ph.D., J.D."), not space-separated
self.suffix_list = [s for s in self.cap_piece(self.suffix, 'suffix').split(', ') if s]
[docs]
def handle_capitalization(self) -> None:
    """
    Apply the capitalization configuration set within
    :py:class:`~nameparser.config.CONSTANTS`.

    Calls :py:func:`capitalize` when ``capitalize_name`` is enabled on the
    instance's constants, and does nothing otherwise.
    """
    if not self.C.capitalize_name:
        return
    self.capitalize()