Source code for nameparser.config

"""
The :py:mod:`nameparser.config` module manages the configuration of the
nameparser. 

A module-level instance of :py:class:`~nameparser.config.Constants` is created
and used by default for all HumanName instances. You can adjust the entire module's
configuration by importing this instance and changing it.

::

    >>> from nameparser.config import CONSTANTS
    >>> CONSTANTS.titles.remove('hon').add('chemistry','dean') # doctest: +SKIP

You can also adjust the configuration of individual instances by passing
``None`` as the second argument upon instantiation.

::

    >>> from nameparser import HumanName
    >>> hn = HumanName("Dean Robert Johns", None)
    >>> hn.C.titles.add('dean') # doctest: +SKIP
    >>> hn.parse_full_name() # need to run this again after config changes

**Potential Gotcha**: If you do not pass ``None`` as the second argument,
``hn.C`` will be a reference to the module config, possibly yielding 
unexpected results. See `Customizing the Parser <customize.html>`_.
"""
import re
import sys
from collections.abc import Callable, Iterable, Iterator, Mapping, Set
from typing import Any, TypeVar, overload

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

from nameparser.util import lc
from nameparser.config.prefixes import PREFIXES
from nameparser.config.first_name_prefixes import FIRST_NAME_PREFIXES
from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS
from nameparser.config.conjunctions import CONJUNCTIONS
from nameparser.config.suffixes import SUFFIX_ACRONYMS
from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS
from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS
from nameparser.config.titles import TITLES
from nameparser.config.titles import FIRST_NAME_TITLES
from nameparser.config.regexes import EMPTY_REGEX, REGEXES

# Last-resort encoding for decoding byte strings in
# SetManager.add_with_encoding(), used when no explicit encoding is passed and
# sys.stdin does not supply one.
DEFAULT_ENCODING = 'UTF-8'


[docs] class SetManager(Set): ''' Easily add and remove config variables per module or instance. Subclass of ``collections.abc.Set``. Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. ''' _on_change: Callable[[], None] | None def __init__(self, elements: Iterable[str]) -> None: self.elements = set(elements) # Optional invalidation hook, wired by an owning Constants so that # in-place add()/remove() can clear its cached suffixes_prefixes_titles # union. None when the manager is used standalone. self._on_change = None def __call__(self) -> Set[str]: return self.elements def __repr__(self) -> str: return "SetManager({})".format(self.elements) # used for docs def __iter__(self) -> Iterator[str]: return iter(self.elements) def __contains__(self, value: object) -> bool: return value in self.elements def __len__(self) -> int: return len(self.elements)
[docs] def add_with_encoding(self, s: str, encoding: str | None = None) -> None: """ Add the lowercased, leading/trailing-periods-stripped version of the string to the set. Pass an explicit `encoding` parameter to specify the encoding of binary strings that are not DEFAULT_ENCODING (UTF-8). """ stdin_encoding = None if sys.stdin: stdin_encoding = sys.stdin.encoding encoding = encoding or stdin_encoding or DEFAULT_ENCODING if isinstance(s, bytes): s = s.decode(encoding) normalized = lc(s) if normalized not in self.elements: self.elements.add(normalized) if self._on_change: self._on_change()
[docs] def add(self, *strings: str) -> Self: """ Add the lowercased, leading/trailing-periods-stripped version of the string arguments to the set. Can pass a list of strings. Returns ``self`` for chaining. """ for s in strings: self.add_with_encoding(s) return self
[docs] def remove(self, *strings: str) -> Self: """ Remove the lower case and no-period version of the string arguments from the set. Returns ``self`` for chaining. """ changed = False for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) changed = True if changed and self._on_change: self._on_change() return self
[docs] def clear(self) -> Self: """Remove all entries from the set. Returns ``self`` for chaining.""" if self.elements: self.elements.clear() if self._on_change: self._on_change() return self
T = TypeVar('T')
[docs] class TupleManager(dict[str, T]): ''' A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' def __getattr__(self, attr: str) -> T | None: # Dunder names are Python's protocol probes (copy looks up __deepcopy__, # inspect.unwrap looks up __wrapped__, ...), never config keys. Report # them as genuinely absent so hasattr() is honest and those probes work; # otherwise the dict default is mistaken for a real protocol hook. See # RegexTupleManager.__getattr__ for the concrete failure this prevents. if attr.startswith("__") and attr.endswith("__"): raise AttributeError(attr) return self.get(attr) __setattr__ = dict.__setitem__ __delattr__ = dict.__delitem__ def __getstate__(self) -> Mapping[str, T]: return dict(self) def __setstate__(self, state: Mapping[str, T]) -> None: self.update(state) def __reduce__(self) -> tuple[type, tuple[()], Mapping[str, T]]: # Use type(self), not TupleManager, so subclasses such as # RegexTupleManager survive a pickle round-trip instead of being # downgraded to a plain TupleManager (which loses the EMPTY_REGEX # default for unknown keys). return (type(self), (), self.__getstate__())
class RegexTupleManager(TupleManager[re.Pattern[str]]):
    """
    ``TupleManager`` of compiled regexes. Reading an unknown, non-dunder
    attribute returns ``EMPTY_REGEX`` rather than ``None``.
    """

    def __getattr__(self, attr: str) -> re.Pattern[str]:
        # Dunder names are Python's protocol probes (copy.deepcopy looks up
        # __deepcopy__, inspect.unwrap looks up __wrapped__, ...), never regex
        # keys. They must look absent: if the EMPTY_REGEX default were
        # returned, it would be mistaken for a real protocol hook, e.g.
        # copy.deepcopy would try to call the re.Pattern and raise TypeError.
        is_protocol_probe = attr.startswith("__") and attr.endswith("__")
        if is_protocol_probe:
            raise AttributeError(attr)
        return self.get(attr, EMPTY_REGEX)
class _CachedUnionMember:
    """Descriptor for the four ``SetManager`` attributes whose union
    ``Constants`` caches in ``_pst`` (``prefixes``, ``suffix_acronyms``,
    ``suffix_not_acronyms``, ``titles``).

    Assigning a new manager, or mutating the current one in place with
    ``add()`` / ``remove()``, invalidates that cache. Putting the behavior on
    a descriptor scopes it to exactly these attributes, beside their
    declarations, instead of spreading it across a catch-all ``__setattr__``
    and a separate list of attribute names.
    """

    _attr: str

    def __set_name__(self, owner: type, name: str) -> None:
        # The value is stored on the instance under the underscore-prefixed name.
        self._attr = f"_{name}"

    @overload
    def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ...

    @overload
    def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ...

    def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember':
        # Class-level access returns the descriptor; instance access returns
        # the stored manager.
        return self if obj is None else getattr(obj, self._attr)

    def __set__(self, obj: 'Constants', value: SetManager) -> None:
        if not isinstance(value, SetManager):
            raise TypeError(
                f"Expected a SetManager instance, got {type(value).__name__!r}. "
                "Wrap your iterable: SetManager(['mr', 'ms'])"
            )
        storage_name = self._attr
        replaced = getattr(obj, storage_name, None)
        if isinstance(replaced, SetManager):
            # Detach the outgoing manager so later edits to it no longer
            # invalidate this owner's cache.
            replaced._on_change = None
        # Wire the new manager to the owner, drop the stale union, then store.
        value._on_change = obj._invalidate_pst
        obj._invalidate_pst()
        setattr(obj, storage_name, value)
class Constants:
    """
    An instance of this class holds all of the configuration constants for the parser.

    :param set prefixes:
        :py:attr:`prefixes` wrapped with :py:class:`SetManager`.
    :param set titles:
        :py:attr:`titles` wrapped with :py:class:`SetManager`.
    :param set first_name_titles:
        :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
    :param set suffix_acronyms:
        :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`.
    :param set suffix_not_acronyms:
        :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`.
    :param set suffix_acronyms_ambiguous:
        :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`.
    :param set conjunctions:
        :py:attr:`conjunctions` wrapped with :py:class:`SetManager`.
    :param set first_name_prefixes:
        :py:attr:`~first_name_prefixes.FIRST_NAME_PREFIXES` wrapped with :py:class:`SetManager`.
    :type capitalization_exceptions: tuple or dict
    :param capitalization_exceptions:
        :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
    :type regexes: tuple or dict
    :param regexes:
        :py:attr:`regexes` wrapped with :py:class:`RegexTupleManager`.
    :param bool patronymic_name_order:
        instance-level value for :py:attr:`patronymic_name_order`.
    :param bool middle_name_as_last:
        instance-level value for :py:attr:`middle_name_as_last`.
    """

    # The union of these four managers is cached in ``_pst``; the descriptor
    # invalidates that cache on assignment and on in-place mutation.
    prefixes = _CachedUnionMember()
    suffix_acronyms = _CachedUnionMember()
    suffix_not_acronyms = _CachedUnionMember()
    titles = _CachedUnionMember()

    first_name_titles: SetManager
    conjunctions: SetManager
    first_name_prefixes: SetManager
    suffix_acronyms_ambiguous: SetManager
    capitalization_exceptions: TupleManager[str]
    regexes: RegexTupleManager
    extra_nickname_delimiters: TupleManager[re.Pattern[str]]
    _pst: Set[str] | None

    string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
    """
    The default string format used for all new `HumanName` instances.
    """

    initials_format = "{first} {middle} {last}"
    """
    The default initials format used for all new `HumanName` instances.
    """

    initials_delimiter = "."
    """
    The default initials delimiter used for all new `HumanName` instances.
    Will be used to add a delimiter between each initial.
    """

    initials_separator = " "
    """
    The default separator placed between consecutive initials within a name
    group (first, middle, or last). Distinct from ``initials_delimiter``,
    which is the trailing character after each individual initial.

    With defaults ``initials_delimiter="."`` and ``initials_separator=" "``,
    ``initials()`` produces ``"J. A. D."``. Setting ``initials_separator=""``
    with ``initials_delimiter="."`` and ``initials_format="{first}{middle}{last}"``
    produces ``"J.A.D."``. With the default ``initials_format``, group-level
    spacing from the template is still applied.
    """

    # Annotated explicitly: a bare ``= None`` would be inferred as type
    # ``None`` and reject the documented string values.
    suffix_delimiter: str | None = None
    """
    If set, an additional delimiter used to split suffix groups after
    comma-splitting. For example, setting ``suffix_delimiter=" - "`` allows
    ``"RN - CRNA"`` to be parsed as two separate suffixes. Default is ``None``
    (no additional splitting beyond the standard comma split).

    Note: setting this to ``","`` or ``", "`` has no additional effect — the
    full name is already split on bare commas first, and each resulting part
    is stripped of surrounding whitespace before this step runs.

    Known limitation: the expansion is applied to all post-comma parts, not
    just suffix groups. In inverted format (``"Last, First, suffix"``), the
    first-name part is also split on the delimiter. In practice this is
    harmless since first names rarely contain the delimiter string, but a
    name like ``"Doe, Mary - Kate, RN"`` with ``suffix_delimiter=" - "``
    would misparse.
    """

    empty_attribute_default = ''
    """
    Default return value for empty attributes.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.empty_attribute_default = None
        >>> name = HumanName("John Doe")
        >>> print(name.title)
        None
        >>> name.first
        'John'

    """

    capitalize_name = False
    """
    If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to
    :py:class:`~nameparser.parser.HumanName` instance.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.capitalize_name = True
        >>> name = HumanName("bob v. de la macdole-eisenhower phd")
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'

    """

    force_mixed_case_capitalization = False
    """
    If set, forces the capitalization of mixed case strings when
    :py:meth:`~nameparser.parser.HumanName.capitalize` is called.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.force_mixed_case_capitalization = True
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley MacLaine'

    """

    patronymic_name_order = False
    """
    If set, detects names in Russian formal order (``Surname GivenName
    Patronymic``) by recognizing a trailing East-Slavic patronymic suffix on
    the last token, and rotates the three name parts so that
    ``first``/``middle``/``last`` map to given name / patronymic / surname
    respectively. Detection requires exactly one token in each of first,
    middle, and last; names with multi-part given names or multiple middle
    names are left unchanged.

    Opt-in because a Western person whose surname happens to end in a
    patronymic suffix (e.g. ``"David Michael Abramovich"``) will be reordered
    incorrectly when the flag is on. Enable only when your data is
    predominantly Russian formal-order names.

    For per-instance control without a shared ``Constants``, pass a dedicated
    instance: ``HumanName("...", constants=Constants(patronymic_name_order=True))``.

    .. doctest::

        >>> from nameparser import HumanName
        >>> from nameparser.config import Constants
        >>> C = Constants(patronymic_name_order=True)
        >>> hn = HumanName("Ivanov Ivan Ivanovich", constants=C)
        >>> hn.first, hn.middle, hn.last
        ('Ivan', 'Ivanovich', 'Ivanov')

    """

    middle_name_as_last = False
    """
    If set, folds middle names into the last name: ``middle_list`` is
    prepended to ``last_list`` and ``middle_list`` is cleared, so ``.last``
    becomes what ``.surnames`` already was and ``.middle`` becomes empty.
    Useful for naming systems with no middle-name concept, where everything
    after the given name is lineage/family (e.g. Arabic patronymic chaining:
    given + father + grandfather + family).

    The fold is uniform across both no-comma and comma ("Last, First Middle")
    input, so the two written forms of a name converge on the same result.

    For per-instance control without a shared ``Constants``, pass a dedicated
    instance: ``HumanName("...", constants=Constants(middle_name_as_last=True))``.

    .. doctest::

        >>> from nameparser import HumanName
        >>> from nameparser.config import Constants
        >>> C = Constants(middle_name_as_last=True)
        >>> hn = HumanName("Mohamad Ahmad Ali Hassan", constants=C)
        >>> hn.first, hn.middle, hn.last
        ('Mohamad', '', 'Ahmad Ali Hassan')

    """

    def __init__(self,
                 prefixes: Iterable[str] = PREFIXES,
                 suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS,
                 suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS,
                 suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS,
                 titles: Iterable[str] = TITLES,
                 first_name_titles: Iterable[str] = FIRST_NAME_TITLES,
                 conjunctions: Iterable[str] = CONJUNCTIONS,
                 first_name_prefixes: Iterable[str] = FIRST_NAME_PREFIXES,
                 capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS,
                 regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES,
                 patronymic_name_order: bool = False,
                 middle_name_as_last: bool = False,
                 ) -> None:
        # These four descriptor assignments call _CachedUnionMember.__set__, which
        # calls _invalidate_pst() and establishes self._pst. They must come before
        # any read of suffixes_prefixes_titles.
        self.prefixes = SetManager(prefixes)
        self.suffix_acronyms = SetManager(suffix_acronyms)
        self.suffix_not_acronyms = SetManager(suffix_not_acronyms)
        self.titles = SetManager(titles)
        self.first_name_titles = SetManager(first_name_titles)
        self.conjunctions = SetManager(conjunctions)
        self.first_name_prefixes = SetManager(first_name_prefixes)
        self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous)
        self.capitalization_exceptions = TupleManager(capitalization_exceptions)
        self.regexes = RegexTupleManager(regexes)
        # Named, appendable group of *additional* delimiter patterns that
        # parse_nicknames() iterates after its three built-in delimiters
        # (quoted_word/double_quotes/parenthesis, read live from self.regexes
        # so overriding those keeps working as before). Empty by default; add
        # a pattern here (and re-parse) to recognize a new delimiter without
        # needing to override parse_nicknames() itself. See issue #112.
        self.extra_nickname_delimiters = TupleManager()
        self.patronymic_name_order = patronymic_name_order
        self.middle_name_as_last = middle_name_as_last

    def _invalidate_pst(self) -> None:
        """Drop the cached union so the next read of
        ``suffixes_prefixes_titles`` rebuilds it."""
        self._pst = None

    @property
    def suffixes_prefixes_titles(self) -> Set[str]:
        """Union of ``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``
        and ``titles``, built on first access and cached until invalidated."""
        if self._pst is None:
            self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles
        return self._pst

    def __repr__(self) -> str:
        return "<Constants() instance>"

    def __setstate__(self, state: Mapping[str, Any]) -> None:
        """Restore pickled configuration.

        :raises ValueError: if ``state`` lacks one of the descriptor-backed
            collections (``prefixes``, ``suffix_acronyms``,
            ``suffix_not_acronyms``, ``titles``).
        """
        # Restore each saved attribute directly. The previous implementation
        # passed the whole state dict to __init__ as the ``prefixes`` argument,
        # which silently reverted every collection to its module default on
        # unpickling.
        self._pst = None
        for name, value in state.items():
            setattr(self, name, value)
        # Verify each descriptor-backed attr was restored. Without this, a missing
        # key surfaces later as AttributeError: 'Constants' object has no attribute
        # '_prefixes' — the private storage name, not the public one, making it
        # very hard to diagnose.
        #
        # Walk the MRO rather than vars(type(self)): a class namespace holds only
        # that class's own attributes, so for a subclass of Constants the
        # inherited descriptors would not be found and this check would be
        # silently skipped.
        cls = type(self)
        for klass in cls.__mro__:
            for attr, member in vars(klass).items():
                if not isinstance(member, _CachedUnionMember):
                    continue
                if getattr(cls, attr, None) is not member:
                    # Shadowed by a more-derived class; not the active attribute.
                    continue
                if not hasattr(self, '_' + attr):
                    raise ValueError(
                        f"Pickle state is missing required field {attr!r}. "
                        "The state blob may be truncated or from an incompatible version."
                    )

    def __getstate__(self) -> Mapping[str, Any]:
        """Return the instance's own configuration for pickling."""
        # Pickle the instance's own configuration: the collections built in
        # __init__ plus any instance-level scalar overrides.
        # _CachedUnionMember descriptors store their values with a leading
        # underscore (e.g. `_prefixes` for `prefixes`) so that the descriptor's
        # __set__ owns assignment. We map those back to the public names so
        # __setstate__ can restore them through the descriptor, re-wiring the
        # invalidation callbacks. All other underscore-prefixed names (_pst, etc.)
        # are private/cache and are intentionally excluded.
        state: dict[str, Any] = {}
        for name, val in self.__dict__.items():
            if name.startswith('_'):
                public = name[1:]
                if isinstance(getattr(type(self), public, None), _CachedUnionMember):
                    state[public] = val
            else:
                state[name] = val
        return state
#: A module-level instance of the :py:class:`Constants()` class. #: Provides a common instance for the module to share #: to easily adjust configuration for the entire module. #: See `Customizing the Parser with Your Own Configuration <customize.html>`_. CONSTANTS = Constants()