Files
2025-09-06 22:03:29 -04:00

77 lines
3.1 KiB
Python

# -*- coding: utf-8 -*-
import six
from geodata.text import _normalize
from geodata.text.token_types import token_types
from geodata.encoding import safe_decode
# String options
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS
# Token options
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS
TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS
DEFAULT_TOKEN_OPTIONS_NUMERIC = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC
def remove_parens(tokens):
new_tokens = []
open_parens = 0
for t, c in tokens:
if c == token_types.PUNCT_OPEN:
open_parens += 1
elif c == token_types.PUNCT_CLOSE:
if open_parens > 0:
open_parens -= 1
elif open_parens <= 0:
new_tokens.append((t, c))
return new_tokens
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
s = safe_decode(s)
return _normalize.normalize_string(s, string_options)
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
token_options=DEFAULT_TOKEN_OPTIONS,
strip_parentheticals=True, whitespace=False):
'''
Normalizes a string, tokenizes, and normalizes each token
with string and token-level options.
This version only uses libpostal's deterministic normalizations
i.e. methods with a single output. The string tree version will
return multiple normalized strings, each with tokens.
Usage:
normalized_tokens(u'St.-Barthélemy')
'''
s = safe_decode(s)
normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace)
if strip_parentheticals:
normalized_tokens = remove_parens(normalized_tokens)
return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens]