[abbreviations] Adding ability to abbreviate within hyphenated phrases e.g. Sint-Maarten => St.-Maarten

Author: Al
Date:   2016-08-24 17:32:28 -04:00
parent a6dad74a2b
commit dfa5c8e0a6
2 changed files with 145 additions and 84 deletions


@@ -1,9 +1,11 @@
import random
import re

import six

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex

LOWER, UPPER, TITLE, MIXED = range(4)
@@ -20,30 +22,45 @@ def token_capitalization(s):
    return MIXED


expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')


def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
    # Pair each word in the expansion with its trailing separator ('.', '-', ' ')
    # so recasing can preserve punctuation, e.g. in 'st.-maarten'
    expansion_tokens = expansion_token_regex.findall(expansion)

    if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
        expansion_tokens = tokenize(expansion)
        is_acronym = len(expansion_tokens) == 1 and expansion_tokens[0][1] == token_types.ACRONYM
        if len(expansion) <= 3 or is_acronym:
            return expansion.upper()
        else:
            return expansion.title()
    elif len(tokens) == len(expansion_tokens):
        strings = []
        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
            cap = token_capitalization(t)
            # A plain space separator becomes space_token (e.g. a hyphen when
            # recasing inside a hyphenated phrase)
            if suf == six.u(' '):
                suf = space_token
            if cap == LOWER:
                strings.append(six.u('').join((e.lower(), suf)))
            elif cap == UPPER:
                strings.append(six.u('').join((e.upper(), suf)))
            elif cap == TITLE:
                strings.append(six.u('').join((e.title(), suf)))
            elif t.lower() == e.lower():
                strings.append(six.u('').join((t, suf)))
            else:
                strings.append(six.u('').join((e.title(), suf)))
        return six.u('').join(strings)
    else:
        return space_token.join([t.title() for t in expansion_tokens])
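A quick check of what expansion_token_regex produces, since its (word, separator) pairs drive the recasing above (hypothetical REPL session, not part of the diff):

    >>> expansion_token_regex.findall(u'st. maarten')
    [(u'st', u'. '), (u'maarten', u'')]
    >>> expansion_token_regex.findall(u'st.-maarten')
    [(u'st', u'.-'), (u'maarten', u'')]

Each word keeps its trailing punctuation, and a bare space separator (as in u'saint maarten') is swapped for space_token, so the same canonical can be recased as u'Saint Maarten' in free text or u'Saint-Maarten' inside a hyphenated token.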
def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------
@@ -63,101 +80,141 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                    abbreviated.append(space_token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    # Don't leave a dangling hyphen/dash when separating the prefix
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))
                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    # Don't leave a dangling hyphen/dash when separating the suffix
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                abbreviated.append(space_token)
                break
        else:
            # No dictionary matched: keep the original phrase tokens
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                    abbreviated.append(space_token)

        return abbreviated
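For reference, each entry in data is a pipe-delimited byte string of the form lang|dictionary|is_canonical|canonical. A minimal sketch of the split performed at the top of abbreviated_tokens (the field values here are illustrative assumptions, not taken from the gazetteer files):

    entry = six.b('nl|toponyms|1|sint')  # hypothetical gazetteer entry
    lang, dictionary, is_canonical, canonical = entry.split(six.b('|'))
    # lang=b'nl', dictionary=b'toponyms', is_canonical=b'1', canonical=b'sint';
    # canonical is the key used to look up abbreviations in gazetteer.canonicals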
    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)
            i += len(t)
        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                # Hyphenated token: abbreviate within the pieces, joining on
                # hyphens instead of spaces
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            # Optionally add a period before the hyphen when the phrase
                            # was actually abbreviated and doesn't already end in one,
                            # e.g. Sint-Maarten => St.-Maarten
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1
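Taken together, this is the path that produces the example in the commit message. A hypothetical invocation (the gazetteer variable and its dictionary contents are assumptions for illustration; the output is randomized):

    # Assumes a gazetteer mapping the phrase u'sint' to canonical u'sint'
    # with abbreviation u'st' for language u'nl'
    result = abbreviate(gazetteer, u'Sint-Maarten', u'nl')
    # Depending on the draws against abbreviate_prob and add_period_hyphen_prob,
    # result may be u'Sint-Maarten' (unchanged), u'St-Maarten', or u'St.-Maarten'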