[abbreviations] Adding ability to abbreviate within hyphenated phrases e.g. Sint-Maarten => St.-Maarten
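
At a high level, abbreviate() now detects tokens containing a dash (via the new non_breaking_dash_regex), splits them into sub-tokens, abbreviates each sub-phrase through the gazetteers with the hyphen as the joining token, and may insert a period before the hyphen (controlled by the new add_period_hyphen_prob parameter). The snippet below is a minimal standalone sketch of that idea, assuming a toy abbreviation dictionary and a hypothetical helper name; it is not the geodata API.

# Minimal standalone sketch of the behavior this commit adds (hypothetical
# helper and toy dictionary; the real code pulls candidates from gazetteers).
import random
import re

ABBREVIATIONS = {u'sint': [u'st'], u'saint': [u'st']}
dash_regex = re.compile(u'[\u2010\u2011-]', re.UNICODE)

def abbreviate_hyphenated(token, add_period_prob=0.3):
    parts = dash_regex.split(token)
    if len(parts) < 2:
        return token
    out = []
    for part in parts:
        candidates = ABBREVIATIONS.get(part.lower())
        if not candidates:
            out.append(part)
            continue
        abbrev = random.choice(candidates)
        # Preserve the casing of the original component
        abbrev = abbrev.title() if part.istitle() else abbrev
        # Sometimes add a period before the hyphen, analogous to add_period_hyphen_prob
        if not abbrev.endswith(u'.') and random.random() < add_period_prob:
            abbrev += u'.'
        out.append(abbrev)
    return u'-'.join(out)

# abbreviate_hyphenated(u'Sint-Maarten') can return u'St.-Maarten'
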
@@ -1,9 +1,11 @@
 import random
+import re
 import six
 
 from geodata.address_expansions.gazetteers import *
 from geodata.encoding import safe_decode, safe_encode
 from geodata.text.tokenize import tokenize_raw, token_types
+from geodata.text.utils import non_breaking_dash_regex
 
 
 LOWER, UPPER, TITLE, MIXED = range(4)
@@ -20,30 +22,45 @@ def token_capitalization(s):
         return MIXED
 
 
-def recase_abbreviation(expansion, tokens):
-    expansion_tokens = expansion.split()
+expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')
+
+
+def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
+    expansion_tokens = expansion_token_regex.findall(expansion)
+
+    print expansion, expansion_tokens, tokens
     if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
+        expansion_tokens = tokenize(expansion)
+        is_acronym = len(expansion_tokens) == 1 and expansion_tokens[0][1] == token_types.ACRONYM
+        if len(expansion) <= 3 or is_acronym:
             return expansion.upper()
+        else:
+            return expansion.title()
     elif len(tokens) == len(expansion_tokens):
         strings = []
-        for (t, c), e in zip(tokens, expansion_tokens):
+        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
             cap = token_capitalization(t)
+            if suf == six.u(' '):
+                suf = space_token
             if cap == LOWER:
-                strings.append(e.lower())
+                strings.append(six.u('').join((e.lower(), suf)))
             elif cap == UPPER:
-                strings.append(e.upper())
+                strings.append(six.u('').join((e.upper(), suf)))
             elif cap == TITLE:
-                strings.append(e.title())
+                strings.append(six.u('').join((e.title(), suf)))
             elif t.lower() == e.lower():
                 strings.append(t)
             else:
-                strings.append(e.title())
-        return six.u(' ').join(strings)
+                strings.append(six.u('').join((e.title(), suf)))
+
+            if suf == six.u(' '):
+                strings.append(space_token)
+        return six.u('').join(strings)
     else:
-        return six.u(' ').join([t.title() for t in expansion_tokens])
+        return space_token.join([t.title() for t in expansion_tokens])
 
 
-def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
+def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
     '''
     Abbreviations
     -------------
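
For reference, the new expansion_token_regex captures each expansion token together with its trailing separator (period, hyphen, or space), which is what lets recase_abbreviation re-emit the separators and substitute space_token for plain spaces. Illustrative inputs:

import re
expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')

expansion_token_regex.findall(u'st.-maarten')    # [(u'st', u'.-'), (u'maarten', u'')]
expansion_token_regex.findall(u'saint maarten')  # [(u'saint', u' '), (u'maarten', u'')]
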
@@ -63,21 +80,19 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
 
     i = 0
 
-    for t, c, length, data in gazetteer.filter(norm_tokens):
-        if c == token_types.PHRASE:
-            valid = []
+    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
         data = [d.split(six.b('|')) for d in data]
 
-        added = False
+        # local copy
+        abbreviated = []
 
         # Append the original tokens with whitespace if there is any
         if random.random() > abbreviate_prob:
             for j, (t_i, c_i) in enumerate(t):
                 abbreviated.append(tokens[i + j][0])
                 if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
-                    abbreviated.append(six.u(' '))
-            i += len(t)
-            continue
+                    abbreviated.append(space_token)
+            return abbreviated
 
         for lang, dictionary, is_canonical, canonical in data:
             if lang not in (language, 'all'):
@@ -98,22 +113,27 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
             if not is_prefix and not is_suffix:
                 abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                 token = random.choice(abbreviations) if abbreviations else canonical
-                token = recase_abbreviation(token, tokens[i:i + len(t)])
+                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                 abbreviated.append(token)
                 if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
-                    abbreviated.append(six.u(' '))
+                    abbreviated.append(space_token)
                 break
             elif is_prefix:
                 token = tokens[i][0]
                 prefix, token = token[:length], token[length:]
+
                 abbreviated.append(prefix)
                 if random.random() < separate_prob:
-                    abbreviated.append(six.u(' '))
+                    sub_tokens = tokenize(token)
+                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
+                        token = six.u('').join((t for t, c in sub_tokens[1:]))
+
+                    abbreviated.append(space_token)
                 if token.islower():
                     abbreviated.append(token.title())
                 else:
                     abbreviated.append(token)
-                abbreviated.append(six.u(' '))
+                abbreviated.append(space_token)
                 break
             elif is_suffix:
                 token = tokens[i][0]
@@ -138,26 +158,63 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
                 else:
                     abbreviation = canonical
 
+                if separate:
+                    sub_tokens = tokenize(token)
+                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
+                        token = six.u('').join((t for t, c in sub_tokens[:-1]))
+
                 abbreviated.append(token)
                 if separate:
-                    abbreviated.append(six.u(' '))
+                    abbreviated.append(space_token)
                 if suffix.isupper():
                     abbreviated.append(abbreviation.upper())
                 elif separate:
                     abbreviated.append(abbreviation.title())
                 else:
                     abbreviated.append(abbreviation)
-                abbreviated.append(six.u(' '))
+                abbreviated.append(space_token)
                 break
         else:
             for j, (t_i, c_i) in enumerate(t):
                 abbreviated.append(tokens[i + j][0])
                 if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                     abbreviated.append(six.u(' '))
-            i += len(t)
+            return abbreviated
+        return abbreviated
+
+    for t, c, length, data in gazetteer.filter(norm_tokens):
+        if c == token_types.PHRASE:
+            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
+            abbreviated.extend(abbrev_tokens)
+            i += len(t)
         else:
-            abbreviated.append(tokens[i][0])
+            token = tokens[i][0]
+            if not non_breaking_dash_regex.search(token):
+                abbreviated.append(token)
+            else:
+                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
+                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]
+
+                sub_token_abbreviated = []
+                sub_i = 0
+                sub_n = len(sub_tokens)
+                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
+                    if c == token_types.PHRASE:
+                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
+                        sub_token_abbreviated.extend(abbrev_tokens)
+                        sub_i += len(t)
+                        if sub_i < sub_n:
+                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
+                                sub_token_abbreviated.append(six.u('.'))
+                            sub_token_abbreviated.append(six.u('-'))
+                    else:
+                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
+                        sub_i += 1
+                        if sub_i < sub_n:
+                            sub_token_abbreviated.append(six.u('-'))
+
+                abbreviated.append(six.u('').join(sub_token_abbreviated))
 
             if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                 abbreviated.append(six.u(' '))
             i += 1
@@ -1,6 +1,10 @@
+import re
+
 from geodata.text.tokenize import tokenize
 from geodata.text.token_types import token_types
+
+non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)
 
 
 def is_numeric(s):
     tokens = tokenize(s)
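
The final hunk belongs to the module that defines non_breaking_dash_regex; abbreviate() uses that regex both to detect hyphenated tokens and to split them into sub-tokens before re-joining them with six.u('-'). A quick illustrative check (example inputs only):

import re
non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)

bool(non_breaking_dash_regex.search(u'Sint-Maarten'))    # True: hyphenated token
non_breaking_dash_regex.sub(u' ', u'Sint\u2013Maarten')  # u'Sint Maarten' (en dash is covered too)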