[abbreviations] Adding ability to abbreviate within hyphenated phrases e.g. Sint-Maarten => St.-Maarten
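
At a high level, abbreviate() now detects tokens containing a dash (via the new non_breaking_dash_regex), splits them into sub-tokens, abbreviates each sub-phrase through the gazetteers with the hyphen as the joining token, and may insert a period before the hyphen (controlled by the new add_period_hyphen_prob parameter). The snippet below is a minimal standalone sketch of that idea, assuming a toy abbreviation dictionary and a hypothetical helper name; it is not the geodata API.

# Minimal standalone sketch of the behavior this commit adds (hypothetical
# helper and toy dictionary; the real code pulls candidates from gazetteers).
import random
import re

ABBREVIATIONS = {u'sint': [u'st'], u'saint': [u'st']}
dash_regex = re.compile(u'[\u2010\u2011-]', re.UNICODE)

def abbreviate_hyphenated(token, add_period_prob=0.3):
    parts = dash_regex.split(token)
    if len(parts) < 2:
        return token
    out = []
    for part in parts:
        candidates = ABBREVIATIONS.get(part.lower())
        if not candidates:
            out.append(part)
            continue
        abbrev = random.choice(candidates)
        # Preserve the casing of the original component
        abbrev = abbrev.title() if part.istitle() else abbrev
        # Sometimes add a period before the hyphen, analogous to add_period_hyphen_prob
        if not abbrev.endswith(u'.') and random.random() < add_period_prob:
            abbrev += u'.'
        out.append(abbrev)
    return u'-'.join(out)

# abbreviate_hyphenated(u'Sint-Maarten') can return u'St.-Maarten'
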
@@ -1,9 +1,11 @@
 import random
+import re
 import six
 
 from geodata.address_expansions.gazetteers import *
 from geodata.encoding import safe_decode, safe_encode
 from geodata.text.tokenize import tokenize_raw, token_types
+from geodata.text.utils import non_breaking_dash_regex
 
 
 LOWER, UPPER, TITLE, MIXED = range(4)
@@ -20,30 +22,45 @@ def token_capitalization(s):
         return MIXED
 
 
-def recase_abbreviation(expansion, tokens):
-    expansion_tokens = expansion.split()
+expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')
+
+
+def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
+    expansion_tokens = expansion_token_regex.findall(expansion)
+
+    print expansion, expansion_tokens, tokens
     if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
+        expansion_tokens = tokenize(expansion)
+        is_acronym = len(expansion_tokens) == 1 and expansion_tokens[0][1] == token_types.ACRONYM
+        if len(expansion) <= 3 or is_acronym:
             return expansion.upper()
+        else:
+            return expansion.title()
     elif len(tokens) == len(expansion_tokens):
         strings = []
-        for (t, c), e in zip(tokens, expansion_tokens):
+        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
             cap = token_capitalization(t)
+            if suf == six.u(' '):
+                suf = space_token
             if cap == LOWER:
-                strings.append(e.lower())
+                strings.append(six.u('').join((e.lower(), suf)))
             elif cap == UPPER:
-                strings.append(e.upper())
+                strings.append(six.u('').join((e.upper(), suf)))
             elif cap == TITLE:
-                strings.append(e.title())
+                strings.append(six.u('').join((e.title(), suf)))
             elif t.lower() == e.lower():
                 strings.append(t)
             else:
-                strings.append(e.title())
-        return six.u(' ').join(strings)
+                strings.append(six.u('').join((e.title(), suf)))
+
+            if suf == six.u(' '):
+                strings.append(space_token)
+        return six.u('').join(strings)
     else:
-        return six.u(' ').join([t.title() for t in expansion_tokens])
+        return space_token.join([t.title() for t in expansion_tokens])
 
 
-def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
+def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
     '''
     Abbreviations
     -------------
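
For reference, the new expansion_token_regex captures each expansion token together with its trailing separator (period, hyphen, or space), which is what lets recase_abbreviation re-emit the separators and substitute space_token for plain spaces. Illustrative inputs:

import re
expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')

expansion_token_regex.findall(u'st.-maarten')    # [(u'st', u'.-'), (u'maarten', u'')]
expansion_token_regex.findall(u'saint maarten')  # [(u'saint', u' '), (u'maarten', u'')]
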
@@ -63,21 +80,19 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
 
     i = 0
 
-    for t, c, length, data in gazetteer.filter(norm_tokens):
-        if c == token_types.PHRASE:
-            valid = []
+    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
         data = [d.split(six.b('|')) for d in data]
 
-        added = False
+        # local copy
+        abbreviated = []
 
         # Append the original tokens with whitespace if there is any
         if random.random() > abbreviate_prob:
             for j, (t_i, c_i) in enumerate(t):
                 abbreviated.append(tokens[i + j][0])
                 if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
-                    abbreviated.append(six.u(' '))
-            i += len(t)
-            continue
+                    abbreviated.append(space_token)
+            return abbreviated
 
         for lang, dictionary, is_canonical, canonical in data:
             if lang not in (language, 'all'):
@@ -98,22 +113,27 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
             if not is_prefix and not is_suffix:
                 abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                 token = random.choice(abbreviations) if abbreviations else canonical
-                token = recase_abbreviation(token, tokens[i:i + len(t)])
+                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                 abbreviated.append(token)
                 if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
-                    abbreviated.append(six.u(' '))
+                    abbreviated.append(space_token)
                 break
             elif is_prefix:
                 token = tokens[i][0]
                 prefix, token = token[:length], token[length:]
+
                 abbreviated.append(prefix)
                 if random.random() < separate_prob:
-                    abbreviated.append(six.u(' '))
+                    sub_tokens = tokenize(token)
+                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
+                        token = six.u('').join((t for t, c in sub_tokens[1:]))
+
+                    abbreviated.append(space_token)
                 if token.islower():
                     abbreviated.append(token.title())
                 else:
                     abbreviated.append(token)
-                abbreviated.append(six.u(' '))
+                abbreviated.append(space_token)
                 break
             elif is_suffix:
                 token = tokens[i][0]
@@ -138,26 +158,63 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
                 else:
                     abbreviation = canonical
 
+                if separate:
+                    sub_tokens = tokenize(token)
+                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
+                        token = six.u('').join((t for t, c in sub_tokens[:-1]))
+
                 abbreviated.append(token)
                 if separate:
-                    abbreviated.append(six.u(' '))
+                    abbreviated.append(space_token)
                 if suffix.isupper():
                     abbreviated.append(abbreviation.upper())
                 elif separate:
                     abbreviated.append(abbreviation.title())
                 else:
                     abbreviated.append(abbreviation)
-                abbreviated.append(six.u(' '))
+                abbreviated.append(space_token)
                 break
         else:
             for j, (t_i, c_i) in enumerate(t):
                 abbreviated.append(tokens[i + j][0])
                 if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                     abbreviated.append(six.u(' '))
-            i += len(t)
+            return abbreviated
+        return abbreviated
+
+    for t, c, length, data in gazetteer.filter(norm_tokens):
+        if c == token_types.PHRASE:
+            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
+            abbreviated.extend(abbrev_tokens)
+            i += len(t)
         else:
-            abbreviated.append(tokens[i][0])
+            token = tokens[i][0]
+            if not non_breaking_dash_regex.search(token):
+                abbreviated.append(token)
+            else:
+                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
+                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]
+
+                sub_token_abbreviated = []
+                sub_i = 0
+                sub_n = len(sub_tokens)
+                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
+                    if c == token_types.PHRASE:
+                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
+                        sub_token_abbreviated.extend(abbrev_tokens)
+                        sub_i += len(t)
+                        if sub_i < sub_n:
+                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
+                                sub_token_abbreviated.append(six.u('.'))
+                            sub_token_abbreviated.append(six.u('-'))
+                    else:
+                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
+                        sub_i += 1
+                        if sub_i < sub_n:
+                            sub_token_abbreviated.append(six.u('-'))
+
+                abbreviated.append(six.u('').join(sub_token_abbreviated))
 
             if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                 abbreviated.append(six.u(' '))
             i += 1
@@ -1,6 +1,10 @@
+import re
+
 from geodata.text.tokenize import tokenize
 from geodata.text.token_types import token_types
+
+non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)
 
 
 def is_numeric(s):
     tokens = tokenize(s)
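
The final hunk belongs to the module that defines non_breaking_dash_regex; abbreviate() uses that regex both to detect hyphenated tokens and to split them into sub-tokens before re-joining them with six.u('-'). A quick illustrative check (example inputs only):

import re
non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)

bool(non_breaking_dash_regex.search(u'Sint-Maarten'))    # True: hyphenated token
non_breaking_dash_regex.sub(u' ', u'Sint\u2013Maarten')  # u'Sint Maarten' (en dash is covered too)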