From cca80b046c04b13cad1c48a14c15b37a7779c724 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 3 Dec 2016 17:55:11 -0500
Subject: [PATCH] [abbreviation] fixing abbreviations within hyphenated
 phrases, particularly for prefix/suffix matches

---
 .../address_expansions/abbreviations.py       | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/geodata/address_expansions/abbreviations.py b/scripts/geodata/address_expansions/abbreviations.py
index 2d147e27..8c203b0c 100644
--- a/scripts/geodata/address_expansions/abbreviations.py
+++ b/scripts/geodata/address_expansions/abbreviations.py
@@ -90,12 +90,14 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
         # local copy
         abbreviated = []
 
+        n = len(t)
+
         # Append the original tokens with whitespace if there is any
         if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
             for j, (t_i, c_i) in enumerate(t):
                 abbreviated.append(tokens[i + j][0])
 
-                if j < length - 1:
+                if j < n - 1:
                     abbreviated.append(space_token)
             return abbreviated
 
@@ -117,6 +119,9 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
 
             if not is_prefix and not is_suffix:
                 abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
+                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
+                # would require an audit of the dictionaries though so abbreviations are listed from
+                # left-to-right by frequency of usage
                 token = random.choice(abbreviations) if abbreviations else canonical
                 token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                 abbreviated.append(token)
@@ -175,14 +180,12 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
                     abbreviated.append(abbreviation.title())
                 else:
                     abbreviated.append(abbreviation)
-                abbreviated.append(space_token)
                 break
-            else:
-                for j, (t_i, c_i) in enumerate(t):
-                    abbreviated.append(tokens[i + j][0])
-                    if j < length - 1:
-                        abbreviated.append(space_token)
-            return abbreviated
+        else:
+            for j, (t_i, c_i) in enumerate(t):
+                abbreviated.append(tokens[i + j][0])
+                if j < n - 1:
+                    abbreviated.append(space_token)
         return abbreviated
 
     for t, c, length, data in gazetteer.filter(norm_tokens):