From cca80b046c04b13cad1c48a14c15b37a7779c724 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 3 Dec 2016 17:55:11 -0500 Subject: [PATCH] [abbreviation] fixing abbreviations within hyphenated phrases, particularly for prefix/suffix matches --- .../address_expansions/abbreviations.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/geodata/address_expansions/abbreviations.py b/scripts/geodata/address_expansions/abbreviations.py index 2d147e27..8c203b0c 100644 --- a/scripts/geodata/address_expansions/abbreviations.py +++ b/scripts/geodata/address_expansions/abbreviations.py @@ -90,12 +90,14 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a # local copy abbreviated = [] + n = len(t) + # Append the original tokens with whitespace if there is any if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)): for j, (t_i, c_i) in enumerate(t): abbreviated.append(tokens[i + j][0]) - if j < length - 1: + if j < n - 1: abbreviated.append(space_token) return abbreviated @@ -117,6 +119,9 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a if not is_prefix and not is_suffix: abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary)) + # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street" + # would require an audit of the dictionaries though so abbreviations are listed from + # left-to-right by frequency of usage token = random.choice(abbreviations) if abbreviations else canonical token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token) abbreviated.append(token) @@ -175,14 +180,12 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a abbreviated.append(abbreviation.title()) else: abbreviated.append(abbreviation) - abbreviated.append(space_token) break - else: - for j, (t_i, c_i) in enumerate(t): - abbreviated.append(tokens[i + j][0]) - if j < length - 1: - abbreviated.append(space_token) - return abbreviated + else: + for j, (t_i, c_i) in enumerate(t): + abbreviated.append(tokens[i + j][0]) + if j < n - 1: + abbreviated.append(space_token) return abbreviated for t, c, length, data in gazetteer.filter(norm_tokens):