From 5eb6bb309b72aa500d99ec114c71e67fb4a0c481 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Jan 2016 20:09:45 -0500 Subject: [PATCH] [fix] Only adding whitespace back into tokenized strings during abbreviation if it existed in the original string --- scripts/geodata/osm/osm_address_training_data.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 27085bd5..feeedc24 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -365,9 +365,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. to real-world input, we can safely replace the canonical phrase with an abbreviated version and retain the meaning of the words ''' - tokens = tokenize(s) + raw_tokens = tokenize_raw(s) + s_utf8 = safe_encode(s) + tokens = [(s_utf8[o:o+l].decode('utf-8'), token_types.from_id(c)) for o, l, c in raw_tokens] norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens] + n = len(tokens) + abbreviated = [] i = 0 @@ -382,7 +386,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. if random.random() > abbreviate_prob: for j, (t_i, c_i) in enumerate(t): abbreviated.append(tokens[i + j][0]) - if c_i != token_types.IDEOGRAPHIC_CHAR: + if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): abbreviated.append(u' ') i += len(t) continue @@ -460,13 +464,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. 
else: for j, (t_i, c_i) in enumerate(t): abbreviated.append(tokens[i + j][0]) - if c_i != token_types.IDEOGRAPHIC_CHAR: + if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): abbreviated.append(u' ') i += len(t) else: abbreviated.append(tokens[i][0]) - if (c != token_types.IDEOGRAPHIC_CHAR): + if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]): abbreviated.append(u' ') i += 1