From 78450bfad91fbb5da92aa95eb1282d10b2a91e52 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Jan 2016 21:36:20 -0500 Subject: [PATCH] [fix] Spaces in abbreviation --- scripts/geodata/osm/osm_address_training_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 7b45b668..bca06990 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -386,7 +386,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. if random.random() > abbreviate_prob: for j, (t_i, c_i) in enumerate(t): abbreviated.append(tokens[i + j][0]) - if i + j < n - 1 and raw_tokens[i + j + 1][1] > raw_tokens[i + j][1] + 1: + if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): abbreviated.append(u' ') i += len(t) continue @@ -412,7 +412,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. token = random.choice(abbreviations) if abbreviations else canonical token = recase_abbreviation(token, tokens[i:i + len(t)]) abbreviated.append(token) - if t[-1][1] != token_types.IDEOGRAPHIC_CHAR: + if i < n - 1 and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]): abbreviated.append(u' ') break elif is_prefix: @@ -464,13 +464,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0. else: for j, (t_i, c_i) in enumerate(t): abbreviated.append(tokens[i + j][0]) - if i + j < n - 1 and raw_tokens[i + j + 1][1] > raw_tokens[i + j][1] + 1: + if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): abbreviated.append(u' ') i += len(t) else: abbreviated.append(tokens[i][0]) - if i < n - 1 and raw_tokens[i + 1][1] > raw_tokens[i][1] + 1: + if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]): abbreviated.append(u' ') i += 1