[fix] The abbreviation probability should be applied only once per matched phrase, not once per dictionary. Also fixes an issue where some abbreviations were appended to the output twice.

This commit is contained in:
Al
2016-01-22 15:48:21 -05:00
parent f9f6558e06
commit ea9bb3f2d5

View File

@@ -382,6 +382,14 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
added = False added = False
if random.random() > abbreviate_prob:
for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0])
if c_i != token_types.IDEOGRAPHIC_CHAR:
abbreviated.append(u' ')
i += len(t)
continue
for lang, dictionary, is_canonical, canonical in data: for lang, dictionary, is_canonical, canonical in data:
if lang not in (language, 'all'): if lang not in (language, 'all'):
continue continue
@@ -395,7 +403,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
suffix = None suffix = None
prefix = None prefix = None
if not is_canonical or random.random() > abbreviate_prob: if not is_canonical:
continue continue
if not is_prefix and not is_suffix: if not is_prefix and not is_suffix:
@@ -405,7 +413,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
abbreviated.append(token) abbreviated.append(token)
if t[-1][1] != token_types.IDEOGRAPHIC_CHAR: if t[-1][1] != token_types.IDEOGRAPHIC_CHAR:
abbreviated.append(u' ') abbreviated.append(u' ')
added = True break
elif is_prefix: elif is_prefix:
token = tokens[i][0] token = tokens[i][0]
prefix, token = token[:length], token[length:] prefix, token = token[:length], token[length:]
@@ -417,7 +425,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
else: else:
abbreviated.append(token) abbreviated.append(token)
abbreviated.append(u' ') abbreviated.append(u' ')
added = True break
elif is_suffix: elif is_suffix:
token = tokens[i][0] token = tokens[i][0]
@@ -451,9 +459,8 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
else: else:
abbreviated.append(abbreviation) abbreviated.append(abbreviation)
abbreviated.append(u' ') abbreviated.append(u' ')
added = True break
else:
if not added:
for j, (t_i, c_i) in enumerate(t): for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0]) abbreviated.append(tokens[i + j][0])
if c_i != token_types.IDEOGRAPHIC_CHAR: if c_i != token_types.IDEOGRAPHIC_CHAR:
@@ -1392,7 +1399,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
writer = csv.writer(f, 'tsv_no_quote') writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile): for key, value, deps in parse_osm(infile):
if not sum((1 for k, v in value.iteritems() if k.startswith('name'))) > 0: if not any((k.startswith('name') for k, v in value.iteritems())):
continue continue
try: try: