[fix] The abbreviation probability should be applied only once, not once per dictionary. Also fixes cases where some of the abbreviations were doubled.
This commit is contained in:
@@ -382,6 +382,14 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
|
|||||||
|
|
||||||
added = False
|
added = False
|
||||||
|
|
||||||
|
if random.random() > abbreviate_prob:
|
||||||
|
for j, (t_i, c_i) in enumerate(t):
|
||||||
|
abbreviated.append(tokens[i + j][0])
|
||||||
|
if c_i != token_types.IDEOGRAPHIC_CHAR:
|
||||||
|
abbreviated.append(u' ')
|
||||||
|
i += len(t)
|
||||||
|
continue
|
||||||
|
|
||||||
for lang, dictionary, is_canonical, canonical in data:
|
for lang, dictionary, is_canonical, canonical in data:
|
||||||
if lang not in (language, 'all'):
|
if lang not in (language, 'all'):
|
||||||
continue
|
continue
|
||||||
@@ -395,7 +403,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
|
|||||||
suffix = None
|
suffix = None
|
||||||
prefix = None
|
prefix = None
|
||||||
|
|
||||||
if not is_canonical or random.random() > abbreviate_prob:
|
if not is_canonical:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not is_prefix and not is_suffix:
|
if not is_prefix and not is_suffix:
|
||||||
@@ -405,7 +413,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
|
|||||||
abbreviated.append(token)
|
abbreviated.append(token)
|
||||||
if t[-1][1] != token_types.IDEOGRAPHIC_CHAR:
|
if t[-1][1] != token_types.IDEOGRAPHIC_CHAR:
|
||||||
abbreviated.append(u' ')
|
abbreviated.append(u' ')
|
||||||
added = True
|
break
|
||||||
elif is_prefix:
|
elif is_prefix:
|
||||||
token = tokens[i][0]
|
token = tokens[i][0]
|
||||||
prefix, token = token[:length], token[length:]
|
prefix, token = token[:length], token[length:]
|
||||||
@@ -417,7 +425,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
|
|||||||
else:
|
else:
|
||||||
abbreviated.append(token)
|
abbreviated.append(token)
|
||||||
abbreviated.append(u' ')
|
abbreviated.append(u' ')
|
||||||
added = True
|
break
|
||||||
elif is_suffix:
|
elif is_suffix:
|
||||||
token = tokens[i][0]
|
token = tokens[i][0]
|
||||||
|
|
||||||
@@ -451,9 +459,8 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
|
|||||||
else:
|
else:
|
||||||
abbreviated.append(abbreviation)
|
abbreviated.append(abbreviation)
|
||||||
abbreviated.append(u' ')
|
abbreviated.append(u' ')
|
||||||
added = True
|
break
|
||||||
|
else:
|
||||||
if not added:
|
|
||||||
for j, (t_i, c_i) in enumerate(t):
|
for j, (t_i, c_i) in enumerate(t):
|
||||||
abbreviated.append(tokens[i + j][0])
|
abbreviated.append(tokens[i + j][0])
|
||||||
if c_i != token_types.IDEOGRAPHIC_CHAR:
|
if c_i != token_types.IDEOGRAPHIC_CHAR:
|
||||||
@@ -1392,7 +1399,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
|
|||||||
writer = csv.writer(f, 'tsv_no_quote')
|
writer = csv.writer(f, 'tsv_no_quote')
|
||||||
|
|
||||||
for key, value, deps in parse_osm(infile):
|
for key, value, deps in parse_osm(infile):
|
||||||
if not sum((1 for k, v in value.iteritems() if k.startswith('name'))) > 0:
|
if not any((k.startswith('name') for k, v in value.iteritems())):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user