[abbreviation] fixing abbreviations within hyphenated phrases, particularly for prefix/suffix matches
This commit is contained in:
@@ -90,12 +90,14 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
|
|||||||
# local copy
|
# local copy
|
||||||
abbreviated = []
|
abbreviated = []
|
||||||
|
|
||||||
|
n = len(t)
|
||||||
|
|
||||||
# Append the original tokens with whitespace if there is any
|
# Append the original tokens with whitespace if there is any
|
||||||
if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
|
if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
|
||||||
for j, (t_i, c_i) in enumerate(t):
|
for j, (t_i, c_i) in enumerate(t):
|
||||||
abbreviated.append(tokens[i + j][0])
|
abbreviated.append(tokens[i + j][0])
|
||||||
|
|
||||||
if j < length - 1:
|
if j < n - 1:
|
||||||
abbreviated.append(space_token)
|
abbreviated.append(space_token)
|
||||||
return abbreviated
|
return abbreviated
|
||||||
|
|
||||||
@@ -117,6 +119,9 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
|
|||||||
|
|
||||||
if not is_prefix and not is_suffix:
|
if not is_prefix and not is_suffix:
|
||||||
abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
|
abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
|
||||||
|
# TODO: maybe make this a Zipfian choice, e.g. so that "St" gets chosen most often for "Street".
|
||||||
|
# That would require an audit of the dictionaries, though, so that abbreviations are listed
|
||||||
|
# left-to-right by frequency of usage.
|
||||||
token = random.choice(abbreviations) if abbreviations else canonical
|
token = random.choice(abbreviations) if abbreviations else canonical
|
||||||
token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
|
token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
|
||||||
abbreviated.append(token)
|
abbreviated.append(token)
|
||||||
@@ -175,14 +180,12 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, a
|
|||||||
abbreviated.append(abbreviation.title())
|
abbreviated.append(abbreviation.title())
|
||||||
else:
|
else:
|
||||||
abbreviated.append(abbreviation)
|
abbreviated.append(abbreviation)
|
||||||
abbreviated.append(space_token)
|
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
for j, (t_i, c_i) in enumerate(t):
|
for j, (t_i, c_i) in enumerate(t):
|
||||||
abbreviated.append(tokens[i + j][0])
|
abbreviated.append(tokens[i + j][0])
|
||||||
if j < length - 1:
|
if j < n - 1:
|
||||||
abbreviated.append(space_token)
|
abbreviated.append(space_token)
|
||||||
return abbreviated
|
|
||||||
return abbreviated
|
return abbreviated
|
||||||
|
|
||||||
for t, c, length, data in gazetteer.filter(norm_tokens):
|
for t, c, length, data in gazetteer.filter(norm_tokens):
|
||||||
|
|||||||
Reference in New Issue
Block a user