Initial fork commit
0
scripts/geodata/address_expansions/__init__.py
Normal file
233
scripts/geodata/address_expansions/abbreviations.py
Normal file
@@ -0,0 +1,233 @@
import random
import re
import six

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


LOWER, UPPER, TITLE, MIXED = range(4)


def token_capitalization(s):
    if s.istitle():
        return TITLE
    elif s.islower():
        return LOWER
    elif s.isupper():
        return UPPER
    else:
        return MIXED


expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')


def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
    expansion_tokens = expansion_token_regex.findall(expansion)

    if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
        expansion_tokenized = tokenize(expansion)
        is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
        if len(expansion) <= 3 or is_acronym:
            return expansion.upper()
        else:
            return expansion.title()
    elif len(tokens) == len(expansion_tokens):
        strings = []
        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
            cap = token_capitalization(t)
            if suf == six.u(' '):
                suf = space_token
            if cap == LOWER:
                strings.append(six.u('').join((e.lower(), suf)))
            elif cap == UPPER:
                strings.append(six.u('').join((e.upper(), suf)))
            elif cap == TITLE:
                strings.append(six.u('').join((e.title(), suf)))
            elif t.lower() == e.lower():
                strings.append(t)
            else:
                strings.append(six.u('').join((e.title(), suf)))
        return six.u('').join(strings)
    else:
        strings = []
        for e, suf in expansion_tokens:
            strings.append(e.title())
            if suf == six.u(' '):
                strings.append(space_token)
            else:
                strings.append(suf)
        return six.u('').join(strings)


def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])

                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))

                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))

            i += len(t)

        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
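
Editor's note (not part of the commit): a minimal usage sketch for abbreviate() above, assuming the scripts/ directory is on the Python path and using the street_types_gazetteer defined later in this commit.

# Illustrative sketch only; module paths assume scripts/ is on sys.path.
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import street_types_gazetteer

# With abbreviate_prob=1.0 the canonical phrase is always swapped for a
# dictionary abbreviation when one matches, e.g. u'Main Street' -> u'Main St',
# preserving the original capitalization pattern via recase_abbreviation().
print(abbreviate(street_types_gazetteer, u'Main Street', 'en', abbreviate_prob=1.0))
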
254
scripts/geodata/address_expansions/address_dictionaries.py
Normal file
@@ -0,0 +1,254 @@
import os
import sys

from collections import defaultdict

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_encode, safe_decode

ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'dictionaries')

ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')

address_language_index_template = u'{{{language}, {index}, {length}}}'
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'


address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H

#include <stdlib.h>
#include <stdint.h>

#include "constants.h"
#include "gazetteers.h"

#define MAX_DICTIONARY_TYPES {max_dictionary_types}

typedef struct address_expansion_rule {{
    char *phrase;
    uint32_t num_dictionaries;
    dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
    int32_t canonical_index;
}} address_expansion_rule_t;

typedef struct address_language_index {{
    char language[MAX_LANGUAGE_LEN];
    uint32_t index;
    size_t len;
}} address_language_index_t;


#endif
'''

address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
    {canonical_strings}
}};

address_expansion_rule_t expansion_rules[] = {{
    {expansion_rules}
}};

address_language_index_t expansion_languages[] = {{
    {address_languages}
}};
'''


gazetteer_types = {
    'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
    'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
    'building_types': 'DICTIONARY_BUILDING_TYPE',
    'categories': 'DICTIONARY_CATEGORY',
    'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
    'chains': 'DICTIONARY_CHAIN',
    'company_types': 'DICTIONARY_COMPANY_TYPE',
    'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
    'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
    'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
    'cross_streets': 'DICTIONARY_CROSS_STREET',
    'directionals': 'DICTIONARY_DIRECTIONAL',
    'elisions': 'DICTIONARY_ELISION',
    'entrances': 'DICTIONARY_ENTRANCE',
    'given_names': 'DICTIONARY_GIVEN_NAME',
    'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
    'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
    'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
    'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
    'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
    'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
    'near': 'DICTIONARY_NEAR',
    'no_number': 'DICTIONARY_NO_NUMBER',
    'number': 'DICTIONARY_NUMBER',
    'nulls': 'DICTIONARY_NULL',
    'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
    'people': 'DICTIONARY_NAMED_PERSON',
    'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
    'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
    'place_names': 'DICTIONARY_PLACE_NAME',
    'post_office': 'DICTIONARY_POST_OFFICE',
    'postcodes': 'DICTIONARY_POSTAL_CODE',
    'qualifiers': 'DICTIONARY_QUALIFIER',
    'staircases': 'DICTIONARY_STAIRCASE',
    'stopwords': 'DICTIONARY_STOPWORD',
    'street_names': 'DICTIONARY_STREET_NAME',
    'street_types': 'DICTIONARY_STREET_TYPE',
    'surnames': 'DICTIONARY_SURNAME',
    'synonyms': 'DICTIONARY_SYNONYM',
    'toponyms': 'DICTIONARY_TOPONYM',
    'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
    'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
    'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',

}


class InvalidAddressFileException(Exception):
    pass


def read_dictionary_file(path):
    for i, line in enumerate(open(path)):
        line = safe_decode(line.rstrip())
        if not line.strip():
            continue

        if u'}' in line:
            raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
        phrases = line.split(u'|')

        if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
            raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

        yield phrases


def quote_string(s):
    return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))


class AddressPhraseDictionaries(object):
    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        self.base_dir = base_dir
        self.languages = []

        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
                dictionary_name = filename.split('.')[0].lower()

                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
                self.language_dictionaries[language].append(dictionary_name)

                path = os.path.join(language_dir, filename)
                for i, line in enumerate(open(path)):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue

                    if u'}' in line:
                        raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
                    phrases = line.split(u'|')

                    if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
                        raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

                    self.phrases[(language, dictionary_name)].append(phrases)

        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)


address_phrase_dictionaries = AddressPhraseDictionaries()


def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in address_phrase_dictionaries.languages:
        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
            dictionary_type = gazetteer_types[dictionary_name]

            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for i, p in enumerate(phrases):
                    language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
            rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
                                                                   num_dictionaries=str(len(dictionary_types)),
                                                                   dictionaries=', '.join(dictionary_types),
                                                                   canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(language=quote_string(language),
                                                                         index=language_index,
                                                                         length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = ADDRESS_EXPANSIONS_DIR

    create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
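
Editor's note (not part of the commit): the module above regenerates src/address_expansion_rule.h and src/address_expansion_data.c from the per-language dictionary files. A sketch of the call and the rough shape of its output, using a hypothetical en/street_types.txt line "street|st":

# Illustrative sketch only; the dictionary line "street|st" is hypothetical.
from geodata.address_expansions.address_dictionaries import create_address_expansion_rules_file

# Writes entries of roughly this shape into the generated C sources:
#   canonical_strings[]: "street"
#   expansion_rules[]:   {"street", 1, {DICTIONARY_STREET_TYPE}, -1},
#                        {"st", 1, {DICTIONARY_STREET_TYPE}, 0}
create_address_expansion_rules_file()
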
56
scripts/geodata/address_expansions/equivalence.py
Normal file
@@ -0,0 +1,56 @@
import random
import re
import six

from itertools import izip

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


def canonicals_for_language(data, language):
    canonicals = set()

    for d in data:
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals


def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    Tests whether two strings refer to the same phrase by normalizing
    both sides, comparing non-phrase tokens exactly and treating known
    abbreviations as equivalent to their canonical expansions.
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            if t1 != t2:
                return False
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            return False

    return True
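
Editor's note (not part of the commit): a sketch of how equivalent() is expected to behave, assuming the bundled English dictionaries map "st" to the canonical phrase "street".

# Illustrative sketch only; results depend on the shipped dictionaries.
from geodata.address_expansions.equivalence import equivalent
from geodata.address_expansions.gazetteers import street_types_gazetteer

# "Street" and "St" normalize to phrases sharing the canonical "street",
# while the remaining tokens must match exactly.
print(equivalent(u'Main Street', u'Main St', street_types_gazetteer, 'en'))   # expected: True
print(equivalent(u'Main Street', u'Elm Street', street_types_gazetteer, 'en'))  # expected: False
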
260
scripts/geodata/address_expansions/gazetteers.py
Normal file
@@ -0,0 +1,260 @@
import os
import six

from collections import defaultdict, OrderedDict

from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize, token_types
from geodata.text.phrases import PhraseFilter
from geodata.enum import EnumValue

from marisa_trie import BytesTrie


DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')

PREFIX_KEY = u'\x02'
SUFFIX_KEY = u'\x03'

POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
                               'm', 'mm', 'mmm', 'mmmm'])


class DictionaryPhraseFilter(PhraseFilter):
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)

        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        # Guard against the default languages=None before converting
        # non-container iterables (e.g. generators) to a set
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))

                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
                            phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
        try:
            phrases.next()
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))


STREET_TYPES_ONLY_DICTIONARIES = ('street_types',
                                  'directionals',
                                  'concatenated_suffixes_separable',
                                  'concatenated_suffixes_inseparable',
                                  'people',
                                  'personal_suffixes',
                                  'personal_titles',
                                  )

STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + ('concatenated_prefixes_separable',
                                                              'organizations',
                                                              'qualifiers',
                                                              'stopwords',
                                                              )

GIVEN_NAME_DICTIONARY = 'given_names'
SURNAME_DICTIONARY = 'surnames'

CHAIN_DICTIONARY = 'chains'

SYNONYM_DICTIONARY = 'synonyms'

PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
                              SURNAME_DICTIONARY,)


NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
                                                 'building_types',
                                                 'company_types',
                                                 'place_names',
                                                 'qualifiers',
                                                 'synonyms',
                                                 'toponyms',
                                                 )

QUALIFIERS_DICTIONARY = 'qualifiers'

HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')

POSTCODE_DICTIONARIES = ('postcode',)

TOPONYMS_DICTIONARY = 'toponyms'

TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers',
                                     'directionals',
                                     'personal_titles',
                                     'synonyms',
                                     )


UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
                                  'level_types_mezzanine',
                                  'level_types_numbered',
                                  'level_types_standalone',
                                  'level_types_sub_basement',
                                  'number',
                                  'post_office',
                                  'unit_types_numbered',
                                  'unit_types_standalone',
                                  )

VENUE_NAME_DICTIONARIES = ('academic_degrees',
                           'building_types',
                           'chains',
                           'company_types',
                           'directionals',
                           'given_names',
                           'organizations',
                           'people',
                           'personal_suffixes',
                           'personal_titles',
                           'place_names',
                           'stopwords',
                           'surnames',
                           )

ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
    NAME_DICTIONARIES + \
    UNIT_ABBREVIATION_DICTIONARIES + \
    ('no_number', 'nulls',)


_gazetteers = []


def create_gazetteer(*dictionaries):
    g = DictionaryPhraseFilter(*dictionaries)
    _gazetteers.append(g)
    return g


street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
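
Editor's note (not part of the commit): the module-level gazetteers instantiated above are DictionaryPhraseFilter objects; a sketch of phrase extraction, with output depending on the bundled dictionaries.

# Illustrative sketch only; the exact phrases returned depend on the
# dictionary contents shipped with the repository.
from geodata.address_expansions.gazetteers import street_types_gazetteer

print(street_types_gazetteer.extract_phrases(u'123 E Main St', languages='en'))
# expected to include directional and street-type phrases such as u'e' and u'st'
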