diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py
new file mode 100644
index 00000000..b47a486f
--- /dev/null
+++ b/scripts/geodata/address_expansions/gazetteers.py
@@ -0,0 +1,195 @@
+import os
+import sys
+
+from collections import defaultdict, OrderedDict
+
+from geodata.encoding import safe_decode, safe_encode
+from geodata.i18n.unicode_paths import DATA_DIR
+from geodata.text.normalize import normalized_tokens, normalize_string
+from geodata.text.tokenize import tokenize, token_types
+from geodata.text.phrases import PhraseFilter
+
+from marisa_trie import BytesTrie
+
+
+DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
+
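+# Control-character markers that namespace the trie keys: prefix phrases are
+# stored as PREFIX_KEY + phrase and suffix phrases as SUFFIX_KEY + reversed
+# phrase, so a single BytesTrie can serve whole-phrase, prefix and suffix lookups.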
+PREFIX_KEY = u'\x02'
+SUFFIX_KEY = u'\x03'
+
+POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
+                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
+                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
+                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
+                               'm', 'mm', 'mmm', 'mmmm'])
+
+PHRASE = 'PHRASE'
+
+
+class DictionaryPhraseFilter(PhraseFilter):
+
+    def __init__(self, *dictionaries):
+        self.dictionaries = dictionaries
+        self.canonicals = {}
+
+    def serialize(self, s):
+        return s
+
+    def deserialize(self, s):
+        return s
+
+    def configure(self, base_dir=DICTIONARIES_DIR):
+        kvs = defaultdict(OrderedDict)
+        for lang in os.listdir(base_dir):
+            for filename in self.dictionaries:
+                is_suffix_dictionary = 'suffixes' in filename
+                is_prefix_dictionary = 'prefixes' in filename
+
+                dictionary_name = filename.split('.', 1)[0]
+
+                path = os.path.join(base_dir, lang, filename)
+                if not os.path.exists(path):
+                    continue
+
+                for line in open(path):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    phrases = safe_decode(line).split(u'|')
+                    if not phrases:
+                        continue
+
+                    canonical = phrases[0]
+                    canonical_normalized = normalize_string(canonical)
+
+                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
+
+                    for i, phrase in enumerate(phrases):
+
+                        if phrase in POSSIBLE_ROMAN_NUMERALS:
+                            continue
+
+                        is_canonical = normalize_string(phrase) == canonical_normalized
+
+                        if is_suffix_dictionary:
+                            phrase = SUFFIX_KEY + phrase[::-1]
+                        elif is_prefix_dictionary:
+                            phrase = PREFIX_KEY + phrase
+
+                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
+
+        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
+
+        self.trie = BytesTrie(kvs)
+        self.configured = True
+
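+    # Consume characters of s for as long as the trie still has keys with
+    # that prefix; returns (values, match_length) for the longest match,
+    # or (None, 0) when nothing matches.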
+    def search_substring(self, s):
+        if len(s) == 0:
+            return None, 0
+
+        for i in xrange(len(s) + 1):
+            if not self.trie.has_keys_with_prefix(s[:i]):
+                i -= 1
+                break
+        if i > 0:
+            return (self.trie.get(s[:i]), i)
+        else:
+            return None, 0
+
+    def search_suffix(self, token):
+        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
+        if suffix_len > 0:
+            suffix_len -= len(SUFFIX_KEY)
+        return suffix_search, suffix_len
+
+    def search_prefix(self, token):
+        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
+        if prefix_len > 0:
+            prefix_len -= len(PREFIX_KEY)
+        return prefix_search, prefix_len
+
+    def basic_filter(self, tokens):
+        return super(DictionaryPhraseFilter, self).filter(tokens)
+
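+    # Extends the base phrase filter: tokens that did not match a full phrase
+    # are additionally checked for dictionary prefixes and suffixes; matches
+    # are yielded as (tokens, PHRASE, match_length, data) 4-tuples.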
+    def filter(self, tokens):
+        for p, t, data in self.basic_filter(tokens):
+            if not p:
+                t, c = t
+                token = t
+                token_len = len(token)
+
+                suffix_search, suffix_len = self.search_suffix(token)
+                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
+                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
+                    continue
+                prefix_search, prefix_len = self.search_prefix(token)
+                if prefix_search and self.trie.get(token[:prefix_len]):
+                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
+                    continue
+            else:
+                c = PHRASE
+            yield t, c, len(t), map(safe_decode, data)
+
+STREET_TYPES_DICTIONARIES = ('street_types.txt',
+                             'directionals.txt',
+                             'concatenated_suffixes_separable.txt',
+                             'concatenated_suffixes_inseparable.txt',
+                             'concatenated_prefixes_separable.txt',
+                             'organizations.txt',
+                             'people.txt',
+                             'personal_suffixes.txt',
+                             'personal_titles.txt',
+                             'qualifiers.txt',
+                             'stopwords.txt',)
+
+GIVEN_NAME_DICTIONARY = 'given_names.txt'
+SURNAME_DICTIONARY = 'surnames.txt'
+
+NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
+                     SURNAME_DICTIONARY,)
+
+
+NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees.txt',
+                                                              'building_types.txt',
+                                                              'company_types.txt',
+                                                              'place_names.txt',
+                                                              'qualifiers.txt',
+                                                              'synonyms.txt',
+                                                              'toponyms.txt',
+                                                              )
+
+
+UNIT_ABBREVIATION_DICTIONARIES = ('level_types.txt',
+                                  'post_office.txt',
+                                  'unit_types.txt',
+                                  )
+
+
+ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
+    NAME_ABBREVIATION_DICTIONARIES + \
+    UNIT_ABBREVIATION_DICTIONARIES + \
+    ('no_number.txt', 'nulls.txt',)
+
+
+_gazetteers = []
+
+
+def create_gazetteer(*dictionaries):
+    g = DictionaryPhraseFilter(*dictionaries)
+    _gazetteers.append(g)
+    return g
+
+
+street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
+names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
+unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
+street_and_unit_types_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + UNIT_ABBREVIATION_DICTIONARIES))
+abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
+given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
+
+
+def init_gazetteers():
+    for g in _gazetteers:
+        g.configure()
diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 026709d2..581ab28f 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -3,21 +3,17 @@ import sys
 
 from collections import defaultdict, OrderedDict
 
-from marisa_trie import BytesTrie
-
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
+from geodata.address_expansions.gazetteers import *
 from geodata.encoding import safe_decode, safe_encode
 from geodata.string_utils import wide_iter, wide_ord
-from geodata.i18n.unicode_paths import DATA_DIR
-from geodata.i18n.normalize import strip_accents
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
 from geodata.text.normalize import normalized_tokens, normalize_string
-from geodata.text.tokenize import tokenize, token_types
-from geodata.text.phrases import PhraseFilter
+from geodata.text.tokenize import tokenize
 
 WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
 
@@ -35,184 +31,6 @@ WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
     'pt': set(['pt', 'br']),
 }
 
-DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
-
-PREFIX_KEY = u'\x02'
-SUFFIX_KEY = u'\x03'
-
-POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
-                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
-                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
-                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
-                               'm', 'mm', 'mmm', 'mmmm'])
-
-PHRASE = 'PHRASE'
-
-
-class DictionaryPhraseFilter(PhraseFilter):
-
-    def __init__(self, *dictionaries):
-        self.dictionaries = dictionaries
-        self.canonicals = {}
-
-    def serialize(self, s):
-        return s
-
-    def deserialize(self, s):
-        return s
-
-    def configure(self, base_dir=DICTIONARIES_DIR):
-        kvs = defaultdict(OrderedDict)
-        for lang in os.listdir(DICTIONARIES_DIR):
-            for filename in self.dictionaries:
-                is_suffix_dictionary = 'suffixes' in filename
-                is_prefix_dictionary = 'prefixes' in filename
-
-                dictionary_name = filename.split('.', 1)[0]
-
-                path = os.path.join(DICTIONARIES_DIR, lang, filename)
-                if not os.path.exists(path):
-                    continue
-
-                for line in open(path):
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    phrases = safe_decode(line).split(u'|')
-                    if not phrases:
-                        continue
-
-                    canonical = phrases[0]
-                    canonical_normalized = normalize_string(canonical)
-
-                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
-
-                    for i, phrase in enumerate(phrases):
-
-                        if phrase in POSSIBLE_ROMAN_NUMERALS:
-                            continue
-
-                        is_canonical = normalize_string(phrase) == canonical_normalized
-
-                        if is_suffix_dictionary:
-                            phrase = SUFFIX_KEY + phrase[::-1]
-                        elif is_prefix_dictionary:
-                            phrase = PREFIX_KEY + phrase
-
-                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
-
-        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
-
-        self.trie = BytesTrie(kvs)
-        self.configured = True
-
-    def search_substring(self, s):
-        if len(s) == 0:
-            return None, 0
-
-        for i in xrange(len(s) + 1):
-            if not self.trie.has_keys_with_prefix(s[:i]):
-                i -= 1
-                break
-        if i > 0:
-            return (self.trie.get(s[:i]), i)
-        else:
-            return None, 0
-
-    def search_suffix(self, token):
-        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
-        if suffix_len > 0:
-            suffix_len -= len(SUFFIX_KEY)
-        return suffix_search, suffix_len
-
-    def search_prefix(self, token):
-        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
-        if prefix_len > 0:
-            prefix_len -= len(PREFIX_KEY)
-        return prefix_search, prefix_len
-
-    def basic_filter(self, tokens):
-        return super(DictionaryPhraseFilter, self).filter(tokens)
-
-    def filter(self, tokens):
-        for p, t, data in self.basic_filter(tokens):
-            if not p:
-                t, c = t
-                token = t
-                token_len = len(token)
-
-                suffix_search, suffix_len = self.search_suffix(token)
-                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
-                    continue
-                prefix_search, prefix_len = self.search_prefix(token)
-                if prefix_search and self.trie.get(token[:prefix_len]):
-                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
-                    continue
-            else:
-                c = PHRASE
-            yield t, c, len(t), map(safe_decode, data)
-
-STREET_TYPES_DICTIONARIES = ('street_types.txt',
-                             'directionals.txt',
-                             'concatenated_suffixes_separable.txt',
-                             'concatenated_suffixes_inseparable.txt',
-                             'concatenated_prefixes_separable.txt',
-                             'organizations.txt',
-                             'people.txt',
-                             'personal_suffixes.txt',
-                             'personal_titles.txt',
-                             'qualifiers.txt',
-                             'stopwords.txt',)
-
-GIVEN_NAME_DICTIONARY = 'given_names.txt'
-SURNAME_DICTIONARY = 'surnames.txt'
-
-NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
-                     SURNAME_DICTIONARY,)
-
-
-NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees.txt',
-                                                              'building_types.txt',
-                                                              'company_types.txt',
-                                                              'place_names.txt',
-                                                              'qualifiers.txt',
-                                                              'synonyms.txt',
-                                                              'toponyms.txt',
-                                                              )
-
-
-UNIT_ABBREVIATION_DICTIONARIES = ('level_types.txt',
-                                  'post_office.txt',
-                                  'unit_types.txt',
-                                  )
-
-
-ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
-    NAME_ABBREVIATION_DICTIONARIES + \
-    UNIT_ABBREVIATION_DICTIONARIES + \
-    ('no_number.txt', 'nulls.txt',)
-
-
-gazetteers = []
-
-
-def create_gazetteer(*dictionaries):
-    g = DictionaryPhraseFilter(*dictionaries)
-    gazetteers.append(g)
-    return g
-
-
-street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
-names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
-unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
-street_and_unit_types_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + UNIT_ABBREVIATION_DICTIONARIES))
-abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
-given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
-
-
 char_scripts = []
 
 script_languages = {}
diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 6f09d4c5..56ddc532 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -54,6 +54,7 @@
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
+from geodata.address_expansions.gazetteers import *
 from geodata.coordinates.conversion import *
 from geodata.countries.country_names import *
 from geodata.geonames.db import GeoNamesDB
@@ -1591,6 +1592,7 @@ if __name__ == '__main__':
     init_country_names()
     init_languages()
     init_disambiguation()
+    init_gazetteers()
 
     language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir)
     osm_rtree = None
@@ -1610,8 +1612,6 @@
     if args.geonames_db:
         geonames = GeoNamesDB(args.geonames_db)
 
-    street_types_gazetteer.configure()
-
     # Can parallelize
     if args.streets_file:
         build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
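
For reference, a minimal usage sketch of the new module (illustrative only, not part of the patch; it assumes the geodata package and its dictionary data are importable, and that normalized_tokens() yields the (token, token_class) pairs the filter expects):

    from geodata.address_expansions.gazetteers import (
        street_types_gazetteer, init_gazetteers)
    from geodata.text.normalize import normalized_tokens

    # Build the BytesTrie for every gazetteer registered via create_gazetteer()
    init_gazetteers()

    tokens = normalized_tokens(u'123 Main St')
    for t, c, length, data in street_types_gazetteer.filter(tokens):
        print c, length, data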