Initial fork commit

commit 2d238cd339
2025-09-06 22:03:29 -04:00
1748 changed files with 932506 additions and 0 deletions

View File

@@ -0,0 +1,233 @@
import random
import re
import six
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
LOWER, UPPER, TITLE, MIXED = range(4)

def token_capitalization(s):
    if s.istitle():
        return TITLE
    elif s.islower():
        return LOWER
    elif s.isupper():
        return UPPER
    else:
        return MIXED
expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')

def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
    expansion_tokens = expansion_token_regex.findall(expansion)

    if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
        expansion_tokenized = tokenize(expansion)
        is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
        if len(expansion) <= 3 or is_acronym:
            return expansion.upper()
        else:
            return expansion.title()
    elif len(tokens) == len(expansion_tokens):
        strings = []
        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
            cap = token_capitalization(t)
            if suf == six.u(' '):
                suf = space_token
            if cap == LOWER:
                strings.append(six.u('').join((e.lower(), suf)))
            elif cap == UPPER:
                strings.append(six.u('').join((e.upper(), suf)))
            elif cap == TITLE:
                strings.append(six.u('').join((e.title(), suf)))
            elif t.lower() == e.lower():
                strings.append(t)
            else:
                strings.append(six.u('').join((e.title(), suf)))
        return six.u('').join(strings)
    else:
        strings = []
        for e, suf in expansion_tokens:
            strings.append(e.title())
            if suf == six.u(' '):
                strings.append(space_token)
            else:
                strings.append(suf)
        return six.u('').join(strings)
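
# Illustrative check of the recasing behavior above (not in the original file):
# with an all-caps source token, recase_abbreviation(u'st', [(u'STREET', token_types.WORD)])
# returns u'ST'; with a title-case token (u'Street', token_types.WORD) it returns u'St'.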

def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------
    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))
                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)

        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))
            i += len(t)
        else:
            token = tokens[i][0]

            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
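
A minimal usage sketch for the abbreviate() routine above (not part of the commit). It assumes this file is importable as geodata.address_expansions.abbreviations, that the dictionaries have been built, and that street_types_gazetteer comes from the gazetteers module further down; the sample string is illustrative.

from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import street_types_gazetteer

# With abbreviate_prob=1.0 every matched phrase is replaced by one of its
# dictionary abbreviations (chosen at random), so a string like
# u'East Main Street' may come back as u'E Main St'; output varies per call.
print(abbreviate(street_types_gazetteer, u'East Main Street', 'en', abbreviate_prob=1.0))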

View File

@@ -0,0 +1,254 @@
import os
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'dictionaries')
ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
address_language_index_template = u'{{{language}, {index}, {length}}}'
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'
address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H

#include <stdlib.h>
#include <stdint.h>

#include "constants.h"
#include "gazetteers.h"

#define MAX_DICTIONARY_TYPES {max_dictionary_types}

typedef struct address_expansion_rule {{
    char *phrase;
    uint32_t num_dictionaries;
    dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
    int32_t canonical_index;
}} address_expansion_rule_t;

typedef struct address_language_index {{
    char language[MAX_LANGUAGE_LEN];
    uint32_t index;
    size_t len;
}} address_language_index_t;

#endif
'''

address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
    {canonical_strings}
}};

address_expansion_rule_t expansion_rules[] = {{
    {expansion_rules}
}};

address_language_index_t expansion_languages[] = {{
    {address_languages}
}};
'''
gazetteer_types = {
    'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
    'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
    'building_types': 'DICTIONARY_BUILDING_TYPE',
    'categories': 'DICTIONARY_CATEGORY',
    'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
    'chains': 'DICTIONARY_CHAIN',
    'company_types': 'DICTIONARY_COMPANY_TYPE',
    'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
    'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
    'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
    'cross_streets': 'DICTIONARY_CROSS_STREET',
    'directionals': 'DICTIONARY_DIRECTIONAL',
    'elisions': 'DICTIONARY_ELISION',
    'entrances': 'DICTIONARY_ENTRANCE',
    'given_names': 'DICTIONARY_GIVEN_NAME',
    'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
    'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
    'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
    'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
    'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
    'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
    'near': 'DICTIONARY_NEAR',
    'no_number': 'DICTIONARY_NO_NUMBER',
    'number': 'DICTIONARY_NUMBER',
    'nulls': 'DICTIONARY_NULL',
    'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
    'people': 'DICTIONARY_NAMED_PERSON',
    'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
    'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
    'place_names': 'DICTIONARY_PLACE_NAME',
    'post_office': 'DICTIONARY_POST_OFFICE',
    'postcodes': 'DICTIONARY_POSTAL_CODE',
    'qualifiers': 'DICTIONARY_QUALIFIER',
    'staircases': 'DICTIONARY_STAIRCASE',
    'stopwords': 'DICTIONARY_STOPWORD',
    'street_names': 'DICTIONARY_STREET_NAME',
    'street_types': 'DICTIONARY_STREET_TYPE',
    'surnames': 'DICTIONARY_SURNAME',
    'synonyms': 'DICTIONARY_SYNONYM',
    'toponyms': 'DICTIONARY_TOPONYM',
    'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
    'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
    'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
}

class InvalidAddressFileException(Exception):
    pass


def read_dictionary_file(path):
    for i, line in enumerate(open(path)):
        line = safe_decode(line.rstrip())
        if not line.strip():
            continue

        if u'}' in line:
            raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))

        phrases = line.split(u'|')
        if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
            raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

        yield phrases


def quote_string(s):
    return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))

class AddressPhraseDictionaries(object):
    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        self.base_dir = base_dir
        self.languages = []
        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))

                dictionary_name = filename.split('.')[0].lower()
                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))

                self.language_dictionaries[language].append(dictionary_name)

                path = os.path.join(language_dir, filename)
                for i, line in enumerate(open(path)):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue

                    if u'}' in line:
                        raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))

                    phrases = line.split(u'|')
                    if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
                        raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

                    self.phrases[(language, dictionary_name)].append(phrases)

        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)
address_phrase_dictionaries = AddressPhraseDictionaries()

def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in address_phrase_dictionaries.languages:
        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
            dictionary_type = gazetteer_types[dictionary_name]

            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for i, p in enumerate(phrases):
                    language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))

            rule_template = address_expansion_rule_template.format(
                phrase=quote_string(phrase),
                num_dictionaries=str(len(dictionary_types)),
                dictionaries=', '.join(dictionary_types),
                canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(
            language=quote_string(language),
            index=language_index,
            length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )

    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()

if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = ADDRESS_EXPANSIONS_DIR

    create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
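
A short sketch of how the loader above is typically used (assuming this file is importable as geodata.address_expansions.address_dictionaries and that resources/dictionaries/<lang>/*.txt exist; the ('en', 'street_types') key and the example row are illustrative).

from geodata.address_expansions.address_dictionaries import (
    address_phrase_dictionaries,
    create_address_expansion_rules_file,
)

# Languages are the sub-directories found under resources/dictionaries
print(address_phrase_dictionaries.languages)

# Each row is a list of synonyms with the canonical phrase first,
# e.g. [u'street', u'st', u'str'] from en/street_types.txt
print(address_phrase_dictionaries.phrases[('en', 'street_types')][:5])

# Regenerates src/address_expansion_rule.h and src/address_expansion_data.c
create_address_expansion_rules_file()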

View File

@@ -0,0 +1,56 @@
import random
import re
import six
from itertools import izip
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
def canonicals_for_language(data, language):
    canonicals = set()

    for d in data:
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals

def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------
    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            if t1 != t2:
                return False
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            return False

    return True
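
A usage sketch for equivalent() (module paths and sample strings assumed, not from the commit). Note that the body above filters with abbreviations_gazetteer from the star import, so the gazetteer argument is effectively unused.

from geodata.address_expansions.equivalence import equivalent
from geodata.address_expansions.gazetteers import street_types_gazetteer

# True when non-phrase tokens match exactly and every matched phrase pair
# shares at least one canonical form for the language; expected True here.
print(equivalent(u'Market Street', u'Market St', street_types_gazetteer, 'en'))
# Different street names do not share a canonical form; expected False.
print(equivalent(u'Market Street', u'Mission Street', street_types_gazetteer, 'en'))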

View File

@@ -0,0 +1,260 @@
import os
import six
from collections import defaultdict, OrderedDict
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize, token_types
from geodata.text.phrases import PhraseFilter
from geodata.enum import EnumValue
from marisa_trie import BytesTrie
DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
PREFIX_KEY = u'\x02'
SUFFIX_KEY = u'\x03'
POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
                               'm', 'mm', 'mmm', 'mmmm'])

class DictionaryPhraseFilter(PhraseFilter):
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)
        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t

                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        if not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
                            phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
        try:
            phrases.next()
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))

STREET_TYPES_ONLY_DICTIONARIES = (
    'street_types',
    'directionals',
    'concatenated_suffixes_separable',
    'concatenated_suffixes_inseparable',
    'people',
    'personal_suffixes',
    'personal_titles',
)

STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + (
    'concatenated_prefixes_separable',
    'organizations',
    'qualifiers',
    'stopwords',
)

GIVEN_NAME_DICTIONARY = 'given_names'
SURNAME_DICTIONARY = 'surnames'

CHAIN_DICTIONARY = 'chains'

SYNONYM_DICTIONARY = 'synonyms'

PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
                              SURNAME_DICTIONARY,)

NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + (
    'academic_degrees',
    'building_types',
    'company_types',
    'place_names',
    'qualifiers',
    'synonyms',
    'toponyms',
)

QUALIFIERS_DICTIONARY = 'qualifiers'

HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')

POSTCODE_DICTIONARIES = ('postcode',)

TOPONYMS_DICTIONARY = 'toponyms'

TOPONYM_ABBREVIATION_DICTIONARIES = (
    'qualifiers',
    'directionals',
    'personal_titles',
    'synonyms',
)

UNIT_ABBREVIATION_DICTIONARIES = (
    'level_types_basement',
    'level_types_mezzanine',
    'level_types_numbered',
    'level_types_standalone',
    'level_types_sub_basement',
    'number',
    'post_office',
    'unit_types_numbered',
    'unit_types_standalone',
)

VENUE_NAME_DICTIONARIES = (
    'academic_degrees',
    'building_types',
    'chains',
    'company_types',
    'directionals',
    'given_names',
    'organizations',
    'people',
    'personal_suffixes',
    'personal_titles',
    'place_names',
    'stopwords',
    'surnames',
)

ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
    NAME_DICTIONARIES + \
    UNIT_ABBREVIATION_DICTIONARIES + \
    ('no_number', 'nulls',)
_gazetteers = []

def create_gazetteer(*dictionaries):
    g = DictionaryPhraseFilter(*dictionaries)
    _gazetteers.append(g)
    return g
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
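
A sketch of how these gazetteers are queried (module paths assumed; sample strings illustrative). filter() collapses dictionary phrases into single PHRASE entries that carry their lang|dictionary|is_canonical|canonical payloads; extract_phrases() and string_contains_phrases() build on it.

from geodata.address_expansions.gazetteers import street_types_gazetteer
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import token_types

tokens = normalized_tokens(u'east main street')
for t, c, length, data in street_types_gazetteer.filter(tokens):
    if c == token_types.PHRASE:
        # t is the list of (token, type) pairs covered by the phrase,
        # data holds the 'lang|dictionary|is_canonical|canonical' strings
        print([t_i for t_i, c_i in t], data)

# Set of dictionary phrases found in the string, restricted to English
print(street_types_gazetteer.extract_phrases(u'east main street', languages=['en']))
print(street_types_gazetteer.string_contains_phrases(u'main road', languages=['en']))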