Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,233 @@
import random
import re
import six
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
LOWER, UPPER, TITLE, MIXED = range(4)
def token_capitalization(s):
if s.istitle():
return TITLE
elif s.islower():
return LOWER
elif s.isupper():
return UPPER
else:
return MIXED
expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')
def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
expansion_tokens = expansion_token_regex.findall(expansion)
if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
expansion_tokenized = tokenize(expansion)
is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
if len(expansion) <= 3 or is_acronym:
return expansion.upper()
else:
return expansion.title()
elif len(tokens) == len(expansion_tokens):
strings = []
for (t, c), (e, suf) in zip(tokens, expansion_tokens):
cap = token_capitalization(t)
if suf == six.u(' '):
suf = space_token
if cap == LOWER:
strings.append(six.u('').join((e.lower(), suf)))
elif cap == UPPER:
strings.append(six.u('').join((e.upper(), suf)))
elif cap == TITLE:
strings.append(six.u('').join((e.title(), suf)))
elif t.lower() == e.lower():
strings.append(t)
else:
strings.append(six.u('').join((e.title(), suf)))
return six.u('').join(strings)
else:
strings = []
for e, suf in expansion_tokens:
strings.append(e.title())
if suf == six.u(' '):
strings.append(space_token)
else:
strings.append(suf)
return six.u('').join(strings)
def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
'''
Abbreviations
-------------
OSM discourages abbreviations, but to make our training data map better
to real-world input, we can safely replace the canonical phrase with an
abbreviated version and retain the meaning of the words
'''
raw_tokens = tokenize_raw(s)
s_utf8 = safe_encode(s)
tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
n = len(tokens)
abbreviated = []
i = 0
def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
data = [d.split(six.b('|')) for d in data]
# local copy
abbreviated = []
n = len(t)
# Append the original tokens with whitespace if there is any
if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0])
if j < n - 1:
abbreviated.append(space_token)
return abbreviated
for lang, dictionary, is_canonical, canonical in data:
if lang not in (language, 'all'):
continue
is_canonical = int(is_canonical)
is_stopword = dictionary == 'stopword'
is_prefix = dictionary.startswith('concatenated_prefixes')
is_suffix = dictionary.startswith('concatenated_suffixes')
is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length
suffix = None
prefix = None
if not is_canonical:
continue
if not is_prefix and not is_suffix:
abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
# TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
# would require an audit of the dictionaries though so abbreviations are listed from
# left-to-right by frequency of usage
token = random.choice(abbreviations) if abbreviations else canonical
token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
abbreviated.append(token)
break
elif is_prefix:
token = tokens[i][0]
prefix, token = token[:length], token[length:]
abbreviated.append(prefix)
if random.random() < separate_prob:
sub_tokens = tokenize(token)
if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
token = six.u('').join((t for t, c in sub_tokens[1:]))
abbreviated.append(space_token)
if token.islower():
abbreviated.append(token.title())
else:
abbreviated.append(token)
abbreviated.append(space_token)
break
elif is_suffix:
token = tokens[i][0]
token, suffix = token[:-length], token[-length:]
concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])
separated_abbreviations = []
phrase = gazetteer.trie.get(suffix.rstrip('.'))
suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
for l, d, _, c in suffix_data:
if l == lang and c == canonical:
separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))
separate = random.random() < separate_prob
if concatenated_abbreviations and not separate:
abbreviation = random.choice(concatenated_abbreviations)
elif separated_abbreviations:
abbreviation = random.choice(separated_abbreviations)
else:
abbreviation = canonical
if separate:
sub_tokens = tokenize(token)
if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
token = six.u('').join((t for t, c in sub_tokens[:-1]))
abbreviated.append(token)
if separate:
abbreviated.append(space_token)
if suffix.isupper():
abbreviated.append(abbreviation.upper())
elif separate:
abbreviated.append(abbreviation.title())
else:
abbreviated.append(abbreviation)
break
else:
for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0])
if j < n - 1:
abbreviated.append(space_token)
return abbreviated
for t, c, length, data in gazetteer.filter(norm_tokens):
if c == token_types.PHRASE:
abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
abbreviated.extend(abbrev_tokens)
if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
abbreviated.append(six.u(' '))
i += len(t)
else:
token = tokens[i][0]
if not non_breaking_dash_regex.search(token):
abbreviated.append(token)
else:
sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]
sub_token_abbreviated = []
sub_i = 0
sub_n = len(sub_tokens)
for t, c, length, data in gazetteer.filter(sub_tokens_norm):
if c == token_types.PHRASE:
abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
sub_token_abbreviated.extend(abbrev_tokens)
sub_i += len(t)
if sub_i < sub_n:
if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
sub_token_abbreviated.append(six.u('.'))
sub_token_abbreviated.append(six.u('-'))
else:
sub_token_abbreviated.append(sub_tokens[sub_i][0])
sub_i += 1
if sub_i < sub_n:
sub_token_abbreviated.append(six.u('-'))
abbreviated.append(six.u('').join(sub_token_abbreviated))
if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
abbreviated.append(six.u(' '))
i += 1
return six.u('').join(abbreviated).strip()
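
A minimal usage sketch of abbreviate() follows (not part of this commit); the import path and the presence of "Street"/"St" in the English street_types dictionary are assumptions, and the output is random by design.

# Hypothetical usage sketch; requires the geodata package and its dictionary resources.
from geodata.address_expansions.gazetteers import street_types_gazetteer
from geodata.address_expansions.abbreviations import abbreviate  # module name assumed
# With abbreviate_prob=0.3 the canonical phrase is usually kept; otherwise a
# dictionary abbreviation such as "St" may be substituted and recased.
print(abbreviate(street_types_gazetteer, u'West Main Street', 'en'))
# e.g. u'West Main St' or u'W Main Street', depending on the random draws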


@@ -0,0 +1,254 @@
import os
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'dictionaries')
ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
address_language_index_template = u'{{{language}, {index}, {length}}}'
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'
address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H
#include <stdlib.h>
#include <stdint.h>
#include "constants.h"
#include "gazetteers.h"
#define MAX_DICTIONARY_TYPES {max_dictionary_types}
typedef struct address_expansion_rule {{
char *phrase;
uint32_t num_dictionaries;
dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
int32_t canonical_index;
}} address_expansion_rule_t;
typedef struct address_language_index {{
char language[MAX_LANGUAGE_LEN];
uint32_t index;
size_t len;
}} address_language_index_t;
#endif
'''
address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
{canonical_strings}
}};
address_expansion_rule_t expansion_rules[] = {{
{expansion_rules}
}};
address_language_index_t expansion_languages[] = {{
{address_languages}
}};
'''
gazetteer_types = {
'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
'building_types': 'DICTIONARY_BUILDING_TYPE',
'categories': 'DICTIONARY_CATEGORY',
'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
'chains': 'DICTIONARY_CHAIN',
'company_types': 'DICTIONARY_COMPANY_TYPE',
'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
'cross_streets': 'DICTIONARY_CROSS_STREET',
'directionals': 'DICTIONARY_DIRECTIONAL',
'elisions': 'DICTIONARY_ELISION',
'entrances': 'DICTIONARY_ENTRANCE',
'given_names': 'DICTIONARY_GIVEN_NAME',
'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
'near': 'DICTIONARY_NEAR',
'no_number': 'DICTIONARY_NO_NUMBER',
'number': 'DICTIONARY_NUMBER',
'nulls': 'DICTIONARY_NULL',
'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
'people': 'DICTIONARY_NAMED_PERSON',
'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
'place_names': 'DICTIONARY_PLACE_NAME',
'post_office': 'DICTIONARY_POST_OFFICE',
'postcodes': 'DICTIONARY_POSTAL_CODE',
'qualifiers': 'DICTIONARY_QUALIFIER',
'staircases': 'DICTIONARY_STAIRCASE',
'stopwords': 'DICTIONARY_STOPWORD',
'street_names': 'DICTIONARY_STREET_NAME',
'street_types': 'DICTIONARY_STREET_TYPE',
'surnames': 'DICTIONARY_SURNAME',
'synonyms': 'DICTIONARY_SYNONYM',
'toponyms': 'DICTIONARY_TOPONYM',
'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
}
class InvalidAddressFileException(Exception):
pass
def read_dictionary_file(path):
for i, line in enumerate(open(path)):
line = safe_decode(line.rstrip())
if not line.strip():
continue
if u'}' in line:
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
phrases = line.split(u'|')
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
yield phrases
def quote_string(s):
return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))
class AddressPhraseDictionaries(object):
def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
self.base_dir = base_dir
self.languages = []
self.language_dictionaries = defaultdict(list)
self.phrases = defaultdict(list)
for language in os.listdir(base_dir):
language_dir = os.path.join(base_dir, language)
if not os.path.isdir(language_dir):
continue
self.languages.append(language)
for filename in os.listdir(language_dir):
if not filename.endswith('.txt'):
raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
dictionary_name = filename.split('.')[0].lower()
if dictionary_name not in gazetteer_types:
raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
self.language_dictionaries[language].append(dictionary_name)
path = os.path.join(language_dir, filename)
for i, line in enumerate(open(path)):
line = safe_decode(line.rstrip())
if not line.strip():
continue
if u'}' in line:
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
phrases = line.split(u'|')
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
self.phrases[(language, dictionary_name)].append(phrases)
self.language_dictionaries = dict(self.language_dictionaries)
self.phrases = dict(self.phrases)
address_phrase_dictionaries = AddressPhraseDictionaries()
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
address_languages = []
expansion_rules = []
canonical_strings = []
max_dictionary_types = 0
for language in address_phrase_dictionaries.languages:
num_language_rules = 0
language_index = len(expansion_rules)
language_canonical_dictionaries = defaultdict(list)
canonical_indices = {}
for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
dictionary_type = gazetteer_types[dictionary_name]
for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
canonical = phrases[0]
if len(phrases) > 1:
canonical_index = canonical_indices.get(canonical, None)
if canonical_index is None:
canonical_index = len(canonical_strings)
canonical_strings.append(quote_string(canonical))
canonical_indices[canonical] = canonical_index
else:
canonical_index = -1
for i, p in enumerate(phrases):
language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)
for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
num_dictionaries=str(len(dictionary_types)),
dictionaries=', '.join(dictionary_types),
canonical_index=canonical_index)
expansion_rules.append(rule_template)
num_language_rules += 1
address_languages.append(address_language_index_template.format(language=quote_string(language),
index=language_index,
length=num_language_rules))
header = address_expansion_rule_header_template.format(
max_dictionary_types=str(max_dictionary_types)
)
out = open(header_file, 'w')
out.write(safe_encode(header))
out.close()
data_file = address_expansion_data_file_template.format(
canonical_strings=u''',
'''.join(canonical_strings),
expansion_rules=u''',
'''.join(expansion_rules),
address_languages=u''',
'''.join(address_languages),
)
out = open(output_file, 'w')
out.write(safe_encode(data_file))
out.close()
if __name__ == '__main__':
if len(sys.argv) > 1:
input_dir = sys.argv[1]
else:
input_dir = ADDRESS_EXPANSIONS_DIR
create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
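
For illustration, rendering the two templates above by hand with a single assumed dictionary entry produces fragments of the following shape (the phrase and dictionary name are hypothetical):

# Hypothetical sketch of the generated C fragments.
rule = address_expansion_rule_template.format(
    phrase=quote_string(u'street'),              # assumed entry
    num_dictionaries=str(1),
    dictionaries='DICTIONARY_STREET_TYPE',
    canonical_index=-1)
# rule == u'{"street", 1, {DICTIONARY_STREET_TYPE}, -1}'
lang = address_language_index_template.format(
    language=quote_string('en'), index=0, length=1)
# lang == u'{"en", 0, 1}'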


@@ -0,0 +1,56 @@
import random
import re
import six
from itertools import izip
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
def canonicals_for_language(data, language):
canonicals = set()
for d in data:
lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
if language is None or lang == language:
canonicals.add(canonical)
return canonicals
def equivalent(s1, s2, gazetteer, language):
'''
Address/place equivalence
-------------------------
Tests whether two strings refer to the same phrase once known
abbreviations are treated as interchangeable with their canonical
expansions, e.g. "Market Street" vs. "Market St"
'''
tokens_s1 = normalized_tokens(s1)
tokens_s2 = normalized_tokens(s2)
abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))
if len(abbreviated_s1) != len(abbreviated_s2):
return False
for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
if t1 != t2:
return False
elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
canonicals_s1 = canonicals_for_language(d1, language)
canonicals_s2 = canonicals_for_language(d2, language)
if not canonicals_s1 & canonicals_s2:
return False
else:
return False
return True
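
A usage sketch for equivalent(); the module path and the expected results assume that the English dictionaries list "ave" as an abbreviation of "avenue".

# Hypothetical usage sketch; requires the geodata dictionary resources.
from geodata.address_expansions.gazetteers import abbreviations_gazetteer
from geodata.address_expansions.equivalence import equivalent  # module name assumed

equivalent(u'Fifth Avenue', u'Fifth Ave', abbreviations_gazetteer, 'en')     # True (assumed)
equivalent(u'Fifth Avenue', u'Sixth Avenue', abbreviations_gazetteer, 'en')  # False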


@@ -0,0 +1,260 @@
import os
import six
from collections import defaultdict, OrderedDict
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize, token_types
from geodata.text.phrases import PhraseFilter
from geodata.enum import EnumValue
from marisa_trie import BytesTrie
DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
PREFIX_KEY = u'\x02'
SUFFIX_KEY = u'\x03'
POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
'm', 'mm', 'mmm', 'mmmm'])
class DictionaryPhraseFilter(PhraseFilter):
serialize = safe_encode
deserialize = safe_decode
def __init__(self, *dictionaries):
self.dictionaries = dictionaries
self.canonicals = {}
kvs = defaultdict(OrderedDict)
for language in address_phrase_dictionaries.languages:
for dictionary_name in self.dictionaries:
is_suffix_dictionary = 'suffixes' in dictionary_name
is_prefix_dictionary = 'prefixes' in dictionary_name
for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
canonical = phrases[0]
canonical_normalized = normalize_string(canonical)
self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]
for i, phrase in enumerate(phrases):
if phrase in POSSIBLE_ROMAN_NUMERALS:
continue
is_canonical = normalize_string(phrase) == canonical_normalized
if is_suffix_dictionary:
phrase = SUFFIX_KEY + phrase[::-1]
elif is_prefix_dictionary:
phrase = PREFIX_KEY + phrase
kvs[phrase][(language, dictionary_name, canonical)] = is_canonical
kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
self.trie = BytesTrie(kvs)
def serialize(self, s):
return s
def deserialize(self, s):
return s
def search_substring(self, s):
if len(s) == 0:
return None, 0
for i in xrange(len(s) + 1):
if not self.trie.has_keys_with_prefix(s[:i]):
i -= 1
break
if i > 0:
return (self.trie.get(s[:i]), i)
else:
return None, 0
def search_suffix(self, token):
suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
if suffix_len > 0:
suffix_len -= len(SUFFIX_KEY)
return suffix_search, suffix_len
def search_prefix(self, token):
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
if prefix_len > 0:
prefix_len -= len(PREFIX_KEY)
return prefix_search, prefix_len
def basic_filter(self, tokens):
return super(DictionaryPhraseFilter, self).filter(tokens)
def filter(self, tokens):
for p, t, data in self.basic_filter(tokens):
if not p:
t, c = t
token = t
token_len = len(token)
suffix_search, suffix_len = self.search_suffix(token)
if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
continue
prefix_search, prefix_len = self.search_prefix(token)
if prefix_search and self.trie.get(token[:prefix_len]):
yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
continue
else:
c = token_types.PHRASE
yield t, c, len(t), map(safe_decode, data)
def gen_phrases(self, s, canonical_only=False, languages=None):
tokens = tokenize(s)
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
if not languages:
languages = None
elif not hasattr(languages, '__iter__'):
languages = [languages]
if not hasattr(languages, '__contains__'):
languages = set(languages)
for t, c, length, data in self.filter(norm_tokens):
if c == token_types.PHRASE:
if not canonical_only and languages is None:
yield six.u(' ').join([t_i for t_i, c_i in t])
else:
phrase = None
for d in data:
lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
yield phrase
def string_contains_phrases(self, s, canonical_only=False, languages=None):
phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
try:
phrases.next()
return True
except StopIteration:
return False
def extract_phrases(self, s, canonical_only=False, languages=None):
return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))
STREET_TYPES_ONLY_DICTIONARIES = ('street_types',
'directionals',
'concatenated_suffixes_separable',
'concatenated_suffixes_inseparable',
'people',
'personal_suffixes',
'personal_titles',
)
STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + ('concatenated_prefixes_separable',
'organizations',
'qualifiers',
'stopwords',
)
GIVEN_NAME_DICTIONARY = 'given_names'
SURNAME_DICTIONARY = 'surnames'
CHAIN_DICTIONARY = 'chains'
SYNONYM_DICTIONARY = 'synonyms'
PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
SURNAME_DICTIONARY,)
NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
'building_types',
'company_types',
'place_names',
'qualifiers',
'synonyms',
'toponyms',
)
QUALIFIERS_DICTIONARY = 'qualifiers'
HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')
POSTCODE_DICTIONARIES = ('postcode',)
TOPONYMS_DICTIONARY = 'toponyms'
TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers',
'directionals',
'personal_titles',
'synonyms',
)
UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
'level_types_mezzanine',
'level_types_numbered',
'level_types_standalone',
'level_types_sub_basement',
'number',
'post_office',
'unit_types_numbered',
'unit_types_standalone',
)
VENUE_NAME_DICTIONARIES = ('academic_degrees',
'building_types',
'chains',
'company_types',
'directionals',
'given_names',
'organizations',
'people',
'personal_suffixes',
'personal_titles',
'place_names',
'stopwords',
'surnames',
)
ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
NAME_DICTIONARIES + \
UNIT_ABBREVIATION_DICTIONARIES + \
('no_number', 'nulls',)
_gazetteers = []
def create_gazetteer(*dictionaries):
g = DictionaryPhraseFilter(*dictionaries)
_gazetteers.append(g)
return g
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
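
A sketch of phrase extraction with the gazetteers defined above; the concrete results depend entirely on the bundled dictionaries.

# Hypothetical usage sketch.
street_types_gazetteer.string_contains_phrases(u'Market Street', languages='en')
# True, assuming "street" appears in the English street_types dictionary
phrases = abbreviations_gazetteer.extract_phrases(u'123 W Market St', languages='en')
# a set of the lower-cased phrases recognized in the string, e.g. set([u'w', u'st'])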


@@ -0,0 +1,29 @@
import six
from collections import defaultdict
class Aliases(object):
def __init__(self, aliases):
self.aliases = aliases
self.priorities = {k: i for i, k in enumerate(aliases)}
def key_priority(self, key):
return self.priorities.get(key, len(self.priorities))
def get(self, key, default=None):
return self.aliases.get(key, default)
def replace(self, components):
replacements = defaultdict(list)
values = {}
for k in list(components):
new_key = self.aliases.get(k)
if new_key and new_key not in components:
value = components.pop(k)
values[k] = value
replacements[new_key].append(k)
for key, source_keys in six.iteritems(replacements):
source_keys.sort(key=self.key_priority)
value = values[source_keys[0]]
components[key] = value
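
A small sketch of how Aliases.replace rewrites a component dictionary; the alias keys here mirror the table used by AddressFormatter later in this commit.

# Hypothetical usage sketch.
from collections import OrderedDict
aliases = Aliases(OrderedDict([('street', 'road'), ('street_name', 'road')]))
components = {'street_name': u'Calle Mayor', 'house_number': u'7'}
aliases.replace(components)
# components is now {'road': u'Calle Mayor', 'house_number': u'7'}.
# If both 'street' and 'street_name' were present, 'street' would win,
# since it appears earlier in the alias ordering.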


@@ -0,0 +1,924 @@
# -*- coding: utf-8 -*-
import copy
import os
import pystache
import random
import re
import six
import subprocess
import yaml
from collections import OrderedDict, defaultdict
from itertools import ifilter
from geodata.address_formatting.aliases import Aliases
from geodata.configs.utils import nested_get, recursive_merge
from geodata.math.floats import isclose
from geodata.math.sampling import weighted_choice, cdf
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
from geodata.encoding import safe_decode
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
this_dir = os.path.realpath(os.path.dirname(__file__))
FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'formatting', 'global.yaml')
class AddressFormatter(object):
'''
Approximate Python port of lokku's Geo::Address::Formatter
Usage:
address_formatter = AddressFormatter()
components = {
'house': u'Anticafé',
'house_number': '2',
'road': u'Calle de la Unión',
'postcode': '28013',
'city': u'Madrid',
}
country = 'es'
language = 'es'
address_formatter.format_address(components, country, language)
'''
whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
splitter = ' | '
separator_tag = 'SEP'
field_separator_tag = 'FSEP'
CATEGORY = 'category'
NEAR = 'near'
ATTENTION = 'attention'
CARE_OF = 'care_of'
HOUSE = 'house'
HOUSE_NUMBER = 'house_number'
PO_BOX = 'po_box'
ROAD = 'road'
BUILDING = 'building'
ENTRANCE = 'entrance'
STAIRCASE = 'staircase'
LEVEL = 'level'
UNIT = 'unit'
INTERSECTION = 'intersection'
SUBDIVISION = 'subdivision'
METRO_STATION = 'metro_station'
SUBURB = 'suburb'
CITY_DISTRICT = 'city_district'
CITY = 'city'
ISLAND = 'island'
STATE = 'state'
STATE_DISTRICT = 'state_district'
POSTCODE = 'postcode'
COUNTRY_REGION = 'country_region'
COUNTRY = 'country'
WORLD_REGION = 'world_region'
component_order = {k: i for i, k in enumerate([
CATEGORY,
NEAR,
ATTENTION,
CARE_OF,
HOUSE,
PO_BOX,
HOUSE_NUMBER,
BUILDING,
ENTRANCE,
STAIRCASE,
LEVEL,
UNIT,
ROAD,
INTERSECTION,
SUBDIVISION,
METRO_STATION,
SUBURB,
CITY,
CITY_DISTRICT,
ISLAND,
STATE,
STATE_DISTRICT,
POSTCODE,
COUNTRY_REGION,
COUNTRY,
WORLD_REGION,
])}
BOUNDARY_COMPONENTS_ORDERED = [
SUBDIVISION,
METRO_STATION,
SUBURB,
CITY_DISTRICT,
CITY,
ISLAND,
STATE_DISTRICT,
STATE,
COUNTRY_REGION,
COUNTRY,
WORLD_REGION,
]
BOUNDARY_COMPONENTS = set(BOUNDARY_COMPONENTS_ORDERED)
SUB_BUILDING_COMPONENTS = {
ENTRANCE,
STAIRCASE,
LEVEL,
UNIT,
}
STREET_COMPONENTS = {
HOUSE_NUMBER,
ROAD,
}
ADDRESS_LEVEL_COMPONENTS = STREET_COMPONENTS | SUB_BUILDING_COMPONENTS
NAME_COMPONENTS = {
ATTENTION,
CARE_OF,
HOUSE,
}
address_formatter_fields = set(component_order)
aliases = Aliases(
OrderedDict([
('street', ROAD),
('street_name', ROAD),
('hamlet', CITY),
('village', CITY),
('neighborhood', SUBURB),
('neighbourhood', SUBURB),
('city_district', CITY_DISTRICT),
('county', STATE_DISTRICT),
('state_code', STATE),
('country_name', COUNTRY),
('continent', WORLD_REGION),
('postal_code', POSTCODE),
('post_code', POSTCODE),
])
)
category_template = '{{{category}}} {{{near}}} {{{place}}}'
chain_template = '{{{house}}} {{{near}}} {{{place}}}'
intersection_template = '{{{road1}}} {{{intersection}}} {{{road2}}} {{{place}}}'
template_address_parts = [HOUSE, HOUSE_NUMBER, ROAD]
template_admin_parts = [CITY, STATE, COUNTRY]
template_address_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_address_parts]))
template_admin_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_admin_parts]))
MINIMAL_COMPONENT_KEYS = [
(ROAD, HOUSE_NUMBER),
(ROAD, HOUSE),
(ROAD, POSTCODE)
]
FIRST, BEFORE, AFTER, LAST = range(4)
def __init__(self, scratch_dir='/tmp', splitter=None):
if splitter is not None:
self.splitter = splitter
self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
self.clone_repo()
self.load_config()
self.load_country_formats()
self.language_code_replacements = self.config['language_code_replacements']
self.setup_insertion_probabilities()
self.setup_no_name_templates()
self.setup_place_only_templates()
self.template_cache = {}
self.parsed_cache = {}
def clone_repo(self):
subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
def load_country_formats(self):
config = yaml.load(open(os.path.join(self.formatter_repo_path,
'conf', 'countries', 'worldwide.yaml')))
self.country_aliases = {}
self.house_number_ordering = {}
for key in list(config):
country = key
language = None
if '_' in key:
country, language = country.split('_', 1)
value = config[key]
if hasattr(value, 'items'):
address_template = value.get('address_template')
if not address_template and 'use_country' in value:
# Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references
if value['use_country'] in (country, False):
continue
self.country_aliases[country] = value['use_country']
address_template = config[value['use_country']]['address_template']
if address_template:
value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language)
post_format_replacements = value.get('postformat_replace')
if post_format_replacements:
value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')] for pattern, replacement in post_format_replacements]
else:
address_template = value
config[country] = self.add_postprocessing_tags(value, country, language=language)
try:
house_number_index = address_template.index(self.tag_token(self.HOUSE_NUMBER))
road_index = address_template.index(self.tag_token(self.ROAD))
if house_number_index < road_index:
self.house_number_ordering[key.lower()] = -1
else:
self.house_number_ordering[key.lower()] = 1
except ValueError:
self.house_number_ordering[key.lower()] = 0
self.country_formats = config
def load_config(self):
config = yaml.load(open(FORMATTER_CONFIG))
self.config = config.get('global', {})
language_configs = config.get('languages', {})
self.language_configs = {}
for language in language_configs:
language_config = language_configs[language]
config_copy = copy.deepcopy(self.config)
self.language_configs[language] = recursive_merge(config_copy, language_config)
country_configs = config.get('countries', {})
self.country_configs = {}
for country in country_configs:
country_config = country_configs[country]
config_copy = copy.deepcopy(self.config)
self.country_configs[country] = recursive_merge(config_copy, country_config)
def get_property(self, keys, country, language=None, default=None):
if isinstance(keys, six.string_types):
keys = keys.split('.')
keys = tuple(keys)
value = nested_get(self.language_configs, (language,) + keys, default=default)
if not value:
value = nested_get(self.country_configs, (country,) + keys, default=default)
if not value:
value = nested_get(self.config, keys, default=default)
return value
def insertion_distribution(self, insertions):
values = []
probs = []
for k, v in six.iteritems(insertions):
if k == 'conditional' or not v:
continue
if 'before' in v:
val = (self.BEFORE, v['before'])
elif 'after' in v:
val = (self.AFTER, v['after'])
elif 'last' in v:
val = (self.LAST, None)
elif 'first' in v:
val = (self.FIRST, None)
else:
raise ValueError('Insertions must contain one of {{first, before, after, last}}. Value was: {}'.format(v))
prob = v['probability']
values.append(val)
probs.append(prob)
# If the probabilities don't sum to 1, add a "do nothing" action
if not isclose(sum(probs), 1.0):
probs.append(1.0 - sum(probs))
values.append((None, None))
return values, cdf(probs)
def insertion_probs(self, config):
component_insertions = {}
for component, insertions in six.iteritems(config):
component_insertions[component] = self.insertion_distribution(insertions)
return component_insertions
def inverted(self, template):
lines = template.split(six.u('\n'))
return six.u('\n').join(reversed(lines))
def house_number_before_road(self, country, language=None):
key = value = None
if language is not None:
key = six.u('_').join((country.lower(), language.lower()))
if key in self.house_number_ordering:
value = self.house_number_ordering[key]
if value is None:
key = country
if key in self.house_number_ordering:
value = self.house_number_ordering[key]
if value is None:
value = 0
if value <= 0:
return True
else:
return False
def conditional_insertion_probs(self, conditionals):
conditional_insertions = defaultdict(OrderedDict)
for component, value in six.iteritems(conditionals):
if 'conditional' in value:
conditionals = value['conditional']
for c in conditionals:
other = c['component']
conditional_insertions[component][other] = self.insertion_distribution(c['probabilities'])
return conditional_insertions
def setup_insertion_probabilities(self):
config = self.config['insertions']
self.global_insertions = self.insertion_probs(config)
self.global_conditionals = self.conditional_insertion_probs(config)
self.global_invert_probability = self.config.get('invert_probability', 0.0)
self.country_insertions = {}
self.country_conditionals = {}
self.country_invert_probabilities = {}
for country, config in six.iteritems(self.country_configs):
if 'insertions' in config:
self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
self.country_conditionals[country.lower()] = self.conditional_insertion_probs(config['insertions'])
if 'invert_probability' in config:
self.country_invert_probabilities[country] = config['invert_probability']
self.language_insertions = {}
self.language_conditionals = {}
for language, config in six.iteritems(self.language_configs):
if 'insertions' in config:
self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])
self.language_conditionals[language.lower()] = self.conditional_insertion_probs(config['insertions'])
def setup_no_name_templates(self):
self.templates_no_name = {}
for country, config in six.iteritems(self.country_formats):
if hasattr(config, 'items') and 'address_template' in config:
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS)
self.templates_no_name[country] = address_template
def setup_place_only_templates(self):
self.templates_place_only = {}
for country, config in six.iteritems(self.country_formats):
if hasattr(config, 'items') and 'address_template' in config:
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS | self.ADDRESS_LEVEL_COMPONENTS)
self.templates_place_only[country] = address_template
def country_template(self, c):
return self.country_formats.get(c, self.country_formats['default'])
def is_reverse(self, template):
address_parts_match = self.template_address_parts_re.search(template)
admin_parts_match = list(self.template_admin_parts_re.finditer(template))
# last instance of city/state/country occurs before the first instance of house_number/road
return admin_parts_match[-1].start() < address_parts_match.start()
def build_first_of_template(self, keys):
""" For constructing """
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
def tag_token(self, key):
return '{{{{{{{key}}}}}}}'.format(key=key)
def remove_components(self, template, tags):
new_components = []
tags = set(tags)
parsed = pystache.parse(safe_decode(template))
last_removed = False
for i, el in enumerate(parsed._parse_tree):
if hasattr(el, 'parsed'):
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key') and e.key not in tags]
if keys:
new_components.append(self.build_first_of_template(keys))
last_removed = False
else:
last_removed = True
elif hasattr(el, 'key'):
if el.key not in tags:
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
last_removed = False
else:
last_removed = True
elif not last_removed:
new_components.append(el)
else:
last_removed = False
return ''.join(new_components).strip()
def insert_component(self, template, tag, before=None, after=None, first=False, last=False,
separate=True, is_reverse=False, exact_order=True):
if not before and not after and not first and not last:
return
template = template.rstrip()
if not exact_order:
first_template_regex = re.compile(six.u('{{#first}}.*?{{/first}}'), re.UNICODE)
sans_firsts = first_template_regex.sub(six.u(''), template)
tag_match = re.compile(self.tag_token(tag)).search(sans_firsts)
if before:
before_match = re.compile(self.tag_token(before)).search(sans_firsts)
if before_match and tag_match and before_match.start() > tag_match.start():
return template
if after:
after_match = re.compile(self.tag_token(after)).search(sans_firsts)
if after_match and tag_match and tag_match.start() > after_match.start():
return template
key_added = False
skip_next_non_token = False
new_components = []
tag_token = self.tag_token(tag)
parsed = pystache.parse(safe_decode(template))
num_tokens = len(parsed._parse_tree)
for i, el in enumerate(parsed._parse_tree):
if hasattr(el, 'parsed'):
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
if (before in set(keys) or first) and not key_added:
token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
new_components.extend([tag_token, token])
key_added = True
keys = [k for k in keys if self.aliases.get(k, k) != tag]
if keys:
new_components.append(self.build_first_of_template(keys))
else:
while new_components and '{' not in new_components[-1]:
new_components.pop()
continue
if (after in set(keys) or i == num_tokens - 1) and not key_added:
token = '\n'
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
token = parsed._parse_tree[i + 1]
new_components.extend([token, tag_token])
key_added = True
elif hasattr(el, 'key'):
if el.key == tag:
if i == num_tokens - 1 and last:
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
skip_next_non_token = True
continue
if (el.key == before or first) and not key_added:
token = '\n'
if new_components and '{' not in new_components[-1]:
token = new_components[-1]
new_components.extend([tag_token, token])
key_added = True
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
if (el.key == after or i == num_tokens - 1) and not key_added:
token = '\n'
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
token = parsed._parse_tree[i + 1]
new_components.extend([token, tag_token])
key_added = True
elif not skip_next_non_token:
new_components.append(el)
if i == num_tokens - 1 and not key_added:
key_added = True
new_components.append(tag_token)
skip_next_non_token = False
return ''.join(new_components)
def add_postprocessing_tags(self, template, country, language=None):
is_reverse = self.is_reverse(template)
i = None
pivot = None
pivot_keys = (AddressFormatter.CITY, AddressFormatter.STATE, AddressFormatter.COUNTRY)
for component in pivot_keys:
token = self.tag_token(component)
if token in template:
i = self.BOUNDARY_COMPONENTS_ORDERED.index(component)
pivot = component
break
if i is None:
raise ValueError('Template {} does not contain one of {{{}}}'.format(country, ','.join(pivot_keys)))
prev = pivot
if i > 1:
for component in self.BOUNDARY_COMPONENTS_ORDERED[i - 1:0:-1]:
kw = {'before': prev} if not is_reverse else {'after': prev}
template = self.insert_component(template, component, exact_order=False, **kw)
prev = component
prev = pivot
if i < len(self.BOUNDARY_COMPONENTS_ORDERED) - 1:
for component in self.BOUNDARY_COMPONENTS_ORDERED[i + 1:]:
kw = {'after': prev} if not is_reverse else {'before': prev}
template = self.insert_component(template, component, exact_order=False, **kw)
prev = component
return template
def render_template(self, template, components, tagged=False):
def render_first(text):
text = pystache.render(text, **components)
splits = (e.strip() for e in text.split('||'))
selected = next(ifilter(bool, splits), '')
return selected
output = pystache.render(template, first=render_first,
**components).strip()
values = self.whitespace_component_regex.split(output)
splitter = self.splitter if not tagged else ' {}/{} '.format(self.splitter.strip(), self.field_separator_tag)
values = [self.strip_component(val, tagged=tagged) for val in values]
output = splitter.join([
val for val in values if val.strip()
])
return output
def minimal_components(self, components):
for component_list in self.MINIMAL_COMPONENT_KEYS:
if all((c in components for c in component_list)):
return True
return False
def post_replacements(self, template, text):
components = []
seen = set()
for component in text.split(self.splitter):
component = component.strip()
if component not in seen:
components.append(component)
seen.add(component)
text = self.splitter.join(components)
post_format_replacements = template.get('postformat_replace')
if post_format_replacements:
for regex, replacement in post_format_replacements:
text = re.sub(regex, replacement, text)
return text
def revised_template(self, template, components, country, language=None):
if not template:
return None
country_language = None
if language:
country_language = '{}_{}'.format(country, language)
alias_country = self.country_aliases.get(country.upper(), country).lower()
for term in (country, country_language):
if term in self.country_insertions or term in self.country_conditionals:
break
else:
country = alias_country
cache_keys = []
invert_probability = self.country_invert_probabilities.get(country, self.global_invert_probability)
if random.random() < invert_probability:
cache_keys.append('inverted')
cache_key = tuple(sorted(cache_keys))
if cache_key in self.template_cache:
template = self.template_cache[cache_key]
else:
template = self.inverted(template)
self.template_cache[cache_key] = template
for component in sorted(components, key=self.component_order.get):
scope = country
insertions = nested_get(self.country_insertions, (country, component), default=None)
conditionals = nested_get(self.country_conditionals, (country, component), default=None)
if insertions is None and language:
insertions = nested_get(self.country_insertions, (country_language, component), default=None)
scope = country_language
if conditionals is None and language:
conditionals = nested_get(self.country_conditionals, (country_language, component), default=None)
if insertions is None and language:
insertions = nested_get(self.language_insertions, (language, component), default=None)
scope = 'lang:{}'.format(language)
if conditionals is None and language:
conditionals = nested_get(self.language_conditionals, (language, component), default=None)
if insertions is None:
insertions = nested_get(self.global_insertions, (component,), default=None)
scope = None
if conditionals is None:
conditionals = nested_get(self.global_conditionals, (component,), default=None)
if insertions is not None:
conditional_insertions = None
if conditionals is not None:
for k, v in six.iteritems(conditionals):
if k in components:
conditional_insertions = v
break
order, other = None, None
# Check the conditional probabilities first
if conditional_insertions is not None:
values, probs = conditional_insertions
order, other = weighted_choice(values, probs)
# If there are no conditional probabilities or the "default" value was chosen, sample from the marginals
if other is None:
values, probs = insertions
order, other = weighted_choice(values, probs)
# Even though we may change the value of "other" below, use
# the original cache key because changes from here on are
# deterministic and should be cached.
insertion_id = (scope, component, order, other)
cache_keys.append(insertion_id)
cache_key = tuple(sorted(cache_keys))
if cache_key in self.template_cache:
template = self.template_cache[cache_key]
continue
other_token = self.tag_token(other)
# Don't allow insertions between road and house_number
# This can happen if e.g. "level" is supposed to be inserted
# after house number assuming that it's a continental European
# address where house number comes after road. If in a previous
# insertion we were to swap house_number and road to create an
# English-style address, the final ordering would be
# house_number, unit, road, which we don't want. So effectively
# treat house_number and road as an atomic unit.
if other == self.HOUSE_NUMBER and component != self.ROAD:
road_tag = self.tag_token(self.ROAD)
house_number_tag = other_token
if house_number_tag in template and road_tag in template:
road_after_house_number = template.index(road_tag) > template.index(house_number_tag)
if road_after_house_number and order == self.AFTER:
other = self.ROAD
elif not road_after_house_number and order == self.BEFORE:
other = self.ROAD
elif other == self.ROAD and component != self.HOUSE_NUMBER:
house_number_tag = self.tag_token(self.HOUSE_NUMBER)
road_tag = other_token
if house_number_tag in template and road_tag in template:
road_before_house_number = template.index(road_tag) < template.index(house_number_tag)
if road_before_house_number and order == self.AFTER:
other = self.HOUSE_NUMBER
elif not road_before_house_number and order == self.BEFORE:
other = self.HOUSE_NUMBER
if order == self.BEFORE and other_token in template:
template = self.insert_component(template, component, before=other)
elif order == self.AFTER and other_token in template:
template = self.insert_component(template, component, after=other)
elif order == self.LAST:
template = self.insert_component(template, component, last=True)
elif order == self.FIRST:
template = self.insert_component(template, component, first=True)
else:
continue
self.template_cache[cache_key] = template
return template
def remove_repeat_template_separators(self, template):
return re.sub('(?:[\s]*([,;\-]/{})[\s]*){{2,}}'.format(self.separator_tag), r' \1 ', template)
def tag_template_separators(self, template):
template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template)
return template
def strip_component(self, value, tagged=False):
if not tagged:
comma = token_types.COMMA.value
hyphen = token_types.HYPHEN.value
start = end = 0
tokens = tokenize_raw(value.strip())
for token_start, token_length, token_type in tokens:
start = token_start
if token_type not in (comma, hyphen):
break
else:
start = token_start + token_length
for token_start, token_length, token_type in reversed(tokens):
end = token_start + token_length
if token_type not in (comma, hyphen):
break
else:
end = token_start
return value[start:end]
else:
start = end = 0
tokens = value.split()
separator_tag = self.separator_tag
for i, t in enumerate(tokens):
t, c = t.rsplit('/', 1)
start = i
if c != separator_tag:
break
else:
start = i + 1
num_tokens = len(tokens)
for j, t in enumerate(reversed(tokens)):
t, c = t.rsplit('/', 1)
end = num_tokens - j
if c != separator_tag:
break
else:
end = num_tokens - j - 1
return six.u(' ').join(tokens[start:end])
def get_template_from_config(self, config, country, language=None):
template = None
if language:
language = self.language_code_replacements.get(language, language.split('_')[0])
# For countries like China and Japan where the country format varies
# based on which language is being used
template = config.get('{}_{}'.format(country.upper(), language.lower()), None)
if not template:
template = config.get(country.upper())
if not template:
return None
return template
def get_template(self, country, language=None):
return self.get_template_from_config(self.country_formats, country, language=language)
def get_no_name_template(self, country, language=None):
return self.get_template_from_config(self.templates_no_name, country, language=language)
def get_place_template(self, country, language=None):
return self.get_template_from_config(self.templates_place_only, country, language=language)
def tagged_tokens(self, name, label):
return six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), label if t != ',' else self.separator_tag) for t, c in tokenize(name)])
def template_language_matters(self, country, language):
return '{}_{}'.format(country.upper(), language) in self.country_formats or '{}_{}'.format(country, language) in self.country_formats
def format_category_query(self, category_query, address_components, country, language, tag_components=True):
if tag_components:
components = {self.CATEGORY: self.tagged_tokens(category_query.category, self.CATEGORY)}
if category_query.prep is not None:
components[self.NEAR] = self.tagged_tokens(category_query.prep, self.NEAR)
else:
components = {self.CATEGORY: category_query.category}
if category_query.prep is not None:
components[self.NEAR] = category_query.prep
if category_query.add_place_name or category_query.add_address:
place_formatted = self.format_address(address_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if not place_formatted:
return None
components['place'] = place_formatted
return self.render_template(self.category_template, components, tagged=tag_components)
def format_chain_query(self, chain_query, address_components, country, language, tag_components=True):
if tag_components:
components = {self.HOUSE: self.tagged_tokens(chain_query.name, self.HOUSE)}
if chain_query.prep is not None:
components[self.NEAR] = self.tagged_tokens(chain_query.prep, self.NEAR)
else:
components = {self.HOUSE: chain_query.name}
if chain_query.prep is not None:
components[self.NEAR] = chain_query.prep
if chain_query.add_place_name or chain_query.add_address:
place_formatted = self.format_address(address_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if not place_formatted:
return None
components['place'] = place_formatted
return self.render_template(self.chain_template, components, tagged=tag_components)
def format_intersection(self, intersection_query, place_components, country, language, tag_components=True):
components = {}
if tag_components:
components = {'road1': self.tagged_tokens(intersection_query.road1, self.ROAD),
'intersection': self.tagged_tokens(intersection_query.intersection_phrase, self.INTERSECTION),
'road2': self.tagged_tokens(intersection_query.road2, self.ROAD),
}
else:
components = {'road1': intersection_query.road1,
'intersection': intersection_query.intersection_phrase,
'road2': intersection_query.road2}
if place_components:
place_formatted = self.format_address(place_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if place_formatted:
components['place'] = place_formatted
return self.render_template(self.intersection_template, components, tagged=tag_components)
def format_address(self, components, country, language,
minimal_only=True, tag_components=True, replace_aliases=True):
if minimal_only and not self.minimal_components(components):
return None
template = self.get_template(country, language=language)
if not template:
return None
if not template or 'address_template' not in template:
return None
template_text = template['address_template']
template_text = self.revised_template(template_text, components, country, language=language)
if template_text is None:
return None
if tag_components:
template_text = self.tag_template_separators(template_text)
if template_text in self.parsed_cache:
template = self.parsed_cache[template_text]
else:
template = pystache.parse(template_text)
self.parsed_cache[template_text] = template
if replace_aliases:
self.aliases.replace(components)
if tag_components:
components = {k: self.tagged_tokens(v, k) for k, v in six.iteritems(components)}
text = self.render_template(template, components, tagged=tag_components)
text = self.remove_repeat_template_separators(text)
return text
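
A usage sketch based on the class docstring above; the exact rendering depends on the cloned address-formatting templates, and non-empty template lines are joined with the ' | ' splitter.

# Hypothetical usage sketch; the OpenCage templates are cloned in __init__.
formatter = AddressFormatter()
components = {'house_number': u'2', 'road': u'Calle de la Unión',
              'postcode': u'28013', 'city': u'Madrid'}
formatter.format_address(components, 'es', 'es', tag_components=False)
# e.g. u'Calle de la Unión 2 | 28013 Madrid' (illustrative only; actual output
# depends on the es template)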


@@ -0,0 +1,59 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
from geodata.configs.utils import nested_get
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Block(NumberedComponent):
max_blocks = 10
block_range = range(1, max_blocks + 1)
block_range_probs = zipfian_distribution(len(block_range), 2.0)
block_range_cdf = cdf(block_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.block_range, cls.block_range_cdf)
return safe_decode(number)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.block_range, cls.block_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, block, language, country=None):
if block is None:
return None
phrase_prob = address_config.get_property('blocks.alphanumeric_phrase_probability', language, country=country, default=0.0)
if random.random() < phrase_prob:
return cls.numeric_phrase('blocks.alphanumeric', block, language,
dictionaries=['qualifiers'], country=country)
else:
return None
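
A sketch of generating and phrasing a block identifier, assuming the example language's address config defines blocks.alphanumeric (otherwise random() returns None); 'ja' is used here only as an illustrative language code.

# Hypothetical usage sketch.
block = Block.random('ja')           # e.g. u'3', u'B' or u'3B', chosen at random
phrase = Block.phrase(block, 'ja')   # a phrased block or None, per the config probabilities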

File diff suppressed because it is too large


@@ -0,0 +1,152 @@
import copy
import os
import six
import yaml
from collections import Mapping
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
from geodata.math.sampling import cdf, check_probability_distribution
this_dir = os.path.realpath(os.path.dirname(__file__))
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'addresses')
DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'dictionaries')
class AddressConfig(object):
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
self.address_configs = {}
self.cache = {}
for filename in os.listdir(config_dir):
if not filename.endswith('.yaml'):
continue
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
countries = config.pop('countries', {})
for k in countries.keys():
country_config = countries[k]
config_copy = copy.deepcopy(config)
countries[k] = recursive_merge(config_copy, country_config)
config['countries'] = countries
lang = filename.rsplit('.yaml')[0]
self.address_configs[lang] = config
self.sample_phrases = {}
for language in address_phrase_dictionaries.languages:
for dictionary in address_phrase_dictionaries.language_dictionaries[language]:
self.sample_phrases[(language, dictionary)] = {}
for phrases in address_phrase_dictionaries.phrases[(language, dictionary)]:
self.sample_phrases[(language, dictionary)][phrases[0]] = phrases[1:]
def get_property(self, key, language, country=None, default=None):
keys = key.split('.')
config = self.address_configs.get(language, {})
if country:
country_config = config.get('countries', {}).get(country, {})
if country_config:
config = country_config
value = nested_get(config, keys)
if value is not DoesNotExist:
return value
return default
def cache_key(self, prop, language, dictionaries=(), country=None):
return (prop, language, country, tuple(dictionaries))
def alternative_probabilities(self, prop, language, dictionaries=(), country=None):
'''Get a probability distribution over alternatives'''
key = self.cache_key(prop, language, dictionaries, country=country)
if key not in self.cache:
properties = self.get_property(prop, language, country=country, default=None)
if properties is None:
return None, None
alternatives, probs = alternative_probabilities(properties)
if alternatives is None:
return None, None
forms = []
form_probs = []
for props, prob in zip(alternatives, probs):
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
forms.extend([(p, props) for p in phrases])
form_probs.extend([prob * p for p in phrase_probs])
sample_probability = properties.get('sample_probability')
if sample_probability is not None:
sample_phrases = []
for dictionary in dictionaries:
phrases = self.sample_phrases.get((language, dictionary), [])
for canonical, surface_forms in six.iteritems(phrases):
sample_phrases.append(canonical)
sample_phrases.extend(surface_forms)
# Note: use the outer properties dictionary e.g. units.alphanumeric
forms.extend([(p, properties) for p in sample_phrases])
form_probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))
try:
check_probability_distribution(form_probs)
except AssertionError:
print 'values were: {}'.format(forms)
raise
form_probs_cdf = cdf(form_probs)
self.cache[key] = (forms, form_probs_cdf)
return self.cache[key]
def form_probabilities(self, properties, language, dictionaries=()):
probs = []
alternatives = []
canonical_prob = properties.get('canonical_probability', 1.0)
canonical = properties['canonical']
alternatives.append(canonical)
probs.append(canonical_prob)
if 'abbreviated_probability' in properties:
probs.append(properties['abbreviated_probability'])
abbreviated = properties['abbreviated']
assert isinstance(abbreviated, basestring)
alternatives.append(abbreviated)
if properties.get('sample', False) and 'sample_probability' in properties:
sample_prob = properties['sample_probability']
samples = set()
for dictionary in dictionaries:
phrases = self.sample_phrases.get((language, dictionary), {})
samples |= set(phrases.get(canonical, []))
if 'sample_exclude' in properties:
samples -= set(properties['sample_exclude'])
if samples:
for phrase in samples:
probs.append(sample_prob / float(len(samples)))
alternatives.append(phrase)
else:
total = sum(probs)
probs = [p / total for p in probs]
try:
check_probability_distribution(probs)
except AssertionError:
print 'values were: {}'.format(alternatives)
raise
return alternatives, probs
address_config = AddressConfig()
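if __name__ == '__main__':
    # Minimal usage sketch: assumes an 'en.yaml' exists under resources/addresses
    # and that it (or a 'us' country override) defines levels.numbering_starts_at;
    # otherwise the default of 0 is returned.
    starts_at = address_config.get_property('levels.numbering_starts_at', 'en', country='us', default=0)
    print('levels.numbering_starts_at for en/us: {}'.format(starts_at))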

View File

@@ -0,0 +1,37 @@
import six
from geodata.addresses.config import address_config
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice
class Conjunction(object):
DEFAULT_WHITESPACE_JOIN = ', '
DEFAULT_NON_WHITESPACE_JOIN = ''
key = 'and'
@classmethod
def join(cls, phrases, language, country=None):
if not hasattr(phrases, '__iter__'):
raise ValueError('Param phrases must be iterable')
values, probs = address_config.alternative_probabilities(cls.key, language, country=country)
phrase, props = weighted_choice(values, probs)
whitespace = props.get('whitespace', True)
whitespace_phrase = six.u(' ') if whitespace else six.u('')
phrases = [safe_decode(p) for p in phrases]
max_phrase_join = props.get('max_phrase_join', 2)
if len(phrases) > max_phrase_join:
default_join = safe_decode(props.get('default_join', cls.DEFAULT_WHITESPACE_JOIN if whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN))
prefix = default_join.join(phrases[:-max_phrase_join] + [six.u('')])
else:
prefix = six.u('')
if whitespace:
phrase = six.u('{}{}{}').format(whitespace_phrase, phrase, whitespace_phrase)
joined_phrase = phrase.join(phrases[-max_phrase_join:])
return six.u('').join([prefix, joined_phrase])
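if __name__ == '__main__':
    # Minimal usage sketch: assumes the 'and' key is configured for 'en', so a
    # conjunction phrase (e.g. "and" or "&") is sampled and used to join the parts.
    print(Conjunction.join([u'Fl 2', u'Apt 3'], 'en', country='us'))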

View File

@@ -0,0 +1,19 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class ConscriptionNumber(NumberedComponent):
@classmethod
def phrase(cls, number, language, country=None):
if number is None:
return number
key = 'conscription_numbers.alphanumeric'
dictionaries = ['house_numbers']
default = safe_decode(number)
return cls.numeric_phrase(key, safe_decode(number), language,
dictionaries=dictionaries, country=country)

View File

@@ -0,0 +1,42 @@
import operator
import six
from geodata.graph.topsort import topsort
class ComponentDependencies(object):
'''
Declare an address component and its dependencies e.g.
a house_number cannot be used in the absence of a road name.
'''
component_bit_values = {}
def __init__(self, graph):
self.dependencies = {}
self.all_values = long('1' * len(graph), 2)
self.dependency_order = [c for c in topsort(graph)]
for component, deps in six.iteritems(graph):
self.dependencies[component] = self.component_bitset(deps) if deps else self.all_values
def __getitem__(self, key):
return self.dependencies.__getitem__(key)
def __contains__(self, key):
return self.dependencies.__contains__(key)
@classmethod
def get_component_bit_value(cls, name):
val = cls.component_bit_values.get(name)
if val is None:
num_values = len(cls.component_bit_values)
val = 1 << num_values
cls.component_bit_values[name] = val
return val
@classmethod
def component_bitset(cls, components):
return reduce(operator.or_, [cls.get_component_bit_value(name) for name in components])
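if __name__ == '__main__':
    # Minimal sketch with a hypothetical two-component graph: house_number depends
    # on road, road depends on nothing (assumes topsort accepts a dict mapping
    # component -> list of dependencies).
    deps = ComponentDependencies({'road': [], 'house_number': ['road']})
    print(deps.dependency_order)
    print(deps['house_number'])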

View File

@@ -0,0 +1,37 @@
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumericPhrase
from geodata.math.sampling import weighted_choice
class RelativeDirection(NumericPhrase):
key = 'directions'
dictionaries = ['unit_directions']
class AnteroposteriorDirection(RelativeDirection):
key = 'directions.anteroposterior'
class LateralDirection(RelativeDirection):
key = 'directions.lateral'
class CardinalDirection(NumericPhrase):
key = 'cardinal_directions'
dictionaries = ['cardinal_directions']
class Direction(object):
CARDINAL = 'cardinal'
RELATIVE = 'relative'
@classmethod
def random(cls, language, country=None, cardinal_probability=0.5):
values = [cls.CARDINAL, cls.RELATIVE]
probs_cdf = [cardinal_probability, 1.0]
choice = weighted_choice(values, probs_cdf)
if choice == cls.CARDINAL:
return CardinalDirection.phrase(None, language, country=country)
else:
return RelativeDirection.phrase(None, language, country=country)
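if __name__ == '__main__':
    # Minimal sketch: samples either a cardinal or a relative direction phrase,
    # assuming the 'en' config defines the cardinal_directions and directions keys.
    print(Direction.random('en', country='us'))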

View File

@@ -0,0 +1,66 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.configs.utils import nested_get
from geodata.addresses.directions import RelativeDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Entrance(NumberedComponent):
max_entrances = 10
entrance_range = range(1, max_entrances + 1)
entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
entrance_range_cdf = cdf(entrance_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
number2 = number + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, entrance, language, country=None):
if entrance is None:
return None
return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
dictionaries=['entrances'], country=country)
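if __name__ == '__main__':
    # Minimal sketch: generates a random entrance identifier and wraps it in a
    # phrase, assuming entrances.alphanumeric is configured for 'en' (both calls
    # return None when it is not).
    entrance = Entrance.random('en', country='us')
    print(Entrance.phrase(entrance, 'en', country='us'))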

View File

@@ -0,0 +1,165 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.numbers.spellout import numeric_expressions
class Floor(NumberedComponent):
# When we don't know the number of floors, use a Zipfian distribution
# to choose randomly between 1 and max_floors with 1 being much more
# likely than 2, etc.
max_floors = 10
max_basements = 2
numbered_floors = range(max_floors + 1) + range(-1, -max_basements - 1, -1)
floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
floor_probs_cdf = cdf(floor_probs)
# For use with letters e.g. A0 is probably not as common
floors_letters = range(1, max_floors + 1) + [0]
floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
floors_letters_cdf = cdf(floors_letters_probs)
@classmethod
def sample_floors(cls, num_floors, num_basements=0):
num_floors = int(num_floors)
return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)
@classmethod
def sample_floors_range(cls, min_floor, max_floor):
return random.randint(min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)
@classmethod
def random_int(cls, language, country=None, num_floors=None, num_basements=None):
number = None
if num_floors is not None:
try:
num_floors = int(num_floors)
except (ValueError, TypeError):
return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
if num_floors <= cls.max_floors:
number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
else:
number = cls.sample_floors_range(cls.max_floors + 1, num_floors)
else:
number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
return number
@classmethod
def random_from_int(cls, number, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
if num_type is None:
return None
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
if number >= 0:
number += numbering_starts_at
if num_type == cls.NUMERIC:
return safe_decode(number)
elif num_type == cls.ROMAN_NUMERAL:
roman_numeral = numeric_expressions.roman_numeral(number)
if roman_numeral is not None:
return roman_numeral
else:
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number2 = number + cls.sample_floors_range(1, cls.max_floors)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet)
if num_type == cls.ALPHA:
return letter
else:
number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}').format(letter, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}').format(number, letter)
return None
@classmethod
def random(cls, language, country=None, num_floors=None, num_basements=None):
number = cls.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
return cls.random_from_int(number, language, country=country)
@classmethod
def phrase(cls, floor, language, country=None, num_floors=None):
if floor is None:
return None
integer_floor = False
floor = safe_decode(floor)
try:
floor = int(floor)
integer_floor = True
except (ValueError, TypeError):
try:
floor = float(floor)
integer_floor = int(floor) == floor
except (ValueError, TypeError):
return cls.numeric_phrase('levels.alphanumeric', floor, language,
dictionaries=['level_types_numbered'], country=country)
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
try:
num_floors = int(num_floors)
top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
is_top = num_floors and floor == top_floor
except (ValueError, TypeError):
is_top = False
alias_prefix = 'levels.aliases'
aliases = address_config.get_property(alias_prefix, language, country=country)
if aliases:
alias = None
if not integer_floor and floor >= 0 and 'half_floors' in aliases:
floor = int(floor)
alias = 'half_floors'
elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
floor = int(floor)
alias = 'half_floors_negative'
elif floor < -1 and '<-1' in aliases:
alias = '<-1'
elif is_top and 'top' in aliases:
alias = 'top'
elif safe_decode(floor) in aliases:
alias = safe_decode(floor)
floor = safe_decode(floor)
if alias:
alias_props = aliases.get(alias)
# Aliases upon aliases, e.g. for something like "Upper Mezzanine"
# where it's an alias for "1" under the half_floors key
if safe_decode(floor) in alias_props.get('aliases', {}):
alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
alias = safe_decode(floor)
if alias:
return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
dictionaries=['level_types_basement',
'level_types_mezzanine',
'level_types_numbered',
'level_types_standalone',
'level_types_sub_basement'],
country=country)
return cls.numeric_phrase('levels.alphanumeric', floor, language,
dictionaries=['level_types_numbered'], country=country)
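if __name__ == '__main__':
    # Minimal sketch: picks a floor for a hypothetical 5-storey building and
    # renders it, assuming levels.alphanumeric is configured for 'en' (returns
    # None otherwise).
    floor = Floor.random('en', country='us', num_floors=5)
    print(Floor.phrase(floor, 'en', country='us', num_floors=5))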

View File

@@ -0,0 +1,26 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class HouseNumber(NumberedComponent):
@classmethod
def phrase(cls, number, language, country=None):
if number is not None:
prob_key = 'house_numbers.alphanumeric_phrase_probability'
key = 'house_numbers.alphanumeric'
dictionaries = ['house_numbers', 'number']
default = safe_decode(number)
else:
prob_key = 'house_numbers.no_number_probability'
key = 'house_numbers.no_number'
dictionaries = ['no_number']
default = None
phrase_prob = address_config.get_property(prob_key, language, country=country, default=0.0)
if random.random() < phrase_prob:
return cls.numeric_phrase(key, safe_decode(number), language,
dictionaries=dictionaries, country=country)
return default
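if __name__ == '__main__':
    # Minimal sketch: with house_numbers.alphanumeric_phrase_probability > 0 in
    # the 'en' config this can yield a phrase like "No 123"; with the default of
    # 0.0 it falls back to the plain number.
    print(HouseNumber.phrase(123, 'en', country='us'))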

View File

@@ -0,0 +1,24 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumericPhrase
from geodata.encoding import safe_decode
class MetroStationPhrase(NumericPhrase):
key = 'metro_stations.alphanumeric'
dictionaries = ['qualifiers']
class MetroStation(object):
@classmethod
def phrase(cls, station, language, country=None):
if station is None:
return None
phrase_prob = address_config.get_property('metro_stations.alphanumeric_phrase_probability', language, country=country, default=0.0)
if random.random() < phrase_prob:
return MetroStationPhrase.phrase(station, language, country=country)
return None

View File

@@ -0,0 +1,434 @@
# -*- coding: utf-8 -*-
import random
import six
from geodata.addresses.config import address_config
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.math.floats import isclose
from geodata.numbers.ordinals import ordinal_expressions
from geodata.numbers.spellout import numeric_expressions
from geodata.text.tokenize import tokenize, token_types
alphabets = {}
def sample_alphabet(alphabet, b=1.5):
'''
Sample an "alphabet" using a Zipfian distribution (frequent items are very
frequent, long tail of infrequent items). If we look at something like
unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
"Unit Z" simply because most dwellings only have a few units. Sampling
letters from a Zipfian distribution rather than uniformly means that instead
of every letter having the same likelihood (1/26), letters toward the beginning
of the alphabet are much more likely to be selected. Letters toward the end can
still be selected sometimes, but are not very likely.
Note letters don't necessarily need to be sorted alphabetically, just in order
of frequency.
'''
global alphabets
alphabet = tuple(alphabet)
if alphabet not in alphabets:
probs = zipfian_distribution(len(alphabet), b)
probs_cdf = cdf(probs)
alphabets[alphabet] = probs_cdf
probs_cdf = alphabets[alphabet]
return weighted_choice(alphabet, probs_cdf)
latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
class Digits(object):
ASCII = 'ascii'
SPELLOUT = 'spellout'
UNICODE_FULL_WIDTH = 'unicode_full_width'
ROMAN_NUMERAL = 'roman_numeral'
CARDINAL = 'cardinal'
ORDINAL = 'ordinal'
unicode_full_width_map = {
'0': safe_decode('０'),
'1': safe_decode('１'),
'2': safe_decode('２'),
'3': safe_decode('３'),
'4': safe_decode('４'),
'5': safe_decode('５'),
'6': safe_decode('６'),
'7': safe_decode('７'),
'8': safe_decode('８'),
'9': safe_decode('９'),
}
full_width_digit_map = {
v: k for k, v in six.iteritems(unicode_full_width_map)
}
@classmethod
def rewrite_full_width(cls, s):
return six.u('').join([cls.unicode_full_width_map.get(c, c) for c in s])
@classmethod
def rewrite_standard_width(cls, s):
return six.u('').join([cls.full_width_digit_map.get(c, c) for c in s])
@classmethod
def rewrite_roman_numeral(cls, s):
roman_numeral = None
if s.isdigit():
roman_numeral = numeric_expressions.roman_numeral(s)
if roman_numeral:
return roman_numeral
else:
return s
@classmethod
def rewrite_spellout(cls, s, lang, num_type, props):
if s.isdigit():
num = int(s)
spellout = None
gender = props.get('gender')
category = props.get('category')
if num_type == cls.CARDINAL:
spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
elif num_type == cls.ORDINAL:
spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)
if spellout:
return spellout.title()
return s
else:
return s
@classmethod
def rewrite(cls, d, lang, props, num_type=CARDINAL):
if not props:
return d
d = safe_decode(d)
values = []
probs = []
for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
key = '{}_probability'.format(digit_type)
if key in props:
values.append(digit_type)
probs.append(props[key])
if not isclose(sum(probs), 1.0):
values.append(cls.ASCII)
probs.append(1.0 - sum(probs))
probs = cdf(probs)
digit_type = weighted_choice(values, probs)
if digit_type == cls.ASCII:
return d
elif digit_type == cls.SPELLOUT:
return cls.rewrite_spellout(d, lang, num_type, props)
elif digit_type == cls.ROMAN_NUMERAL:
roman_numeral = cls.rewrite_roman_numeral(d)
if random.random() < props.get('ordinal_suffix_probability', 0.0):
ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
if ordinal_suffix:
roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
return roman_numeral
elif digit_type == cls.UNICODE_FULL_WIDTH:
return cls.rewrite_full_width(d)
else:
return d
class NumericPhrase(object):
key = None
NUMERIC = 'numeric'
NUMERIC_AFFIX = 'numeric_affix'
@classmethod
def pick_phrase_and_type(cls, number, language, country=None):
values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
if not values:
return None, safe_decode(number) if number is not None else None, None
phrase, phrase_props = weighted_choice(values, probs)
values = []
probs = []
for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
key = '{}_probability'.format(num_type)
prob = phrase_props.get(key, None)
if prob is not None:
values.append(num_type)
probs.append(prob)
if not probs:
num_type = cls.NUMERIC
else:
probs = cdf(probs)
num_type = weighted_choice(values, probs)
return num_type, phrase, phrase_props[num_type]
@classmethod
def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
if num_type == cls.NUMERIC_AFFIX:
phrase = props['affix']
if 'zero_pad' in props and number.isdigit():
number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
direction = props['direction']
whitespace = props.get('whitespace', whitespace_default)
whitespace_probability = props.get('whitespace_probability')
if whitespace_probability is not None:
whitespace = random.random() < whitespace_probability
if props.get('title_case', True):
# Title case unless the config specifies otherwise
phrase = phrase.title()
if number is None:
return phrase
whitespace_phrase = six.u(' ') if whitespace else six.u('')
# Phrase goes to the left of the number
if direction == 'left':
return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
# Phrase goes to the right of the number
elif direction == 'right':
return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
# Need to specify a direction, otherwise return naked number
else:
return safe_decode(number)
@classmethod
def phrase(cls, number, language, country=None):
num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
whitespace_default = num_type == cls.NUMERIC
return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
class Number(NumericPhrase):
key = 'numbers'
dictionaries = ['number']
class NumberedComponent(object):
NUMERIC = 'numeric'
ALPHA = 'alpha'
ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
HYPHENATED_NUMBER = 'hyphenated_number'
ROMAN_NUMERAL = 'roman_numeral'
@classmethod
def choose_alphanumeric_type(cls, key, language, country=None):
alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
if alphanumeric_props is None:
return None, None
values = []
probs = []
for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER, cls.ROMAN_NUMERAL):
key = '{}_probability'.format(num_type)
prob = alphanumeric_props.get(key)
if prob is not None:
values.append(num_type)
probs.append(prob)
if not values:
return None, None
probs = cdf(probs)
num_type = weighted_choice(values, probs)
num_type_props = alphanumeric_props.get(num_type, {})
return num_type, num_type_props
@classmethod
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
has_alpha = False
has_numeric = True
is_integer = False
is_none = False
if num is not None:
try:
num_int = int(num)
is_integer = True
except ValueError:
try:
num_float = float(num)
except ValueError:
tokens = tokenize(safe_decode(num))
has_numeric = False
for t, c in tokens:
if c == token_types.NUMERIC:
has_numeric = True
if any((ch.isalpha() for ch in t)):
has_alpha = True
if strict_numeric and has_alpha:
return safe_decode(num)
else:
is_none = True
values, probs = None, None
if is_alpha:
values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)
# Pick a phrase given the probability distribution from the config
if values is None:
values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
if not values:
return safe_decode(num) if not is_none else None
phrase, phrase_props = weighted_choice(values, probs)
values = []
probs = []
# Dictionaries are lowercased, so title case here
if phrase_props.get('title_case', True):
phrase = phrase.title()
'''
There are a few ways we can express the number itself
1. Alias it as some standalone word like basement (for floor "-1")
2. Use the number itself, so "Floor 2"
3. Append/prepend an affix e.g. 2/F for second floor
4. As an ordinal expression e.g. "2nd Floor"
'''
have_standalone = False
have_null = False
for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
key = '{}_probability'.format(num_type)
prob = phrase_props.get(key)
if prob is not None:
if num_type == 'standalone':
have_standalone = True
elif num_type == 'null':
have_null = True
values.append(num_type)
probs.append(prob)
elif num_type in phrase_props:
values.append(num_type)
probs.append(1.0)
break
if not probs or is_none:
return phrase
# If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
if has_alpha:
values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')])
total = float(sum(probs))
if isclose(total, 0.0):
return None
probs = [p / total for p in probs]
probs = cdf(probs)
if len(values) < 2:
if have_standalone:
num_type = 'standalone'
elif have_null:
num_type = 'null'
else:
num_type = 'numeric'
else:
num_type = weighted_choice(values, probs)
if num_type == 'standalone':
return phrase
elif num_type == 'null':
return safe_decode(num)
props = phrase_props[num_type]
if is_integer:
num_int = int(num)
if phrase_props.get('number_abs_value', False):
num_int = abs(num_int)
num = num_int
if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
return None
if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
return None
if phrase_props.get('number_subtract_abs_value'):
num_int -= phrase_props['number_subtract_abs_value']
num = num_int
num = safe_decode(num)
digits_props = props.get('digits')
if digits_props:
# Inherit the gender and category e.g. for ordinals
for k in ('gender', 'category'):
if k in props:
digits_props[k] = props[k]
num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)
# Do we add the numeric phrase e.g. Floor No 1
add_number_phrase = props.get('add_number_phrase', False)
if add_number_phrase and random.random() < props['add_number_phrase_probability']:
num = Number.phrase(num, language, country=country)
whitespace_default = True
if num_type == 'numeric_affix':
phrase = props['affix']
if props.get('upper_case', True):
phrase = phrase.upper()
if 'zero_pad' in props and num.isdigit():
num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
whitespace_default = False
elif num_type == 'ordinal' and safe_decode(num).isdigit():
ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))
if ordinal_expression is not None:
num = ordinal_expression
if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
if random.random() < props['null_phrase_probability']:
return num
direction = props['direction']
whitespace = props.get('whitespace', whitespace_default)
whitespace_probability = props.get('whitespace_probability')
if whitespace_probability is not None:
whitespace = random.random() < whitespace_probability
# Occasionally switch up if direction_probability is specified
if random.random() > props.get('direction_probability', 1.0):
if direction == 'left':
direction = 'right'
elif direction == 'right':
direction = 'left'
whitespace_phrase = six.u(' ') if whitespace else six.u('')
# Phrase goes to the left of the number
if direction == 'left':
return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
# Phrase goes to the right of the number
elif direction == 'right':
return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
# Need to specify a direction, otherwise return naked number
else:
return safe_decode(num)
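if __name__ == '__main__':
    # Minimal sketch of Digits.rewrite: with spellout_probability=1.0 the digit
    # "3" may be rewritten as "Three" for English, assuming spellout rules exist
    # for 'en' (the original string is returned when they do not).
    print(Digits.rewrite(u'3', 'en', {'spellout_probability': 1.0}, num_type=Digits.CARDINAL))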

View File

@@ -0,0 +1,76 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import cdf, weighted_choice
class POBox(NumberedComponent):
@classmethod
def random_digits(cls, num_digits):
# Note: PO Boxes can have leading zeros, but that's not important for the parser
# since it only cares about how many digits there are in a number
low = 10 ** (num_digits - 1)
high = (10 ** num_digits) - 1
return random.randint(low, high)
@classmethod
def random_digits_with_prefix(cls, num_digits, prefix=six.u('')):
return six.u('').join([prefix, safe_decode(cls.random_digits(num_digits))])
@classmethod
def random_digits_with_suffix(cls, num_digits, suffix=six.u('')):
return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix])
@classmethod
def random_letter(cls, language, country=None):
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
return sample_alphabet(alphabet)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type != cls.ALPHA:
digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
values = []
probs = []
for val in digit_config:
values.append(val['length'])
probs.append(val['probability'])
probs = cdf(probs)
num_digits = weighted_choice(values, probs)
digits = cls.random_digits(num_digits)
number = Digits.rewrite(digits, language, num_type_props)
if num_type == cls.NUMERIC:
return safe_decode(number)
else:
letter = cls.random_letter(language, country=country)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
else:
return cls.random_letter(language, country=country)
@classmethod
def phrase(cls, box_number, language, country=None):
if box_number is None:
return None
return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language,
dictionaries=['post_office'], country=country)
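if __name__ == '__main__':
    # Minimal sketch: samples a PO box number and renders a phrase such as
    # "PO Box 1234", assuming po_boxes.alphanumeric and po_boxes.digits are
    # configured for 'en' (None is returned otherwise).
    box = POBox.random('en', country='us')
    print(POBox.phrase(box, 'en', country='us'))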

View File

@@ -0,0 +1,11 @@
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class PostCode(NumberedComponent):
@classmethod
def phrase(cls, postcode, language, country=None):
if postcode is None:
return None
return cls.numeric_phrase('postcodes.alphanumeric', postcode, language,
dictionaries=['postcodes'], country=country)

View File

@@ -0,0 +1,66 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.configs.utils import nested_get
from geodata.addresses.directions import RelativeDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Staircase(NumberedComponent):
max_staircases = 10
staircase_range = range(1, max_staircases + 1)
staircase_range_probs = zipfian_distribution(len(staircase_range), 2.0)
staircase_range_cdf = cdf(staircase_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
number2 = number + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, staircase, language, country=None):
if staircase is None:
return None
return cls.numeric_phrase('staircases.alphanumeric', staircase, language,
dictionaries=['staircases'], country=country)
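if __name__ == '__main__':
    # Minimal sketch: samples a staircase identifier and renders a phrase such as
    # "Staircase 2", assuming staircases.alphanumeric is configured for 'en'
    # (None is returned otherwise).
    staircase = Staircase.random('en', country='us')
    print(Staircase.phrase(staircase, 'en', country='us'))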

View File

@@ -0,0 +1,285 @@
import itertools
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.directions import RelativeDirection, LateralDirection, AnteroposteriorDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.configs.utils import nested_get
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.text.utils import is_numeric_strict
class Unit(NumberedComponent):
# When we don't know the number of units, use a Zipfian distribution
# to choose randomly between 1 and max_units with 1 being much more
# likely than 2, etc.
max_units = 99
max_basements = 2
hundreds_numbered_units_tens = [range(101, 110) + [100],
range(201, 210) + [200],
range(301, 310) + [300],
range(401, 410) + [400],
range(501, 510) + [500],
]
hundreds_numbered_units = [range(110, 200),
range(210, 300),
range(310, 400),
range(410, 500),
range(510, 600),
]
thousands_numbered_units = [range(1001, 1030) + [1000],
range(2001, 2030) + [2000],
range(3001, 3030) + [3000],
range(4001, 4030) + [4000],
range(5001, 5030) + [5000]
]
numbered_units = range(1, 10)
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units_tens)))
numbered_units.extend(range(10, 100))
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units)))
numbered_units.extend(itertools.chain(*itertools.izip(*thousands_numbered_units)))
numbered_units.extend(range(10001, 10100) + [10000])
numbered_units.append(0)
numbered_units.extend(range(0, -max_basements - 1, -1))
unit_probs = zipfian_distribution(len(numbered_units), 0.7)
unit_probs_cdf = cdf(unit_probs)
num_digits = [2, 3, 4]
num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
num_digits_cdf = cdf(num_digits_probs)
# For use with floors e.g. #301 more common than #389
positive_units_floors = range(1, 10) + [0] + range(10, max_units + 1)
positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
positive_units_floors_cdf = cdf(positive_units_floors_probs)
# For basic positive units
positive_units = range(1, max_units + 1)
positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
positive_units_cdf = cdf(positive_units_probs)
# For use with letters e.g. A0 less common
positive_units_letters = range(1, max_units + 1) + [0]
positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
positive_units_letters_cdf = cdf(positive_units_letters_probs)
RESIDENTIAL = 'residential'
COMMERCIAL = 'commercial'
INDUSTRIAL = 'industrial'
UNIVERSITY = 'university'
@classmethod
def sample_num_digits(cls):
return weighted_choice(cls.num_digits, cls.num_digits_cdf)
@classmethod
def for_floor(cls, floor_number, num_digits=None):
num_digits = num_digits if num_digits is not None else cls.sample_num_digits()
unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))
@classmethod
def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
if num_type is None:
return None
use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)
use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)
if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
if random.random() >= use_positive_numbers_prob:
number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
else:
number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
else:
if floor is None or not floor.isdigit():
floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)
if ground_floor_starts_at is not None:
try:
floor = int(floor)
if floor >= floor_numbering_starts_at:
floor -= floor_numbering_starts_at
floor += ground_floor_starts_at
floor = safe_decode(floor)
except (TypeError, ValueError):
pass
use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
if use_floor_affix_prob and random.random() < use_floor_affix_prob:
floor_phrase = Floor.phrase(floor, language, country=country)
# Only proceed if the floor phrase is strictly numeric, e.g. "1" but not "H1"
if is_numeric_strict(floor_phrase):
unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)
unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
if unit_num_digits is not None:
unit = safe_decode(unit).zfill(unit_num_digits)
return six.u('{}{}').format(floor_phrase, unit)
floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
if floor_num_digits is not None and floor.isdigit():
floor = floor.zfill(floor_num_digits)
number = cls.for_floor(floor)
if num_type == cls.NUMERIC:
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))
if random.random() < direction_prob:
direction = 'left' if direction == 'right' else 'right'
direction_right = direction == 'right'
if random.random() < range_prob:
if direction_right:
number2 += number
else:
number2 = max(0, number - number2)
if direction == 'right':
return u'{}-{}'.format(number, number2)
else:
return u'{}-{}'.format(number2, number)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
if num_floors is None:
number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def add_direction(cls, key, unit, language, country=None):
add_direction_probability = address_config.get_property('{}.add_direction_probability'.format(key),
language, country=country, default=0.0)
if not random.random() < add_direction_probability:
return unit
add_direction_numeric = address_config.get_property('{}.add_direction_numeric'.format(key),
language, country=country)
try:
unit = int(unit)
integer_unit = True
except (ValueError, TypeError):
integer_unit = False
if add_direction_numeric and integer_unit:
return RelativeDirection.phrase(unit, language, country=country)
elif not integer_unit:
add_direction_standalone = address_config.get_property('{}.add_direction_standalone'.format(key),
language, country=country)
if add_direction_standalone:
return RelativeDirection.phrase(None, language, country=country)
@classmethod
def add_quadrant(cls, key, unit, language, country=None):
add_quadrant_probability = address_config.get_property('{}.add_quadrant_probability'.format(key),
language, country=country, default=0.0)
if not random.random() < add_quadrant_probability:
return unit
add_quadrant_numeric = address_config.get_property('{}.add_quadrant_numeric'.format(key),
language, country=country)
try:
unit = int(unit)
integer_unit = True
except (ValueError, TypeError):
integer_unit = False
first_direction = address_config.get_property('{}.add_quadrant_first_direction'.format(key),
language, country=country)
if first_direction == 'lateral':
ordering = (LateralDirection, AnteroposteriorDirection)
elif first_direction == 'anteroposterior':
ordering = (AnteroposteriorDirection, LateralDirection)
else:
return unit
if not integer_unit:
add_quadrant_standalone = address_config.get_property('{}.add_quadrant_standalone'.format(key),
language, country=country)
if add_quadrant_standalone:
unit = None
else:
return None
last_num_type = None
for i, c in enumerate(ordering):
num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
unit = c.combine_with_number(unit, phrase, num_type, props, whitespace_default=whitespace_default)
last_num_type = num_type
return unit
@classmethod
def phrase(cls, unit, language, country=None, zone=None):
if unit is not None:
key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
if not address_config.get_property(key, language, country=country):
return None
is_alpha = safe_decode(unit).isalpha()
direction_unit = None
add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
if add_direction:
direction_unit = cls.add_direction(key, unit, language, country=country)
if direction_unit and direction_unit != unit:
unit = direction_unit
is_alpha = False
else:
add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
if add_quadrant:
unit = cls.add_quadrant(key, unit, language, country=country)
is_alpha = False
return cls.numeric_phrase(key, safe_decode(unit), language,
dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
else:
key = 'units.standalone'
values, probs = address_config.alternative_probabilities(key, language,
dictionaries=['unit_types_standalone'],
country=country)
if values is None:
return None
phrase, phrase_props = weighted_choice(values, probs)
return phrase.title()
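if __name__ == '__main__':
    # Minimal sketch: samples a unit for a hypothetical 10-storey building and
    # renders it, e.g. "Apt 302", assuming units.alphanumeric is configured for
    # 'en' (both calls return None when it is not).
    unit = Unit.random('en', country='us', num_floors=10)
    print(Unit.phrase(unit, 'en', country='us'))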

View File

View File

@@ -0,0 +1,167 @@
import os
import random
import re
import six
import yaml
from collections import defaultdict
from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
from geodata.encoding import safe_decode
from geodata.math.floats import isclose
from geodata.math.sampling import cdf, weighted_choice
from geodata.encoding import safe_encode
this_dir = os.path.realpath(os.path.dirname(__file__))
BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'names')
BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
class BoundaryNames(object):
DEFAULT_NAME_KEY = 'name'
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
config = yaml.load(open(config_file))
default_names = nested_get(config, ('names', 'keys'))
name_keys, probs = alternative_probabilities(default_names)
self.name_keys = name_keys
self.name_key_probs = cdf(probs)
self.component_name_keys = {}
for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
component_names = component_config.get('keys')
component_name_keys, component_probs = alternative_probabilities(component_names)
self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
self.country_regex_replacements = defaultdict(list)
for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
country = props.get('country')
re_flags = re.I | re.UNICODE
if not props.get('case_insensitive', True):
re_flags ^= re.I
pattern = re.compile(props['pattern'], re_flags)
replace_group = props['replace_with_group']
replace_probability = props['replace_probability']
self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
self.country_regex_replacements = dict(self.country_regex_replacements)
self.prefixes = {}
self.prefix_regexes = {}
self.suffixes = {}
self.suffix_regexes = {}
for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={}) ):
for component, affixes in six.iteritems(components):
affix_values, probs = alternative_probabilities(affixes)
for val in affix_values:
if 'prefix' not in val:
raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))
prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
self.prefix_regexes[(language, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
if not isclose(sum(probs), 1.0):
affix_values.append(None)
probs.append(1.0 - sum(probs))
affix_probs_cdf = cdf(probs)
self.prefixes[(language, component)] = affix_values, affix_probs_cdf
for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={}) ):
for component, affixes in six.iteritems(components):
affix_values, probs = alternative_probabilities(affixes)
for val in affix_values:
if 'suffix' not in val:
raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))
suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
if not isclose(sum(probs), 1.0):
affix_values.append(None)
probs.append(1.0 - sum(probs))
affix_probs_cdf = cdf(probs)
self.suffixes[(language, component)] = affix_values, affix_probs_cdf
self.exceptions = {}
for props in nested_get(config, ('names', 'exceptions'), default=[]):
object_type = props['type']
object_id = safe_encode(props['id'])
keys = [props['default']]
probs = [props['probability']]
for alt in props.get('alternatives', []):
keys.append(alt['alternative'])
probs.append(alt['probability'])
probs = cdf(probs)
self.exceptions[(object_type, object_id)] = (keys, probs)
def _string_as_regex(self, s):
return safe_decode(s).replace(six.u('.'), six.u('\\.'))
def valid_name(self, object_type, object_id, name):
exceptions, probs = self.exceptions.get((object_type, object_id), ((), ()))
return not exceptions or name in exceptions
def name_key_dist(self, props, component):
object_type = props.get('type')
object_id = safe_encode(props.get('id', ''))
if (object_type, object_id) in self.exceptions:
values, probs = self.exceptions[(object_type, object_id)]
return values, probs
name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
return name_keys, probs
def name_key(self, props, component):
name_keys, probs = self.name_key_dist(props, component)
return weighted_choice(name_keys, probs)
def name(self, country, language, component, name):
all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
if not all_replacements and not prefixes and not suffixes:
return name
for regex, group, prob in all_replacements:
match = regex.match(name)
if match and random.random() < prob:
name = match.group(group)
for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0),
(suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
if affixes is not None:
regex = regexes[language, component]
if regex.match(name):
continue
affix = weighted_choice(affixes, affix_probs)
if affix is not None:
whitespace = affix.get('whitespace', True)
space_val = six.u(' ') if whitespace else six.u('')
affix = affix[key]
if direction == 0:
return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
else:
return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
return name
boundary_names = BoundaryNames()
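if __name__ == '__main__':
    # Minimal sketch: applies any configured regex replacements and prefix/suffix
    # sampling to a boundary name; with no rules for 'en' city names the input is
    # returned unchanged. The country/language/component values are illustrative.
    print(boundary_names.name('us', 'en', 'city', u'New York'))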

View File

View File

@@ -0,0 +1,72 @@
import csv
import os
import six
import random
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode
CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'categories')
class CategoryConfig(object):
def __init__(self, base_dir=CATEGORIES_DIR):
self.language_categories_singular = {}
self.language_categories_plural = {}
self.language_property_names = defaultdict(set)
if not os.path.exists(base_dir):
raise RuntimeError('{} does not exist'.format(base_dir))
for filename in os.listdir(base_dir):
if not filename.endswith('.tsv'):
continue
lang = filename.rsplit('.tsv')[0]
base_lang = lang.split('_')[0]
singular_rules = self.language_categories_singular.get(base_lang, defaultdict(list))
plural_rules = self.language_categories_plural.get(base_lang, defaultdict(list))
reader = csv.reader(open(os.path.join(CATEGORIES_DIR, filename)), delimiter='\t')
reader.next() # headers
for key, value, is_plural, phrase in reader:
self.language_property_names[lang].add(key)
is_plural = bool(int(is_plural))
if is_plural:
plural_rules[(key, value)].append(phrase)
else:
singular_rules[(key, value)].append(phrase)
self.language_categories_singular[base_lang] = singular_rules
self.language_categories_plural[base_lang] = plural_rules
self.language_categories_singular = {key: dict(value) for key, value
in six.iteritems(self.language_categories_singular)}
self.language_categories_plural = {key: dict(value) for key, value
in six.iteritems(self.language_categories_plural)}
def has_keys(self, language, keys):
prop_names = self.language_property_names.get(language, set())
return [k for k in keys if k in prop_names]
def get_phrase(self, language, key, value, is_plural=False):
config = self.language_categories_singular if not is_plural else self.language_categories_plural
if language not in config:
return None
language_config = config[language]
choices = language_config.get((key, value))
if not choices:
return None
return random.choice(choices)
category_config = CategoryConfig()
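if __name__ == '__main__':
    # Minimal sketch: looks up a plural phrase for amenity=restaurant, assuming an
    # en.tsv file exists under resources/categories (None is returned otherwise).
    print(category_config.get_phrase('en', 'amenity', 'restaurant', is_plural=True))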

View File

@@ -0,0 +1,31 @@
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.math.sampling import weighted_choice, cdf
class CategoryPreposition(object):
NEAR = 'near'
NEARBY = 'nearby'
NEAR_ME = 'near_me'
IN = 'in'
NULL = 'null'
@classmethod
def random(cls, language, country=None):
category_props = address_config.get_property('categories', language, country=country)
if category_props is None:
return None
values = []
probs = []
for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
k = '{}_probability'.format(prep_phrase_type)
prob = category_props.get(k, None)
if prob is not None:
values.append(prep_phrase_type)
probs.append(prob)
probs = cdf(probs)
return weighted_choice(values, probs)

View File

@@ -0,0 +1,38 @@
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice
CategoryQuery = namedtuple('CategoryQuery', 'category, prep, add_place_name, add_address')
NULL_CATEGORY_QUERY = CategoryQuery(None, None, False, False)
class Category(object):
@classmethod
def phrase(cls, language, key, value, is_plural=False, country=None):
category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
if not category_phrase:
return NULL_CATEGORY_QUERY
category_phrase = safe_decode(category_phrase)
prep_phrase_type = CategoryPreposition.random(language, country=country)
if prep_phrase_type in (None, CategoryPreposition.NULL):
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
if not values:
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
prep_phrase = safe_decode(prep_phrase)
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
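if __name__ == '__main__':
    # Minimal sketch: builds a category query for amenity=restaurant, e.g.
    # ("restaurants", "in", ...), assuming the 'en' category and preposition
    # configs exist; otherwise NULL_CATEGORY_QUERY is returned.
    print(Category.phrase('en', 'amenity', 'restaurant', is_plural=True, country='us'))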

View File

@@ -0,0 +1,125 @@
'''
scrape_nominatim_special_phrases.py
-----------------------------------
Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
for category-related phrases sometimes found in geocoder input.
Populates a per-language TSV with (phrase, OSM key, OSM value, plural):
OSM keys/values are like:
amenity=restaurant
tourism=museum
shop=books
Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
'''
import csv
import os
import re
import requests
import six
import sys
import time
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode, safe_encode
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'categories')
# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
IGNORE_LANGUAGES = {
# Interlingua
'ia'
}
IGNORE_PLURAL_LANGUAGES = {
# For Japanese, seems to just put an s on the end, which doesn't seem right
# Need input from a native speaker on that one
'ja',
}
# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0
def scrape_nominatim_category_page(url, ignore_plurals=False):
result = requests.get(url)
if not result or not result.content:
return
for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
if operator and operator != '-':
continue
is_plural = plural == 'Y'
if is_plural and ignore_plurals:
continue
yield safe_decode(phrase).lower(), key, value, is_plural
def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
print('Fetching main page')
result = requests.get(url)
languages = {}
if not result or not result.content:
return languages
time.sleep(POLITENESS_DELAY)
for entity, anchor_text in wiki_link_re.findall(result.content):
if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
continue
lang = entity.rstrip('/').rsplit('/')[-1].lower()
if lang in IGNORE_LANGUAGES:
continue
link = WIKI_BASE_URL + entity.replace(' ', '_')
ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES
print('Doing {}'.format(lang))
phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
time.sleep(POLITENESS_DELAY)
if not phrases:
continue
languages[lang] = phrases
return languages
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
languages = scrape_all_nominatim_category_pages(url=url)
for lang, phrases in six.iteritems(languages):
filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
with open(filename, 'w') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerow(('key', 'value', 'is_plural', 'phrase'))
for phrase, key, value, is_plural in phrases:
writer.writerow((safe_encode(key), safe_encode(value),
str(int(is_plural)), safe_encode(phrase)))
print('Done')
if __name__ == '__main__':
main()

View File

View File

@@ -0,0 +1,23 @@
if [ "$#" -ge 1 ]; then
DATA_DIR=$1
else
DATA_DIR=$(pwd)
fi
ORIG_DIR=$(pwd)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python $SCRIPT_DIR/chains_tsv.py $DATA_DIR/planet-venues.osm $DATA_DIR/chains.tsv
cd $DATA_DIR
split -d -C524200 chains.tsv chains.split.
for filename in chains.split.*; do
extension="${filename##*.0}"
name="${filename%%.*}"
echo -e "name_lower\tname\tcanonical\tknown_chain\tcount" | cat - $filename > /tmp/out
mv /tmp/out $name.$extension.tsv
rm $filename
done
cd $ORIG_DIR

View File

@@ -0,0 +1,78 @@
import csv
import os
import glob
import six
import sys
from collections import defaultdict
from collections import Counter
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_expansions.address_dictionaries import ADDRESS_EXPANSIONS_DIR
from geodata.osm.extract import *
from geodata.encoding import safe_encode, safe_decode
class VenueNames(object):
def __init__(self, venues_filename):
self.venues_filename = venues_filename
self.all_chains = set()
self.chain_canonical = {}
for filename in glob.glob(os.path.join(ADDRESS_EXPANSIONS_DIR, '**', 'chains.txt')):
f = open(filename)
for line in f:
line = line.rstrip()
phrases = safe_decode(line).split(six.u('|'))
self.all_chains |= set(phrases)
canonical = phrases[0]
for p in phrases[1:]:
self.chain_canonical[p] = canonical
self.names = Counter()
self.names_lower = Counter()
self.names_cap = defaultdict(Counter)
def count(self):
i = 0
for node_id, value, deps in parse_osm(self.venues_filename):
name = value.get('name')
if not name:
continue
self.names[name] += 1
self.names_lower[name.lower()] += 1
self.names_cap[name.lower()][name] += 1
if i % 1000 == 0 and i > 0:
print('did {}'.format(i))
i += 1
def write_to_tsv(self, out_filename, min_threshold=5):
writer = csv.writer(open(out_filename, 'w'), delimiter='\t')
for k, v in self.names_lower.most_common():
if v < min_threshold:
break
canonical = self.chain_canonical.get(k)
if canonical:
canonical = self.names_cap[canonical].most_common(1)[0][0]
else:
canonical = ''
most_common_cap = self.names_cap[k].most_common(1)[0][0]
writer.writerow((safe_encode(k),
safe_encode(most_common_cap),
safe_encode(canonical),
safe_encode(1) if k in self.all_chains else '',
safe_encode(v)))
if __name__ == '__main__':
if len(sys.argv) < 3:
print('Usage: python chains_tsv.py infile outfile')
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
names = VenueNames(input_file)
names.count()
names.write_to_tsv(output_file)

View File

@@ -0,0 +1,100 @@
import random
import six
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.address_expansions.gazetteers import chains_gazetteer
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.math.sampling import weighted_choice, cdf
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize, token_types
from geodata.encoding import safe_decode
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')
NULL_CHAIN_QUERY = ChainQuery(None, None, False, False)
class Chain(object):
@classmethod
def tokenize_name(cls, name):
if not name:
return []
tokens = normalized_tokens(name)
return tokens
@classmethod
def possible_chain(cls, name):
'''
Determines if a venue name contains the name of a known chain store.
Returns a tuple of:
(True/False, known chain phrases, other tokens)
Handles cases like "Hard Rock Cafe Times Square" and allows for downstream
decision making (i.e. if the tokens have a low IDF in the local area we might
want to consider it a chain).
'''
tokens = cls.tokenize_name(name)
if not tokens:
return False, [], []
matches = chains_gazetteer.filter(tokens)
other_tokens = []
phrases = []
for t, c, l, d in matches:
if c == token_types.PHRASE:
phrases.append((t, c, l, d))
else:
other_tokens.append((t, c))
return len(phrases) > 0, phrases, other_tokens if len(phrases) > 0 else []
@classmethod
def extract(cls, name):
'''
Determines if an entire venue name matches a known chain store.
Note: to avoid false positives, only return True if all of the tokens
in the venue's name are part of a single chain store phrase. This will
miss a few things like "Hard Rock Cafe Times Square" and the like.
It will however handle compound chain stores like Subway/Taco Bell
'''
possible, phrases, other_tokens = cls.possible_chain(name)
is_chain = possible and not any((c in token_types.WORD_TOKEN_TYPES for t, c in other_tokens))
return is_chain, phrases if is_chain else []
@classmethod
def alternate_form(cls, language, dictionary, canonical):
choices = address_config.sample_phrases.get((language, dictionary), {}).get(canonical)
if not choices:
return canonical
return random.choice(choices)
@classmethod
def phrase(cls, chain, language, country=None):
if not chain:
return NULL_CHAIN_QUERY
chain_phrase = safe_decode(chain)
prep_phrase_type = CategoryPreposition.random(language, country=country)
if prep_phrase_type in (None, CategoryPreposition.NULL):
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
if not values:
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
prep_phrase = safe_decode(prep_phrase)
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
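# A minimal usage sketch (hypothetical venue name; requires the chains gazetteer and
# address config data to be loaded, so this is illustrative rather than a test):
if __name__ == '__main__':
    is_chain, phrases = Chain.extract(u'Dunkin Donuts')
    if is_chain:
        query = Chain.phrase(u'Dunkin Donuts', 'en', country='us')
        print(query)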

View File

View File

@@ -0,0 +1,61 @@
import six
from collections import Mapping
def recursive_merge(a, b):
for k, v in six.iteritems(b):
if isinstance(v, Mapping) and v:
existing = a.get(k, v)
merged = recursive_merge(existing, v)
a[k] = merged
else:
a[k] = b[k]
return a
class DoesNotExist:
pass
def nested_get(obj, keys, default=DoesNotExist):
if len(keys) == 0:
return obj
try:
for key in keys[:-1]:
obj = obj.get(key, {})
if not hasattr(obj, 'items'):
return default
key = keys[-1]
return obj.get(key, default)
except AttributeError:
return default
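# Illustrative examples of the helpers above (made-up dicts):
#   recursive_merge({'x': {'y': 1}}, {'x': {'z': 2}})   -> {'x': {'y': 1, 'z': 2}}
#   nested_get({'a': {'b': {'c': 1}}}, ('a', 'b', 'c')) -> 1
#   nested_get({'a': {'b': {}}}, ('a', 'b', 'c'), None) -> None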
def alternative_probabilities(properties):
if properties is None:
return None, None
probs = []
alternatives = []
if 'probability' in properties:
prob = properties['probability']
props = properties['default']
probs.append(prob)
alternatives.append(props)
elif 'alternatives' not in properties and 'default' in properties:
prob = 1.0
props = properties['default']
probs.append(prob)
alternatives.append(props)
elif 'alternatives' not in properties and 'default' not in properties:
return None, None
alts = properties.get('alternatives', [])
for alt in alts:
prob = alt.get('probability', 1.0 / len(alts))
props = alt['alternative']
probs.append(prob)
alternatives.append(props)
return alternatives, probs
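# A sketch of the expected config shape (hypothetical values, not from the real configs):
#   props = {'default': {'phrase': 'near'}, 'probability': 0.6,
#            'alternatives': [{'alternative': {'phrase': 'close to'}, 'probability': 0.4}]}
#   alternative_probabilities(props)
#     -> ([{'phrase': 'near'}, {'phrase': 'close to'}], [0.6, 0.4])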

View File

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
'''
geodata.coordinates.conversion
------------------------------
Geographic coordinates typically come in two flavors: decimal and
DMS (degree-minute-second). This module parses a coordinate string
in just about any format. This was originally created for parsing
lat/lons found on the web.
Usage:
>>> latlon_to_decimal('40°42′46″N', '74°00′21″W') # returns (40.71277777777778, 74.00583333333333)
>>> latlon_to_decimal('40,74 N', '74,001 W') # returns (40.74, -74.001)
>>> to_valid_longitude(360.0)
>>> latitude_is_valid(90.0)
'''
import math
import re
from geodata.encoding import safe_decode
from geodata.math.floats import isclose
beginning_re = re.compile('^[^0-9\-]+', re.UNICODE)
end_re = re.compile('[^0-9]+$', re.UNICODE)
latitude_dms_regex = re.compile(ur'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$', re.I | re.UNICODE)
longitude_dms_regex = re.compile(ur'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$', re.I | re.UNICODE)
latitude_decimal_with_direction_regex = re.compile('^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
longitude_decimal_with_direction_regex = re.compile('^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}
def direction_sign(d):
if d is None:
return 1
d = d.lower().strip()
if d in direction_sign_map:
return direction_sign_map[d]
else:
raise ValueError('Invalid direction: {}'.format(d))
def int_or_float(d):
try:
return int(d)
except ValueError:
return float(d)
def degrees_to_decimal(degrees, minutes, seconds):
degrees = int_or_float(degrees)
minutes = int_or_float(minutes)
seconds = int_or_float(seconds)
return degrees + (minutes / 60.0) + (seconds / 3600.0)
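# Worked example: 40°42′46″ is degrees_to_decimal(40, 42, 46)
#   = 40 + 42/60.0 + 46/3600.0 ≈ 40.712778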
def is_valid_latitude(latitude):
'''Latitude must be real number between -90.0 and 90.0'''
try:
latitude = float(latitude)
except (ValueError, TypeError):
return False
if latitude > 90.0 or latitude < -90.0 or math.isinf(latitude) or math.isnan(latitude):
return False
return True
def is_valid_longitude(longitude):
'''Allow any valid real number to be a longitude'''
try:
longitude = float(longitude)
except (ValueError, TypeError):
return False
return not math.isinf(longitude) and not math.isnan(longitude)
def to_valid_latitude(latitude):
'''Nudge latitudes of exactly ±90.0 slightly toward zero so they remain usable downstream'''
if not is_valid_latitude(latitude):
raise ValueError('Invalid latitude {}'.format(latitude))
if isclose(latitude, 90.0):
latitude = 89.9999
elif isclose(latitude, -90.0):
latitude = -89.9999
return latitude
def to_valid_longitude(longitude):
'''Convert longitude into the -180 to 180 scale'''
if not is_valid_longitude(longitude):
raise ValueError('Invalid longitude {}'.format(longitude))
while longitude <= -180.0:
longitude += 360.0
while longitude > 180.0:
longitude -= 360.0
return longitude
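# Examples of the wrapping behavior above:
#   to_valid_longitude(190.0)  -> -170.0
#   to_valid_longitude(360.0)  -> 0.0
#   to_valid_longitude(-200.0) -> 160.0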
def latlon_to_decimal(latitude, longitude):
have_lat = False
have_lon = False
latitude = safe_decode(latitude).strip(u' ,;|')
longitude = safe_decode(longitude).strip(u' ,;|')
latitude = latitude.replace(u',', u'.')
longitude = longitude.replace(u',', u'.')
lat_dms = latitude_dms_regex.match(latitude)
lat_dir = latitude_decimal_with_direction_regex.match(latitude)
if lat_dms:
d, m, s, c = lat_dms.groups()
sign = direction_sign(c)
latitude = degrees_to_decimal(d or 0, m or 0, s or 0)
have_lat = True
elif lat_dir:
d, c = lat_dir.groups()
sign = direction_sign(c)
latitude = int_or_float(d) * sign
have_lat = True
else:
latitude = re.sub(beginning_re, u'', latitude)
latitude = re.sub(end_re, u'', latitude)
lon_dms = longitude_dms_regex.match(longitude)
lon_dir = longitude_decimal_with_direction_regex.match(longitude)
if lon_dms:
d, m, s, c = lon_dms.groups()
sign = direction_sign(c)
longitude = degrees_to_decimal(d or 0, m or 0, s or 0)
have_lon = True
elif lon_dir:
d, c = lon_dir.groups()
sign = direction_sign(c)
longitude = int_or_float(d) * sign
have_lon = True
else:
longitude = re.sub(beginning_re, u'', longitude)
longitude = re.sub(end_re, u'', longitude)
latitude = float(latitude)
longitude = float(longitude)
if not is_valid_latitude(latitude):
raise ValueError('Invalid latitude: {}'.format(latitude))
if not is_valid_longitude(longitude):
raise ValueError('Invalid longitude: {}'.format(longitude))
latitude = to_valid_latitude(latitude)
longitude = to_valid_longitude(longitude)
return latitude, longitude

View File

View File

@@ -0,0 +1,262 @@
import pycountry
class Countries(object):
AFGHANISTAN = 'af'
ALAND_ISLANDS = 'ax'
ALBANIA = 'al'
ALGERIA = 'dz'
AMERICAN_SAMOA = 'as'
ANDORRA = 'ad'
ANGOLA = 'ao'
ANGUILLA = 'ai'
ANTARCTICA = 'aq'
ANTIGUA_AND_BARBUDA = 'ag'
ARGENTINA = 'ar'
ARMENIA = 'am'
ARUBA = 'aw'
AUSTRALIA = 'au'
AUSTRIA = 'at'
AZERBAIJAN = 'az'
BAHAMAS = 'bs'
BAHRAIN = 'bh'
BANGLADESH = 'bd'
BARBADOS = 'bb'
BELARUS = 'by'
BELGIUM = 'be'
BELIZE = 'bz'
BENIN = 'bj'
BERMUDA = 'bm'
BHUTAN = 'bt'
BOLIVIA = 'bo'
BONAIRE = 'bq'
BOSNIA_AND_HERZEGOVINA = 'ba'
BOTSWANA = 'bw'
BOUVET_ISLAND = 'bv'
BRAZIL = 'br'
BRITISH_INDIAN_OCEAN_TERRITORY = 'io'
BRITISH_VIRGIN_ISLANDS = 'vg'
BRUNEI_DARUSSALAM = 'bn'
BULGARIA = 'bg'
BURKINA_FASO = 'bf'
BURUNDI = 'bi'
CAMBODIA = 'kh'
CAMEROON = 'cm'
CANADA = 'ca'
CAPE_VERDE = 'cv'
CAYMAN_ISLANDS = 'ky'
CENTRAL_AFRICAN_REPUBLIC = 'cf'
CHAD = 'td'
CHILE = 'cl'
CHINA = 'cn'
CHRISTMAS_ISLAND = 'cx'
COCOS_KEELING_ISLANDS = 'cc'
COLOMBIA = 'co'
COMOROS = 'km'
COOK_ISLANDS = 'ck'
COSTA_RICA = 'cr'
COTE_DIVOIRE = 'ci'
CROATIA = 'hr'
CUBA = 'cu'
CURACAO = 'cw'
CYPRUS = 'cy'
CZECH_REPUBLIC = 'cz'
DENMARK = 'dk'
DEMOCRATIC_REPUBLIC_OF_THE_CONGO = 'cd'
DJIBOUTI = 'dj'
DOMINICA = 'dm'
DOMINICAN_REPUBLIC = 'do'
ECUADOR = 'ec'
EGYPT = 'eg'
EL_SALVADOR = 'sv'
EQUATORIAL_GUINEA = 'gq'
ERITREA = 'er'
ESTONIA = 'ee'
ETHIOPIA = 'et'
FALKLAND_ISLANDS_MALVINAS = 'fk'
FAROE_ISLANDS = 'fo'
FEDERATED_STATES_OF_MICRONESIA = 'fm'
FIJI = 'fj'
FINLAND = 'fi'
FRANCE = 'fr'
FRENCH_GUIANA = 'gf'
FRENCH_POLYNESIA = 'pf'
FRENCH_SOUTHERN_TERRITORIES = 'tf'
GABON = 'ga'
GAMBIA = 'gm'
GEORGIA = 'ge'
GERMANY = 'de'
GHANA = 'gh'
GIBRALTAR = 'gi'
GREECE = 'gr'
GREENLAND = 'gl'
GRENADA = 'gd'
GUADELOUPE = 'gp'
GUAM = 'gu'
GUATEMALA = 'gt'
GUERNSEY = 'gg'
GUINEA = 'gn'
GUINEA_BISSAU = 'gw'
GUYANA = 'gy'
HAITI = 'ht'
HEARD_ISLAND_AND_MCDONALD_ISLANDS = 'hm'
HONDURAS = 'hn'
HONG_KONG = 'hk'
HUNGARY = 'hu'
ICELAND = 'is'
INDIA = 'in'
INDONESIA = 'id'
IRAN = 'ir'
IRAQ = 'iq'
IRELAND = 'ie'
ISLE_OF_MAN = 'im'
ISRAEL = 'il'
ITALY = 'it'
JAMAICA = 'jm'
JAPAN = 'jp'
JERSEY = 'je'
JORDAN = 'jo'
KAZAKHSTAN = 'kz'
KENYA = 'ke'
KIRIBATI = 'ki'
KUWAIT = 'kw'
KYRGYZSTAN = 'kg'
LAOS = 'la'
LATVIA = 'lv'
LEBANON = 'lb'
LESOTHO = 'ls'
LIBERIA = 'lr'
LIBYA = 'ly'
LIECHTENSTEIN = 'li'
LITHUANIA = 'lt'
LUXEMBOURG = 'lu'
MACAO = 'mo'
MACEDONIA = 'mk'
MADAGASCAR = 'mg'
MALAWI = 'mw'
MALAYSIA = 'my'
MALDIVES = 'mv'
MALI = 'ml'
MALTA = 'mt'
MARSHALL_ISLANDS = 'mh'
MARTINIQUE = 'mq'
MAURITANIA = 'mr'
MAURITIUS = 'mu'
MAYOTTE = 'yt'
MEXICO = 'mx'
MOLDOVA = 'md'
MONACO = 'mc'
MONGOLIA = 'mn'
MONTENEGRO = 'me'
MONTSERRAT = 'ms'
MOROCCO = 'ma'
MOZAMBIQUE = 'mz'
MYANMAR = 'mm'
NAMIBIA = 'na'
NAURU = 'nr'
NEPAL = 'np'
NETHERLANDS = 'nl'
NEW_CALEDONIA = 'nc'
NEW_ZEALAND = 'nz'
NICARAGUA = 'ni'
NIGER = 'ne'
NIGERIA = 'ng'
NIUE = 'nu'
NORFOLK_ISLAND = 'nf'
NORTH_KOREA = 'kp'
NORTHERN_MARIANA_ISLANDS = 'mp'
NORWAY = 'no'
OMAN = 'om'
PAKISTAN = 'pk'
PALAU = 'pw'
PALESTINE = 'ps'
PANAMA = 'pa'
PAPUA_NEW_GUINEA = 'pg'
PARAGUAY = 'py'
PERU = 'pe'
PHILIPPINES = 'ph'
PITCAIRN_ISLANDS = 'pn'
POLAND = 'pl'
PORTUGAL = 'pt'
PUERTO_RICO = 'pr'
QATAR = 'qa'
REPUBLIC_OF_CONGO = 'cg'
REUNION = 're'
ROMANIA = 'ro'
RUSSIA = 'ru'
RWANDA = 'rw'
SAINT_BARTHELEMY = 'bl'
SAINT_HELENA_ASCENSION_AND_TRISTAN_DA_CUNHA = 'sh'
SAINT_KITTS_AND_NEVIS = 'kn'
SAINT_LUCIA = 'lc'
SAINT_MARTIN = 'mf'
SAINT_PIERRE_AND_MIQUELON = 'pm'
SAINT_VINCENT_AND_THE_GRENADINES = 'vc'
SAMOA = 'ws'
SAN_MARINO = 'sm'
SAO_TOME_AND_PRINCIPE = 'st'
SAUDI_ARABIA = 'sa'
SENEGAL = 'sn'
SERBIA = 'rs'
SEYCHELLES = 'sc'
SIERRA_LEONE = 'sl'
SINGAPORE = 'sg'
SINT_MAARTEN = 'sx'
SLOVAKIA = 'sk'
SLOVENIA = 'si'
SOLOMON_ISLANDS = 'sb'
SOMALIA = 'so'
SOUTH_AFRICA = 'za'
SOUTH_GEORGIA_AND_THE_SOUTH_SANDWICH_ISLANDS = 'gs'
SOUTH_KOREA = 'kr'
SOUTH_SUDAN = 'ss'
SPAIN = 'es'
SRI_LANKA = 'lk'
SUDAN = 'sd'
SURINAME = 'sr'
SVALBARD_AND_JAN_MAYEN = 'sj'
SWAZILAND = 'sz'
SWEDEN = 'se'
SWITZERLAND = 'ch'
SYRIA = 'sy'
TAIWAN = 'tw'
TAJIKISTAN = 'tj'
TANZANIA = 'tz'
THAILAND = 'th'
TIMOR_LESTE = 'tl'
TOGO = 'tg'
TOKELAU = 'tk'
TONGA = 'to'
TRINIDAD_AND_TOBAGO = 'tt'
TUNISIA = 'tn'
TURKEY = 'tr'
TURKMENISTAN = 'tm'
TURKS_AND_CAICOS_ISLANDS = 'tc'
TUVALU = 'tv'
UGANDA = 'ug'
UKRAINE = 'ua'
UNITED_ARAB_EMIRATES = 'ae'
UNITED_KINGDOM = 'gb'
UNITED_STATES = 'us'
UNITED_STATES_MINOR_OUTLYING_ISLANDS = 'um'
URUGUAY = 'uy'
US_VIRGIN_ISLANDS = 'vi'
UZBEKISTAN = 'uz'
VANUATU = 'vu'
VATICAN = 'va'
VENEZUELA = 've'
VIETNAM = 'vn'
WALLIS_AND_FUTUNA = 'wf'
WESTERN_SAHARA = 'eh'
YEMEN = 'ye'
ZAMBIA = 'zm'
ZIMBABWE = 'zw'
FORMER_SOVIET_UNION_COUNTRIES = set([RUSSIA, UKRAINE, BELARUS, KAZAKHSTAN, AZERBAIJAN, KYRGYZSTAN, GEORGIA, UZBEKISTAN, ARMENIA, TAJIKISTAN, MOLDOVA, TURKMENISTAN, LATVIA, LITHUANIA, ESTONIA])
CJK_COUNTRIES = set([CHINA, JAPAN, SOUTH_KOREA, TAIWAN, HONG_KONG, MACAO])
all_country_iso_codes = set([c.alpha2.lower() for c in pycountry.countries])
@classmethod
def is_valid_country_code(cls, alpha2_code):
return alpha2_code and alpha2_code.lower() in cls.all_country_iso_codes
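# A small sanity-check sketch (assumes the installed pycountry exposes .alpha2 as used above):
if __name__ == '__main__':
    assert Countries.is_valid_country_code(Countries.GERMANY)
    assert Countries.is_valid_country_code('US')
    assert not Countries.is_valid_country_code('zz')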

View File

@@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
import os
import six
import sys
import pycountry
from collections import defaultdict, OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.i18n.languages import *
from geodata.encoding import safe_decode
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'countries', 'names.yaml')
IGNORE_COUNTRIES = set([six.u('ZZ')])
COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])
LANGUAGE_COUNTRY_OVERRIDES = {
'en': {
'CD': safe_decode('Democratic Republic of the Congo'),
'CG': safe_decode('Republic of the Congo'),
},
# Countries where the local language is absent from CLDR
# Tajik / Tajikistan
'tg': {
'TJ': safe_decode('Тоҷикистон'),
},
# Maldivan / Maldives
'dv': {
'MV': safe_decode('ދިވެހިރާއްޖެ'),
}
}
class CountryNames(object):
def __init__(self, base_dir=CLDR_MAIN_PATH):
self.base_dir = base_dir
self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}
self.language_country_names = {}
self.country_language_names = defaultdict(dict)
self.country_official_names = defaultdict(OrderedDict)
self.country_local_names = defaultdict(OrderedDict)
local_languages = {}
country_local_language_names = defaultdict(dict)
for filename in os.listdir(base_dir):
lang = filename.split('.xml')[0]
if len(lang) > 3:
continue
names = self.cldr_country_names(lang)
lang = lang.lower()
self.language_country_names[lang] = names
for country, name in names.iteritems():
country = country.lower()
languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
local_languages[country] = languages
self.country_language_names[country.lower()][lang.lower()] = name
if lang in local_languages.get(country, {}):
country_local_language_names[country][lang] = name
for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
if l not in self.language_country_names:
self.language_country_names[l.lower()] = names
for c, name in six.iteritems(names):
self.country_language_names[c.lower()][l.lower()] = name
if c.lower() not in country_local_language_names:
country_local_language_names[c.lower()][l.lower()] = name
for country, langs in six.iteritems(local_languages):
names = country_local_language_names[country]
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
for i, (lang, default) in enumerate(langs.iteritems()):
name = names.get(lang)
if not name:
continue
if default or num_defaults == 0:
self.country_official_names[country][lang] = name
if num_defaults == 0:
break
self.country_local_names[country][lang] = name
def cldr_country_names(self, language):
'''
Country names are tricky as there can be several versions
and levels of verbosity e.g. United States of America
vs. the more commonly used United States. Most countries
have a similarly verbose form.
The CLDR repo (http://cldr.unicode.org/) has the most
comprehensive localized database of country names
(among other things), organized by language. This function
parses CLDR XML for a given language and returns a dictionary
of {country_code: name} for that language.
'''
filename = os.path.join(self.base_dir, '{}.xml'.format(language))
xml = etree.parse(open(filename))
country_names = defaultdict(dict)
for territory in xml.xpath('*//territories/*'):
country_code = territory.attrib['type']
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
continue
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
display_names = {}
for country_code, names in country_names.iteritems():
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
continue
default_name = names.get(None)
if country_code in COUNTRY_USE_SHORT_NAME:
display_names[country_code] = names.get('short', default_name)
elif country_code in COUNTRY_USE_VARIANT_NAME:
display_names[country_code] = names.get('variant', default_name)
elif default_name is not None:
display_names[country_code] = default_name
return display_names
def localized_name(self, country_code, language=None):
'''
Get the display name for a country code in the local language
e.g. Россия for Russia, España for Spain, etc.
For most countries there is a single official name. For countries
with more than one official language, this will return a concatenated
version separated by a slash e.g. Maroc / المغرب for Morocco.
Note that all of the exceptions in road_sign_languages.tsv are also
taken into account here so India for example uses the English name
rather than concatenating all 27 toponyms.
This method should be roughly consistent with OSM's display names.
Usage:
>>> country_names.localized_name('jp') # returns '日本'
>>> country_names.localized_name('be') # returns 'België / Belgique / Belgien'
'''
country_code = country_code.lower()
if language is None:
return six.u(' / ').join(OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
for n in self.country_official_names[country_code].values()).keys())
else:
return self.country_language_names.get(country_code, {}).get(language)
def alpha3_code(self, alpha2_code):
alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
return alpha3.upper() if alpha3 else None
def iso_3166_name(self, alpha2_code):
return self.iso_3166_names.get(alpha2_code.lower())
country_names = CountryNames()

View File

@@ -0,0 +1,16 @@
import csv
import re
from geodata.encoding import safe_encode, safe_decode
newline_regex = re.compile('\r\n|\r|\n')
csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')
def tsv_string(s):
return safe_encode(newline_regex.sub(u', ', safe_decode(s).strip()).replace(u'\t', u' '))
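# Example: tsv_string('foo\tbar\r\nbaz') -> 'foo bar, baz'
# (tabs become spaces, newlines become ', ', and the result is UTF-8 encoded)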
def unicode_csv_reader(filename, **kw):
for line in csv.reader(filename, **kw):
yield [unicode(c, 'utf-8') for c in line]

View File

View File

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
import math
EARTH_RADIUS_KM = 6373
def haversine_distance(lat1, lon1, lat2, lon2, radius=EARTH_RADIUS_KM):
"""Calculate the Haversine distance between two lat/lon pairs, given by:
a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
c = 2 ⋅ atan2( √a, √(1 − a) )
d = R ⋅ c
where R is the radius of the Earth (in kilometers). By default we use 6373 km,
a radius optimized for calculating distances at approximately 39 degrees from
the equator i.e. Washington, DC
:param lat1: first latitude
:param lon1: first longitude (use negative range for longitudes West of the Prime Meridian)
:param lat2: second latitude
:param lon2: second longitude (use negative range for longitudes West of the Prime Meridian)
:param radius: radius of the Earth in (miles|kilometers) depending on the desired units
"""
lat1 = math.radians(lat1)
lat2 = math.radians(lat2)
lon1 = math.radians(lon1)
lon2 = math.radians(lon2)
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (math.sin(dlat / 2.0)) ** 2 + math.cos(lat1) * math.cos(lat2) * (math.sin(dlon/2.0)) ** 2
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
d = radius * c
return d
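# A quick usage sketch: approximate New York City to Washington, DC distance.
# Coordinates are approximate; with the default radius this comes out to roughly 328 km.
if __name__ == '__main__':
    print(haversine_distance(40.7128, -74.0060, 38.9072, -77.0369))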

View File

@@ -0,0 +1,34 @@
import six
text_type = six.text_type
string_types = six.string_types
binary_type = six.binary_type
def safe_decode(value, encoding='utf-8', errors='strict'):
if isinstance(value, text_type):
return value
if isinstance(value, (string_types, binary_type)):
return value.decode(encoding, errors)
else:
return binary_type(value).decode(encoding, errors)
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
if not isinstance(value, (string_types, binary_type)):
return binary_type(value)
if isinstance(value, text_type):
return value.encode(encoding, errors)
else:
if hasattr(incoming, 'lower'):
incoming = incoming.lower()
if hasattr(encoding, 'lower'):
encoding = encoding.lower()
if value and encoding != incoming:
value = safe_decode(value, encoding, errors)
return value.encode(encoding, errors)
else:
return value
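# Round-trip examples (Python 2 semantics, where str is the binary type):
#   safe_decode('caf\xc3\xa9') -> u'café'
#   safe_encode(u'café')       -> 'caf\xc3\xa9'
#   safe_decode(123)           -> u'123'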

scripts/geodata/enum.py
View File

@@ -0,0 +1,62 @@
class EnumValue(object):
def __init__(self, value, name=None):
self.value = value
self.name = name
def __hash__(self):
return self.value
def __cmp__(self, other):
if isinstance(other, EnumValue):
return self.value.__cmp__(other.value)
else:
return self.value.__cmp__(other)
def __unicode__(self):
return self.name
def __str__(self):
return self.name
def __repr__(self):
return self.name
class EnumMeta(type):
def __init__(self, name, bases, dict_):
self.registry = self.registry.copy()
self.name_registry = self.name_registry.copy()
for k, v in dict_.iteritems():
if isinstance(v, EnumValue) and v not in self.registry:
if v.name is None:
v.name = k
self.registry[v.value] = v
self.name_registry[v.name] = v
return super(EnumMeta, self).__init__(name, bases, dict_)
def __iter__(self):
return self.registry.itervalues()
def __getitem__(self, key):
return self.registry[key]
class Enum(object):
__metaclass__ = EnumMeta
registry = {}
name_registry = {}
@classmethod
def from_id(cls, value):
try:
return cls.registry[value]
except KeyError:
raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))
@classmethod
def from_string(cls, name):
try:
return cls.name_registry[name]
except KeyError:
raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))
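# A minimal usage sketch of the enum machinery above (hypothetical enum, Python 2):
if __name__ == '__main__':
    class Color(Enum):
        RED = EnumValue(0)
        BLUE = EnumValue(1, name='blue')

    assert Color.from_id(0) is Color.RED
    assert Color.from_string('blue') is Color.BLUE
    assert sorted(v.value for v in Color) == [0, 1]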

View File

@@ -0,0 +1,38 @@
import os
import subprocess
import six
def download_file(url, dest, retries=3, retry_delay=5):
ensure_dir(os.path.dirname(dest))
return subprocess.check_output(['curl', url, '-L', '-w', '%{http_code}',
'--retry', six.text_type(retries),
'--retry-delay', six.text_type(retry_delay),
'-o', dest, '--silent']) == '200'
def unzip_file(filename, dest):
ensure_dir(dest)
return subprocess.check_call(['unzip', '-o', filename, '-d', dest]) == 0
def remove_file(filename):
os.unlink(filename)
def ensure_dir(d):
if not os.path.exists(d):
os.makedirs(d)
class cd:
"""Context manager for changing the current working directory"""
def __init__(self, path):
self.path = path
def __enter__(self):
self.saved_path = os.getcwd()
os.chdir(self.path)
def __exit__(self, etype, value, traceback):
os.chdir(self.saved_path)
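# Example use of the cd context manager (assumes /tmp exists):
if __name__ == '__main__':
    with cd('/tmp'):
        print(os.getcwd())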

View File

View File

@@ -0,0 +1,688 @@
'''
create_geonames_tsv.py
----------------------
This script formats the open GeoNames database (as well as
its accompanying postal codes data set) into a schema'd
tab-separated value file.
It generates a C header which uses an enum for the field names.
This way if new fields are added or there's a typo, etc. the
error will show up at compile-time.
The relevant C modules which operate on this data are:
geodb_builder.c
geonames.c
As well as the generated headers:
geonames_fields.h
postal_fields.h
'''
import argparse
import csv
import logging
import operator
import os
import re
import sqlite3
import subprocess
import sys
import pycountry
import unicodedata
import urllib
import urlparse
from collections import defaultdict, OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import *
from geodata.file_utils import *
from geodata.countries.country_names import *
from geodata.encoding import safe_encode, safe_decode
from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
from geodata.i18n.languages import *
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.log import log_to_file
multispace_regex = re.compile('[\s]+')
def encode_field(value):
return multispace_regex.sub(' ', safe_encode((value if value is not None else '')))
log_to_file(sys.stderr)
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
os.path.pardir, 'data', 'geonames')
COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
CONTINENT_FEATURE_CODES = ('CONT',)
ADMIN_1_FEATURE_CODES = ('ADM1',)
ADMIN_2_FEATURE_CODES = ('ADM2',)
ADMIN_3_FEATURE_CODES = ('ADM3',)
ADMIN_4_FEATURE_CODES = ('ADM4',)
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
'PPLR', 'PPLS', 'STLMT')
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
class boundary_types:
COUNTRY = 0
ADMIN1 = 1
ADMIN2 = 2
ADMIN3 = 3
ADMIN4 = 4
ADMIN_OTHER = 5
LOCALITY = 6
NEIGHBORHOOD = 7
geonames_admin_dictionaries = OrderedDict([
(boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
(boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
(boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
(boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
(boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
(boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
(boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
(boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
])
# Inserted post-query
DUMMY_BOUNDARY_TYPE = '-1 as type'
DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'
DUMMY_LANGUAGE_PRIORITY = '0 as language_priority'
class GeonamesField(object):
def __init__(self, name, c_constant, default=None, is_dummy=False):
self.name = name
self.c_constant = c_constant
self.default = default
self.is_dummy = is_dummy
geonames_fields = [
# Field if alternate_names present, default field name if not, C header constant
GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
GeonamesField(DUMMY_LANGUAGE_PRIORITY, 'GEONAMES_LANGUAGE_PRIORITY', is_dummy=True),
GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
GeonamesField('gn.population', 'GEONAMES_POPULATION'),
GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'),
GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'),
GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'),
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'),
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
]
def geonames_field_index(s):
for i, f in enumerate(geonames_fields):
if f.c_constant == s:
return i
return None
DUMMY_BOUNDARY_TYPE_INDEX = geonames_field_index('GEONAMES_BOUNDARY_TYPE')
DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = geonames_field_index('GEONAMES_HAS_WIKIPEDIA_ENTRY')
GEONAMES_ID_INDEX = geonames_field_index('GEONAMES_ID')
LANGUAGE_INDEX = geonames_field_index('GEONAMES_ISO_LANGUAGE')
DUMMY_LANGUAGE_PRIORITY_INDEX = geonames_field_index('GEONAMES_LANGUAGE_PRIORITY')
CANONICAL_NAME_INDEX = geonames_field_index('GEONAMES_CANONICAL')
NAME_INDEX = geonames_field_index('GEONAMES_NAME')
COUNTRY_CODE_INDEX = geonames_field_index('GEONAMES_COUNTRY_CODE')
POPULATION_INDEX = geonames_field_index('GEONAMES_POPULATION')
PREFERRED_INDEX = geonames_field_index('GEONAMES_IS_PREFERRED_NAME')
HISTORICAL_INDEX = geonames_field_index('GEONAMES_IS_HISTORICAL')
geonames_admin_joins = '''
left join admin1_codes a1
on a1.code = gn.admin1_code
and a1.country_code = gn.country_code
left join admin2_codes a2
on a2.code = gn.admin2_code
and a2.admin1_code = gn.admin1_code
and a2.country_code = gn.country_code
left join admin3_codes a3
on a3.code = gn.admin3_code
and a3.admin1_code = gn.admin1_code
and a3.admin2_code = gn.admin2_code
and a3.country_code = gn.country_code
left join admin4_codes a4
on a4.code = gn.admin4_code
and a4.admin1_code = gn.admin1_code
and a4.admin2_code = gn.admin2_code
and a4.admin3_code = gn.admin3_code
and a4.country_code = gn.country_code
'''
# Canonical names are stored in the geonames table with alternates
# stored in a separate table. UNION ALL query will capture them all.
base_geonames_query = '''
select {geonames_fields}
from geonames gn
join countries c
on gn.country_code = c.country_code
{admin_joins}
{{predicate}}
union all
select {alt_name_fields}
from geonames gn
join countries c
on gn.country_code = c.country_code
join alternate_names an
on an.geonames_id = gn.geonames_id
and iso_language not in ('doi','faac','iata',
'icao','link','post','tcid')
and an.alternate_name != gn.name
{admin_joins}
{{predicate}}
'''.format(
geonames_fields=', '.join((f.name if f.default is None else
'{} as {}'.format(f.default, f.name)
for f in geonames_fields)),
alt_name_fields=', '.join((f.name for f in geonames_fields)),
admin_joins=geonames_admin_joins
)
IGNORE_COUNTRY_POSTAL_CODES = set([
'AR', # GeoNames has pre-1999 postal codes
])
postal_code_fields = [
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'),
GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'),
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
]
def postal_code_field_index(s):
for i, f in enumerate(postal_code_fields):
if f.c_constant == s:
return i
return None
POSTAL_CODE_INDEX = postal_code_field_index('GN_POSTAL_CODE')
POSTAL_CODE_POP_INDEX = postal_code_field_index('GN_POSTAL_COUNTRY_POPULATION')
postal_codes_query = '''
select
{fields}
from postal_codes p
join countries c
on p.country_code = c.country_code
left join (
select
gn.geonames_id,
alternate_name,
country_code,
gn.name
from alternate_names an
join geonames gn
on an.geonames_id = gn.geonames_id
where iso_language = 'post'
) as n
on p.postal_code = n.alternate_name
and p.country_code = n.country_code
left join admin1_codes a1
on a1.code = p.admin1_code
and p.country_code = a1.country_code
left join admin2_codes a2
on a2.code = p.admin2_code
and a2.admin1_code = p.admin1_code
and a2.country_code = p.country_code
left join admin3_codes a3
on a3.code = p.admin3_code
and a3.admin1_code = p.admin1_code
and a3.admin2_code = p.admin2_code
and a3.country_code = p.country_code
where p.country_code not in ({exclude_country_codes})
group by postal_code, p.country_code
'''.format(
fields=','.join([f.name for f in postal_code_fields]),
exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
wikipedia_query = '''
select alternate_name, geonames_id, is_preferred_name
from alternate_names
where iso_language = 'link'
and alternate_name like '%%en.wikipedia%%'
order by alternate_name, is_preferred_name
'''
BATCH_SIZE = 2000
wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
def normalize_wikipedia_title(title):
return safe_decode(title).replace(u'_', u' ')
def normalize_wikipedia_url(url):
url = urllib.unquote_plus(url)
parsed = urlparse.urlsplit(url)
if parsed.query:
params = urlparse.parse_qs(parsed.query)
if 'title' in params:
return normalize_wikipedia_title(params['title'][0])
title = parsed.path.rsplit('/', 1)[-1]
if title not in ('index.php', 'index.html'):
return normalize_wikipedia_title(title)
return None
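# Illustrative inputs/outputs for the normalization above (made-up URLs):
#   'https://en.wikipedia.org/wiki/New_York_City'          -> u'New York City'
#   'https://en.wikipedia.org/w/index.php?title=New_York'  -> u'New York'
#   'https://en.wikipedia.org/w/index.php'                 -> None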
def normalize_name(name):
name = name.replace('&', 'and')
name = name.replace('-', ' ')
name = name.replace(', ', ' ')
name = name.replace(',', ' ')
return name
saint_replacements = [
('st.', 'saint'),
('st.', 'st'),
('st', 'saint')
]
abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')
def normalize_display_name(name):
return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')
def utf8_normalize(s, form='NFD'):
return unicodedata.normalize(form, s)
def get_wikipedia_titles(db):
d = defaultdict(dict)
cursor = db.execute(wikipedia_query)
while True:
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
for (url, geonames_id, is_preferred) in batch:
title = normalize_wikipedia_url(safe_encode(url))
if title is not None and title.strip():
title = utf8_normalize(normalize_name(title))
d[title.lower()][geonames_id] = int(is_preferred or 0)
return d
def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
'''
Writes geonames.tsv using the specified db to the specified data directory
'''
filename = os.path.join(out_dir, 'geonames.tsv')
temp_filename = filename + '.tmp'
f = open(temp_filename, 'w')
writer = csv.writer(f, 'tsv_no_quote')
init_languages()
init_country_names()
wiki_titles = get_wikipedia_titles(db)
logging.info('Fetched Wikipedia titles')
# Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
if boundary_type != boundary_types.COUNTRY:
predicate = 'where gn.feature_code in ({codes})'.format(
codes=','.join(['"{}"'.format(c) for c in codes])
)
else:
# The query for countries in GeoNames is somewhat non-trivial
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
query = base_geonames_query.format(
predicate=predicate
)
cursor = db.execute(query)
i = 1
while True:
# Fetch rows in batches to save memory
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
rows = []
for row in batch:
row = list(row)
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
language = row[LANGUAGE_INDEX]
country_code = row[COUNTRY_CODE_INDEX]
is_preferred = int(row[PREFERRED_INDEX] or 0)
is_historical = int(row[HISTORICAL_INDEX] or 0)
lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
null_language = not language.strip()
is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
alpha2_code = None
is_orig_name = False
if boundary_type == boundary_types.COUNTRY:
alpha2_code = row[COUNTRY_CODE_INDEX]
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
# Set the canonical for countries to the local name, see country_official_name in country_names.py
country_canonical = country_localized_display_name(alpha2_code.lower())
if not country_canonical or not country_canonical.strip():
raise ValueError('Could not get local canonical name for country code={}'.format(alpha2_code))
row[CANONICAL_NAME_INDEX] = country_canonical
geonames_id = row[GEONAMES_ID_INDEX]
name = utf8_normalize(safe_decode(row[NAME_INDEX]))
# Skip purely numeric names (postal codes are handled separately)
if name.isdigit():
continue
wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
row[NAME_INDEX] = name
if boundary_type == boundary_types.COUNTRY:
norm_name = normalize_name(name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
wiki_row = []
have_wikipedia = geonames_id in wikipedia_entries
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
'''
The following heuristics assign a numerical priority to each name
alternative so that, for ambiguous names, the value can be used as part
of the ranking function during the sort step below. The higher the value,
the more likely the name is the best resolution for the given entity.
'''
if is_historical:
# Historical names, unlikely to be used
language_priority = 0
elif not null_language and language != 'abbr' and lang_spoken is None:
# Name of a place in language not widely spoken e.g. Japanese name for a US toponym
language_priority = 1
elif null_language and not is_preferred and not is_canonical:
# Null-language alternate names not marked as preferred, dubious
language_priority = 2
elif language == 'abbr' and not is_preferred:
# Abbreviation, not preferred
language_priority = 3
elif language == 'abbr' and is_preferred:
# Abbreviation, preferred e.g. NYC, UAE
language_priority = 4
elif lang_spoken and not lang_official and not is_preferred:
# Non-preferred name but in a spoken (non-official) language
language_priority = 5
elif lang_official == 1 and not is_preferred:
# Name in an official language, not preferred
language_priority = 6
elif null_language and not is_preferred and is_canonical:
# Canonical name, may be overly official e.g. Islamic Republic of Pakistan
language_priority = 7
elif is_preferred and not lang_official:
# Preferred names, not an official language
language_priority = 8
elif is_preferred and lang_official:
# Official language preferred
language_priority = 9
row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
if have_wikipedia:
wiki_row = row[:]
wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
rows.append(map(encode_field, wiki_row))
canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
have_normalized = False
if is_orig_name:
canonical_row = wiki_row[:] if have_wikipedia else row[:]
canonical_row_name = normalize_display_name(name)
if canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
have_normalized = True
rows.append(map(encode_field, canonical_row))
if not have_wikipedia:
rows.append(map(encode_field, row))
# Country names have more specialized logic
if boundary_type == boundary_types.COUNTRY:
wikipedia_entries = wiki_titles.get(canonical.lower(), {})
canonical_row_name = normalize_display_name(canonical)
canonical_row = row[:]
if is_orig_name:
canonical = safe_decode(canonical)
canonical_row[NAME_INDEX] = safe_encode(canonical)
norm_name = normalize_name(canonical.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
if not wikipedia_entries:
norm_name = normalize_name(canonical_row_name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
have_wikipedia = geonames_id in wikipedia_entries
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
if have_wikipedia:
canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
if (name != canonical):
rows.append(map(encode_field, canonical_row))
if canonical_row_name != canonical and canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
rows.append(map(encode_field, canonical_row))
if alpha2_code and is_orig_name:
alpha2_row = row[:]
alpha2_row[NAME_INDEX] = alpha2_code
alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
rows.append(map(encode_field, alpha2_row))
if alpha2_code.lower() in country_alpha3_map and is_orig_name:
alpha3_row = row[:]
alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
rows.append(map(encode_field, alpha3_row))
writer.writerows(rows)
logging.info('Did {} batches'.format(i))
i += 1
cursor.close()
f.flush()
f.close()
logging.info('Sorting...')
env = os.environ.copy()
env['LC_ALL'] = 'C'
command = ['sort', '-t\t', '-u', '--ignore-case',
'-k{0},{0}'.format(NAME_INDEX + 1),
# If there's a Wikipedia link to this name for the given id, sort first
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
# Language priority rules as above
'-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1),
# Sort descending by population (basic proxy for relevance)
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
# group rows for the same geonames ID together
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
# preferred names come first within that grouping
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
# since uniquing is done on the sort key, add language
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
'-o', filename, temp_filename]
p = subprocess.Popen(command, env=env)
return_code = p.wait()
if return_code != 0:
raise subprocess.CalledProcessError(return_code, command)
os.unlink(temp_filename)
def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
filename = os.path.join(out_dir, 'postal_codes.tsv')
temp_filename = filename + '.tmp'
f = open(temp_filename, 'w')
writer = csv.writer(f, 'tsv_no_quote')
cursor = db.execute(postal_codes_query)
i = 1
while True:
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
rows = [
map(encode_field, row)
for row in batch
]
writer.writerows(rows)
logging.info('Did {} batches'.format(i))
i += 1
cursor.close()
f.close()
logging.info('Sorting...')
subprocess.check_call([
'sort', '-t\t', '--ignore-case',
'-k{0},{0}'.format(POSTAL_CODE_INDEX + 1),
'-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1),
'-o', filename,
temp_filename
])
os.unlink(temp_filename)
# Generates a C header telling us the order of the fields as written
GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'src', 'geonames_fields.h')
GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
{fields},
NUM_GEONAMES_FIELDS
}};
'''.format(fields=''',
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
with open(filename, 'w') as f:
f.write(GEONAMES_FIELDS_HEADER_FILE)
POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'src', 'postal_fields.h')
POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
{fields},
NUM_POSTAL_FIELDS
}};
'''.format(fields=''',
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))
def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
with open(filename, 'w') as f:
f.write(POSTAL_FIELDS_HEADER_FILE)
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--db',
default=DEFAULT_GEONAMES_DB_PATH,
help='SQLite db file')
parser.add_argument('-o', '--out',
default=DEFAULT_DATA_DIR, help='output directory')
args = parser.parse_args()
db = sqlite3.connect(args.db)
create_geonames_tsv(db, args.out)
create_postal_codes_tsv(db, args.out)
write_geonames_fields_header()
write_postal_fields_header()
db.close()

View File

@@ -0,0 +1,30 @@
import sqlite3
from collections import defaultdict
class GeoNamesDB(object):
names_query = '''
select iso_language, alternate_name,
is_preferred_name, is_short_name
from alternate_names
where geonames_id = ?
and is_historic != '1'
and is_colloquial != '1'
and iso_language != 'post'
order by iso_language, cast(is_preferred_name as integer) desc, cast(is_short_name as integer)
'''
def __init__(self, filename):
self.db = sqlite3.connect(filename)
def query(self, query, *params):
return self.db.execute(query, params)
def get_alternate_names(self, geonames_id):
cursor = self.query(self.names_query, geonames_id)
language_names = defaultdict(list)
for language, name, is_preferred, is_short in cursor:
language_names[language].append((name,
int(is_preferred or 0),
int(is_short or 0)))
return dict(language_names)
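# A brief usage sketch (assumes a GeoNames SQLite db built by create_geonames_sqlite_db.py;
# the geonames_id below is illustrative):
if __name__ == '__main__':
    import sys
    db = GeoNamesDB(sys.argv[1])
    names = db.get_alternate_names(5128581)
    for language, alternates in names.iteritems():
        print('{} {}'.format(language, alternates[:3]))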

View File

@@ -0,0 +1,333 @@
import os
import shutil
import sqlite3
import tempfile
import urlparse
import urllib2
import subprocess
import logging
import argparse
import csv
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
from geodata.geonames.paths import *
from geodata.file_utils import *
from geodata.log import *
from itertools import islice, chain
log_to_file(sys.stderr)
logger = logging.getLogger('geonames.sqlite')
BASE_URL = 'http://download.geonames.org/export/'
DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
HIERARCHY_ZIP_FILE = 'hierarchy.zip'
ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
HIERARCHY_ZIP_FILE,
ALTERNATE_NAMES_ZIP_FILE)
# base_url, local_dir, is_gzipped, local_filename
GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
(DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
(DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
(ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
]
def download_file(url, dest):
logger.info('Downloading file from {}'.format(url))
subprocess.check_call(['wget', url, '-O', dest])
def admin_ddl(admin_level):
columns = ['country_code TEXT'] + \
['admin{}_code TEXT'.format(i)
for i in xrange(1, admin_level)]
create = '''
CREATE TABLE admin{level}_codes (
geonames_id INT,
code TEXT,
name TEXT,
{fields}
)'''.format(level=admin_level,
fields=''',
'''.join(columns))
indices = (
'''CREATE INDEX admin{}_code_index ON
admin{}_codes (code)'''.format(admin_level, admin_level),
'''CREATE INDEX admin{}_gn_id_index ON
admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
)
return (create, ) + indices
geonames_ddl = {
'geonames': (
'''CREATE TABLE geonames (
geonames_id INT PRIMARY KEY,
name TEXT,
ascii_name TEXT,
alternate_names TEXT,
latitude DOUBLE,
longitude DOUBLE,
feature_class TEXT,
feature_code TEXT,
country_code TEXT,
cc2 TEXT,
admin1_code TEXT,
admin2_code TEXT,
admin3_code TEXT,
admin4_code TEXT,
population LONG DEFAULT 0,
elevation INT,
dem INT,
timezone TEXT,
modification_date TEXT)''',
'''CREATE INDEX feature_code ON
geonames (feature_code)''',
'''CREATE INDEX country_code ON
geonames (country_code)''',
'''CREATE INDEX admin_codes ON
geonames (country_code, admin1_code, admin2_code, admin3_code, admin4_code)''',
),
'alternate_names': (
'''CREATE TABLE alternate_names (
alternate_name_id INT PRIMARY KEY,
geonames_id INT,
iso_language TEXT,
alternate_name TEXT,
is_preferred_name BOOLEAN DEFAULT 0,
is_short_name BOOLEAN DEFAULT 0,
is_colloquial BOOLEAN DEFAULT 0,
is_historic BOOLEAN DEFAULT 0)''',
'''CREATE INDEX geonames_id_index ON
alternate_names (geonames_id)''',
'''CREATE INDEX geonames_id_alt_name_index ON
alternate_names(geonames_id, alternate_name)''',
),
'hierarchy': (
'''CREATE TABLE hierarchy (
parent_id INT,
child_id INT,
type TEXT
);''',
'''CREATE INDEX parent_child_index ON
hierarchy (parent_id, child_id)''',
'''CREATE INDEX child_parent_index ON
hierarchy (child_id, parent_id)''',
),
'postal_codes': (
'''CREATE TABLE postal_codes (
country_code TEXT,
postal_code TEXT,
place_name TEXT,
admin1 TEXT,
admin1_code TEXT,
admin2 TEXT,
admin2_code TEXT,
admin3 TEXT,
admin3_code TEXT,
latitude DOUBLE,
longitude DOUBLE,
accuracy INT
)''',
'''CREATE INDEX post_code_index ON
postal_codes (country_code, postal_code)''',
'''CREATE INDEX postal_code_admins ON
postal_codes (country_code, admin1_code, admin2_code, admin3_code)''',
),
'admin1_codes': admin_ddl(1),
'admin2_codes': admin_ddl(2),
'admin3_codes': admin_ddl(3),
'admin4_codes': admin_ddl(4),
}
geonames_file_table_map = {
('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
('', HIERARCHY_ZIP_FILE): 'hierarchy',
('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
}
country_codes_create_table = (
'drop table if exists country_codes',
'''
create table country_codes as
select distinct country_code from geonames
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
''',
)
proper_countries_create_table = (
'drop table if exists proper_countries',
'''
create table proper_countries as
select * from geonames
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
and country_code in (select country_code from country_codes)
''',
)
territories_create_table = (
'drop table if exists territories',
'''
create table territories as
select * from geonames where feature_code = 'TERR'
and country_code not in (select country_code from proper_countries);
''',
)
countries_create_table = (
'drop table if exists countries',
'''
create table countries as
select * from proper_countries
union
select * from territories;
''',
'create index country_geonames_id on countries (geonames_id)',
'create index country_country_code on countries (country_code)',
)
country_alises_create_table = (
'drop table if exists country_aliases',
'''
create table country_aliases as
select name, country_code
from countries
union
select alternate_name, country_code
from alternate_names an
join countries c
on c.geonames_id = an.geonames_id
where alternate_name != ''
and iso_language not in ('doi','faac','iata',
'icao','link','post','tcid')
'''
)
country_table_create_statements = list(chain(country_codes_create_table,
proper_countries_create_table,
territories_create_table,
countries_create_table,
country_alises_create_table))
def create_table(conn, table):
cursor = conn.cursor()
create_statements = geonames_ddl[table]
cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
for statement in create_statements:
cursor.execute(statement)
conn.commit()
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
def populate_admin_table(conn, admin_level):
logging.info('Doing admin level {}'.format(admin_level))
columns = ['geonames_id',
'admin{}_code'.format(admin_level),
'name',
'country_code']
columns.extend(['admin{}_code'.format(i)
for i in xrange(1, admin_level)])
admin_insert_statement = '''
insert into "admin{}_codes"
select {}
from geonames
where feature_code = "ADM{}"
'''.format(admin_level, ','.join(columns), admin_level)
conn.execute(admin_insert_statement)
conn.commit()
logging.info('Done with admin level {}'.format(admin_level))
def import_geonames_table(conn, table, f, batch_size=2000):
# escape the brackets around the values format string so we can use later
statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
cursor = conn.cursor()
for i, batch in enumerate(batch_iter(f, batch_size)):
num_cols = len(batch[0])
cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
conn.commit()
cursor = conn.cursor()
logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
cursor.close()
def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
conn = sqlite3.connect(db_file)
logging.info('Created database at {}'.format(db_file))
for url, directory, is_gzipped, filename in GEONAMES_FILES:
table = geonames_file_table_map[(directory, filename)]
create_table(conn, table)
full_url = urlparse.urljoin(url, filename)
dest_dir = os.path.join(temp_dir, directory)
ensure_dir(dest_dir)
dest_file = os.path.join(dest_dir, filename)
download_file(full_url, dest_file)
if is_gzipped:
unzip_file(dest_file, dest_dir)
filename = dest_file.replace('.zip', '.txt')
reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
lines = (map(safe_decode, line) for line in reader)
import_geonames_table(conn, table, lines)
logging.info('Creating countries tables')
for statement in country_table_create_statements:
conn.execute(statement)
conn.commit()
logging.info('Creating admin tables')
for admin_level in xrange(1, 5):
create_table(conn, 'admin{}_codes'.format(admin_level))
populate_admin_table(conn, admin_level)
conn.close()
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--temp-dir',
default=tempfile.gettempdir(),
help='Temporary work directory')
parser.add_argument('-o', '--out',
default=DEFAULT_GEONAMES_DB_PATH,
help='SQLite3 db filename')
args = parser.parse_args()
create_geonames_sqlite_db(args.temp_dir, args.out)

View File

@@ -0,0 +1,9 @@
import os
this_dir = os.path.realpath(os.path.dirname(__file__))
GEONAMES_DB_NAME = 'geonames.db'
DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
os.path.pardir, os.path.pardir,
'data', 'geonames', GEONAMES_DB_NAME)

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env bash
: '
create_geoplanet_db.sh
-------------------------
Shell script to download GeoPlanet and derive inputs
for address parser training set construction.
Usage: ./create_geoplanet_db.sh out_dir
'
if [ "$#" -ge 1 ]; then
OUT_DIR=$1
mkdir -p $OUT_DIR
else
OUT_DIR=$(pwd)
fi
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
# Internet Archive URL
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
GEOPLANET_ORIGINAL_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
GEOPLANET_ALL_PLACES_FILE="geoplanet_all_places.tsv"
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
GEOPLANET_ALIASES_FILE="geoplanet_aliases.tsv"
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
GEOPLANET_DB_FILE="geoplanet.db"
function download_file() {
echo "Downloading $1"
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
if [ "$response" -ne "200" ]; then
echo "Could not download $1"
exit 1
fi
}
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
echo "Downloading GeoPlanet"
download_file $GEOPLANET_URL $GEOPLANET_ZIP_FILE
fi
cd $OUT_DIR
echo "Unzipping GeoPlanet file"
unzip -o $GEOPLANET_ZIP_FILE
echo "Creating GeoPlanet postal codes file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
echo "Creating GeoPlanet all places file"
tail -n+2 $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_ALL_PLACES_FILE
echo "Creating GeoPlanet places file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Island" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
echo "Creating GeoPlanet aliases file"
tail -n+2 $GEOPLANET_ORIGINAL_ALIASES_FILE > $GEOPLANET_ALIASES_FILE
echo "Fetching GeoNames concordance"
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
echo "Creating SQLite db"
echo "
DROP TABLE IF EXISTS places;
CREATE TABLE places (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.separator \t
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
CREATE INDEX places_parent_id_index on places(parent_id);
CREATE INDEX places_country_code on places(country_code);
DROP TABLE IF EXISTS all_places;
CREATE TABLE all_places AS SELECT * FROM places WHERE 0;
.import $OUT_DIR/$GEOPLANET_ALL_PLACES_FILE all_places
DROP TABLE IF EXISTS postal_codes;
CREATE TABLE postal_codes (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.import $OUT_DIR/$GEOPLANET_POSTAL_CODES_FILE postal_codes
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
DROP TABLE IF EXISTS admins;
CREATE TABLE admins (
id integer primary key,
country_code text,
state_id integer,
county_id integer,
local_admin_id integer,
country_id integer,
continent_id integer
);
.import $OUT_DIR/$GEOPLANET_ADMINS_FILE admins
CREATE INDEX admin_country_code on admins(country_code);
CREATE INDEX admin_state_id on admins(state_id);
CREATE INDEX admin_county_id on admins(county_id);
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
CREATE INDEX admin_country_id on admins(country_id);
CREATE INDEX admin_continent_id on admins(continent_id);
DROP TABLE IF EXISTS aliases;
CREATE TABLE aliases (
id integer,
name text,
name_type text,
language text
);
.import $OUT_DIR/$GEOPLANET_ALIASES_FILE aliases
CREATE INDEX alias_id on aliases(id);
DROP TABLE IF EXISTS geonames_concordance;
CREATE TABLE geonames_concordance (
id integer primary key,
geonames_id integer,
name text,
lat number,
lon number
);
.mode csv
.import $OUT_DIR/$GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE

View File

@@ -0,0 +1,353 @@
import argparse
import csv
import itertools
import os
import six
import sqlite3
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.equivalence import equivalent
from geodata.address_expansions.gazetteers import *
from geodata.address_formatting.formatter import AddressFormatter
from geodata.countries.names import country_names
from geodata.postal_codes.validation import postcode_regexes
from geodata.names.normalization import name_affixes
from geodata.places.config import place_config
from geodata.csv_utils import tsv_string, unicode_csv_reader
GEOPLANET_DB_FILE = 'geoplanet.db'
GEOPLANET_FORMAT_DATA_TAGGED_FILENAME = 'geoplanet_formatted_addresses_tagged.tsv'
GEOPLANET_FORMAT_DATA_FILENAME = 'geoplanet_formatted_addresses.tsv'
class GeoPlanetFormatter(object):
# Map of GeoPlanet language codes to ISO-639 alpha2 language codes
language_codes = {
'ENG': 'en',
'JPN': 'ja',
'GER': 'de',
'SPA': 'es',
'FRE': 'fr',
'UNK': 'unk',
'ITA': 'it',
'POR': 'pt',
'POL': 'pl',
'ARA': 'ar',
'CZE': 'cs',
'SWE': 'sv',
'CHI': 'zh',
'RUM': 'ro',
'FIN': 'fi',
'DUT': 'nl',
'NOR': 'nb',
'DAN': 'da',
'HUN': 'hu',
'KOR': 'ko',
}
non_latin_script_languages = {
'JPN', # Japanese
'ARA', # Arabic
'CHI', # Chinese
'KOR', # Korean
}
ALIAS_PREFERRED = 'P'
ALIAS_PREFERRED_FOREIGN = 'Q'
ALIAS_VARIANT = 'V'
ALIAS_ABBREVIATED = 'A'
ALIAS_COLLOQUIAL = 'S'
# Map of GeoPlanet place types to address formatter types
place_types = {
'Continent': AddressFormatter.WORLD_REGION,
'Country': AddressFormatter.COUNTRY,
'CountryRegion': AddressFormatter.COUNTRY_REGION,
'State': AddressFormatter.STATE,
'County': AddressFormatter.STATE_DISTRICT,
'Island': AddressFormatter.ISLAND,
'Town': AddressFormatter.CITY,
# Note: if we do general place queries from GeoPlanet, this
# may have to be mapped more carefully
'LocalAdmin': AddressFormatter.CITY_DISTRICT,
'Suburb': AddressFormatter.SUBURB,
}
def __init__(self, geoplanet_db):
self.db = sqlite3.connect(geoplanet_db)
# These aren't too large and it's easier to have them in memory
self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
self.aliases = defaultdict(list)
self.coterminous_admins = {}
self.admins_with_ambiguous_city = set()
print('Doing admin ambiguities')
for row in self.db.execute('''select p.id,
(select count(*) from places where parent_id = p.id) as num_places,
(select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
p2.id
from places p
join places p2
on p2.parent_id = p.id
and p.name = p2.name
and p.place_type != "Town"
and p2.place_type = "Town"
group by p.id'''):
place_id, num_places, num_towns, coterminous_town_id = row
num_places = int(num_places)
num_towns = int(num_towns)
if num_places == 1 and num_towns == 1:
self.coterminous_admins[place_id] = coterminous_town_id
self.admins_with_ambiguous_city.add(place_id)
print('num coterminous: {}'.format(len(self.coterminous_admins)))
print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))
print('Doing aliases')
for row in self.db.execute('''select a.* from aliases a
left join places p
on a.id = p.id
and p.place_type in ("State", "County")
and a.language != p.language
where name_type != "S" -- no colloquial aliases like "The Big Apple"
and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
and p.id is NULL -- exclude foreign-language states/county names
order by id, language,
case name_type
when "P" then 1
when "Q" then 2
when "V" then 3
when "A" then 4
when "S" then 5
else 6
end'''):
place = self.places.get(row[0])
if not place:
continue
self.aliases[row[0]].append(row[1:])
print('Doing variant aliases')
variant_aliases = 0
for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
join places p using(id)
where a.name_type = "V"
and a.language = p.language''')):
place_name, country_code = row[-2:]
country = country_code.lower()
row = row[:-2]
place_id, alias, name_type, language = row
language = self.language_codes[language]
if language != 'unk':
alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
if alias_sans_affixes:
alias = alias_sans_affixes
place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
if place_name_sans_affixes:
place_name = place_name_sans_affixes
else:
language = None
if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
self.aliases[row[0]].append(row[1:])
variant_aliases += 1
if i % 10000 == 0 and i > 0:
print('tested {} variant aliases with {} positives'.format(i, variant_aliases))
self.aliases = dict(self.aliases)
self.formatter = AddressFormatter()
def get_place_hierarchy(self, place_id):
all_places = []
original_place_id = place_id
place = self.places[place_id]
all_places.append((place_id, ) + place)
place_id = place[-1]
while place_id != 1 and place_id != original_place_id:
place = self.places[place_id]
all_places.append((place_id,) + place)
place_id = place[-1]
return all_places
def get_aliases(self, place_id):
return self.aliases.get(place_id, [])
def cleanup_name(self, name):
return name.strip(' ,-')
def format_postal_codes(self, tag_components=True):
all_postal_codes = self.db.execute('select * from postal_codes')
for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
country = country.lower()
postcode_language = language
language = self.language_codes[language]
if len(postal_code) <= 3:
postcode_regex = postcode_regexes.get(country)
valid_postcode = False
if postcode_regex:
match = postcode_regex.match(postal_code)
if match and match.end() == len(postal_code):
valid_postcode = True
if not valid_postcode:
continue
# If the county/state is coterminous with a city and contains only one place,
# set the parent_id to the city instead
if parent_id in self.coterminous_admins:
parent_id = self.coterminous_admins[parent_id]
place_hierarchy = self.get_place_hierarchy(parent_id)
containing_places = defaultdict(set)
language_places = {None: containing_places}
original_language = language
have_default_language = False
if place_hierarchy:
base_place_id, _, _, _, base_place_type, _ = place_hierarchy[0]
base_place_type = self.place_types[base_place_type]
else:
base_place_id = None
base_place_type = None
place_types_seen = set()
for place_id, country, name, lang, place_type, parent in place_hierarchy:
country = country.lower()
# First language
if not have_default_language and lang != postcode_language:
language = self.language_codes[lang]
have_default_language = True
place_type = self.place_types[place_type]
if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
continue
name = self.cleanup_name(name)
containing_places[place_type].add(name)
aliases = self.get_aliases(place_id)
for name, name_type, alias_lang in aliases:
if not alias_lang:
alias_lang = 'UNK'
if alias_lang == lang and lang != 'UNK':
alias_language = None
else:
alias_language = self.language_codes[alias_lang]
language_places.setdefault(alias_language, defaultdict(set))
lang_places = language_places[alias_language]
name = self.cleanup_name(name)
lang_places[place_type].add(name)
place_types_seen.add(place_type)
default_city_names = set([name.lower() for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])])
for language, containing_places in six.iteritems(language_places):
if language is None:
language = original_language
country_localized_name = country_names.localized_name(country, language)
if country_localized_name:
containing_places[AddressFormatter.COUNTRY].add(country_localized_name)
country_alpha3_code = country_names.alpha3_code(country)
if country_alpha3_code and language in (None, 'ENG'):
containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)
keys = containing_places.keys()
all_values = containing_places.values()
keys_set = set(keys)
for i, values in enumerate(itertools.product(*all_values)):
components = {
AddressFormatter.POSTCODE: postal_code
}
if not default_city_names:
components.update(zip(keys, values))
else:
for k, v in zip(keys, values):
if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
components[k] = v
format_language = language if self.formatter.template_language_matters(country, language) else None
formatted = self.formatter.format_address(components, country, language=format_language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
component_keys = set(components)
components = place_config.dropout_components(components, (), country=country, population=0)
if len(components) > 1 and set(components) ^ component_keys:
formatted = self.formatter.format_address(components, country, language=format_language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
def build_training_data(self, out_dir, tag_components=True):
if tag_components:
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_TAGGED_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
else:
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
i = 0
for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if __name__ == '__main__':
if len(sys.argv) < 3:
sys.exit('Usage: python geoplanet_training_data.py geoplanet_db_path out_dir')
geoplanet_db_path = sys.argv[1]
out_dir = sys.argv[2]
geoplanet = GeoPlanetFormatter(geoplanet_db_path)
geoplanet.build_training_data(out_dir)

View File

View File

@@ -0,0 +1,41 @@
VISIT, VISIT_EDGE, POST_VISIT = range(3)
def strongly_connected_components(graph):
'''
Find strongly connected components in a graph using iterative
depth-first search.
Based on:
http://code.activestate.com/recipes/578507-strongly-connected-components-of-a-directed-graph/
'''
identified = set()
stack = []
index = {}
boundaries = []
for v in graph:
if v not in index:
todo = [(VISIT, v)]
while todo:
op, v = todo.pop()
if op == VISIT:
index[v] = len(stack)
stack.append(v)
boundaries.append(index[v])
todo.append((POST_VISIT, v))
todo.extend([(VISIT_EDGE, w) for w in graph[v]])
elif op == VISIT_EDGE:
if v not in index:
todo.append((VISIT, v))
elif v not in identified:
while index[v] < boundaries[-1]:
boundaries.pop()
else:
# op == POST_VISIT
if boundaries[-1] == index[v]:
boundaries.pop()
scc = stack[index[v]:]
del stack[index[v]:]
identified.update(scc)
yield scc
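# A small usage sketch (illustrative only): two 2-cycles plus an isolated node.
#   >>> graph = {1: [2], 2: [1, 3], 3: [4], 4: [3], 5: []}
#   >>> sorted(sorted(component) for component in strongly_connected_components(graph))
#   [[1, 2], [3, 4], [5]]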

View File

@@ -0,0 +1,32 @@
def topsort(graph):
'''
Topological sort for a dependency graph, e.g.
Usage:
>>> graph = {
'a': ['b'],
'b': ['d'],
'c': ['d', 'a'],
'd': [],
}
>>> topsort(graph)
Returns: ['d', 'b', 'a', 'c']
'''
todos = set(graph.keys())
seen = set()
result = []
while todos:
for key in todos:
deps = graph[key]
if len([d for d in deps if d in seen]) == len(deps):
break
else:
raise Exception('Cycle: {}'.format(todos))
todos.remove(key)
result.append(key)
seen.add(key)
return result

View File

View File

@@ -0,0 +1,139 @@
import argparse
import csv
import os
import requests
from collections import Counter
from cStringIO import StringIO
from lxml import etree
from unicode_paths import CLDR_DIR
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language', 'countries')
CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
'supplementalData.xml')
ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'
REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')
def write_country_official_languages_file(xml, out_dir):
lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
lang_writer = csv.writer(lang_file, delimiter='\t')
def get_population_pct(lang):
return int(lang.attrib.get('populationPercent', 0))
lang_scripts = {}
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
territories = lang.get('territories')
if (language_code, None) not in lang_scripts:
lang_scripts[(language_code, None)] = scripts
if not territories:
continue
for territory in territories.strip().split():
lang_scripts[(language_code, territory.lower())] = scripts
for territory in xml.xpath('//territoryInfo/territory'):
country_code = territory.attrib['type'].lower()
if country_code == UNKNOWN_COUNTRY:
continue
langs = territory.xpath('languagePopulation')
languages = Counter()
official = set()
regional = set()
for lang in langs:
language = lang.attrib['type'].lower().split('_')[0]
official_status = lang.attrib.get('officialStatus')
languages[language] += float(lang.attrib['populationPercent'])
if official_status and official_status != REGIONAL:
official.add(language)
elif official_status == REGIONAL:
regional.add(language)
if official:
languages = Counter({l: c for l, c in languages.iteritems()
if l in official or l in regional})
else:
languages = Counter({l: c for l, c in languages.most_common(1)})
for lang, pct in languages.most_common():
if lang in UNKNOWN_LANGUAGES:
continue
script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))
lang_writer.writerow((country_code, lang, script.replace(' ', ','),
str(min(pct, 100.0)), str(int(lang in official))))
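# A row of the resulting country_language.tsv might look like (hypothetical values):
#   fr	fr	Latn	95.0	1
# i.e. country_code, language, comma-separated scripts, population percent, is_official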
RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'
def write_languages_file(langs, macro, out_dir):
lang_file = open(os.path.join(out_dir, 'iso_languages.tsv'), 'w')
writer = csv.writer(lang_file, delimiter='\t')
writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
'ISO 639-1', 'type', 'macro'))
macro_reader = csv.reader(StringIO(macro), delimiter='\t')
headers = macro_reader.next()
assert len(headers) == 3
macros = {minor_code: macro_code for (macro_code, minor_code, status)
in macro_reader if status != RETIRED}
lang_reader = csv.reader(StringIO(langs), delimiter='\t')
headers = lang_reader.next()
assert headers[:6] == ['Id', 'Part2B', 'Part2T',
'Part1', 'Scope', 'Language_Type']
for line in lang_reader:
iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
macro = macros.get(iso639_3, '')
# Only living languages that are either individual or macro
if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
writer.writerow((iso639_3, iso639_2b, iso639_2t,
iso639_1, scope, macro))
def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
response = requests.get(ISO_639_3)
langs = response.content
response = requests.get(ISO_MACROLANGUAGES)
macro = response.content
write_languages_file(langs, macro, out_dir)
supplemental = open(CLDR_SUPPLEMENTAL_DATA)
xml = etree.parse(supplemental)
write_country_official_languages_file(xml, out_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out',
default=DEFAULT_LANGUAGES_DIR,
help='Out directory')
args = parser.parse_args()
fetch_cldr_languages(args.out)

View File

@@ -0,0 +1,30 @@
import os
import shutil
import subprocess
import sys
import tempfile
from unicode_paths import CLDR_DIR
from geodata.file_utils import ensure_dir
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'
def download_cldr(temp_dir=None):
if os.path.exists(CLDR_DIR):
shutil.rmtree(CLDR_DIR)
ensure_dir(CLDR_DIR)
if not temp_dir:
temp_dir = tempfile.gettempdir()
cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])
subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
if __name__ == '__main__':
download_cldr(*sys.argv[1:])

View File

@@ -0,0 +1,37 @@
import re
import requests
import six.moves.urllib_parse as urlparse
import ujson
requests.models.json = ujson
GOOGLE_I18N_API = 'http://i18napis.appspot.com'
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')
class GoogleI18N(object):
'''
Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
and caches it in a dictionary for each country. These requests are
lightweight, so for a given run of a program, max 250 requests
will be made.
'''
def __init__(self):
self.responses = {}
def get(self, country_code):
ret = self.responses.get(country_code.lower())
if ret is None:
url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
response = requests.get(url)
if response.ok:
ret = response.json()
self.responses[country_code.lower()] = ret
else:
self.responses[country_code.lower()] = {}
return ret
google_i18n = GoogleI18N()
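# Usage sketch (field names follow the public i18napis address-data schema and may change):
#   >>> data = google_i18n.get('gb')
#   >>> data.get('key')   # e.g. u'GB'
#   >>> data.get('fmt')   # address format string, if provided for the country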

View File

@@ -0,0 +1,86 @@
import os
import csv
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import unicode_csv_reader
LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language')
country_languages = defaultdict(OrderedDict)
# Only official and de facto official, no official_regional
official_languages = defaultdict(OrderedDict)
regional_languages = defaultdict(OrderedDict)
road_language_overrides = defaultdict(OrderedDict)
languages = set()
all_languages = languages
osm_admin1_ids = set()
languages_initialized = False
def init_languages(languages_dir=LANGUAGES_DIR):
global languages_initialized
if languages_initialized:
return
path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
if not os.path.exists(path):
raise ValueError('File does not exist: {}'.format(path))
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
country_languages[country][lang] = int(is_official)
languages.add(lang)
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
if int(is_official) or len(country_languages[country]) == 1:
official_languages[country][lang] = 1
path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
for country, lang, default in csv.reader(open(path), delimiter='\t'):
road_language_overrides[country][lang] = int(default)
if lang not in languages:
languages.add(lang)
path = os.path.join(languages_dir, 'regional', 'adm1.tsv')
for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
if key == 'osm':
osm_admin1_ids.add(tuple(value.split(':')))
for lang in langs.split(','):
regional_languages[(country, key, value)][lang] = int(default)
if lang not in country_languages[country]:
country_languages[country][lang] = 0
if lang not in languages:
languages.add(lang)
languages_initialized = True
init_languages()
def get_country_languages(country, official=True, overrides=True):
if official:
languages = official_languages[country]
else:
languages = country_languages[country]
if overrides:
road_overrides = road_language_overrides.get(country)
if road_overrides and road_overrides.values()[0]:
languages = road_overrides
elif road_overrides:
languages.update(road_overrides)
return languages
def get_regional_languages(country, key, value):
return regional_languages.get((country, key, value), OrderedDict())
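# Usage sketch (actual values depend on the TSV resources under resources/language):
#   >>> get_country_languages('fr')
#   OrderedDict([(u'fr', 1)])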

View File

@@ -0,0 +1,5 @@
import unicodedata
def strip_accents(s):
return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
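# e.g. strip_accents(u'Montr\xe9al') == u'Montreal' (combining marks are dropped after NFD)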

View File

@@ -0,0 +1,37 @@
import re
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
class Scanner(object):
'''
Simple scanner implementation in Python using regular expression groups.
Used to create dynamic lexicons for parsing various CLDR files
without compiling a C scanner. Only C scanners are used at runtime
'''
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
self.lexicon = lexicon
regexes, responses = zip(*lexicon)
self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
self.responses = responses
def scan(self, s):
for match in self.regex.finditer(safe_decode(s)):
i = match.lastindex
response = self.responses[i - 1]
token = match.group(i)
if not callable(response):
yield (token, response)
else:
responses = response(match, token)
if responses is not None:
for response, token in responses:
yield (token, response)
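# A minimal usage sketch (the lexicon below is illustrative, not one used by the repo):
#   >>> scanner = Scanner([('[0-9]+', 'NUM'), ('[a-z]+', 'WORD')])
#   >>> list(scanner.scan('abc 123'))
#   [(u'abc', 'WORD'), (u'123', 'NUM')]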

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,273 @@
'''
unicode_data.py
---------------
Python's unicodedata module is built against an outdated spec (Unicode 5.2). Since
Unicode categories are used e.g. in tokenization, we'd like to keep this
as up-to-date as possible with the latest standard.
'''
import csv
import os
import sys
from collections import defaultdict, namedtuple
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.file_utils import download_file
from geodata.string_utils import wide_unichr, wide_ord
from unicode_properties import *
from unicode_paths import UNICODE_DATA_DIR
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
unicode_categories = defaultdict(list)
unicode_blocks = defaultdict(list)
unicode_combining_classes = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)
unicode_properties = {}
unicode_script_ids = {}
unicode_blocks = {}
unicode_category_aliases = {}
unicode_property_aliases = {}
unicode_property_value_aliases = {}
unicode_word_breaks = {}
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
UNIDATA_FIELDS = [
'code',
'name',
'category',
'combining',
'bidi_category',
'decomp_mapping',
'decimal_value',
'digit_value',
'numeric_value',
'mirrored',
'unicode_1_name',
'comment',
'upper_mapping',
'lower_mapping',
'title_mapping',
]
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
def parse_unicode_data():
'''
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
'''
if not os.path.exists(LOCAL_UNIDATA_FILE):
download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
unidata_file = open(LOCAL_UNIDATA_FILE)
for line in csv.reader(unidata_file, delimiter=';'):
yield UnicodeDataRow(*line)
def iter_unicode_combining_classes():
return unicode_combining_classes.iteritems()
def iter_unicode_categories():
return unicode_categories.iteritems()
def get_unicode_category(cat):
return unicode_categories[cat]
def get_unicode_combining_class(c):
return unicode_combining_classes[c]
def get_unicode_categories():
'''
Build dict of unicode categories e.g.
{
'Lu': ['A', 'B', 'C', ...]
'Ll': ['a', 'b', 'c', ...]
}
'''
categories = defaultdict(list)
for row in parse_unicode_data():
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
return dict(categories)
def get_unicode_combining_classes():
'''
Build dict of unicode combining classes e.g.
{
'0': ['\x00', '\x01', '\x02', ...]
}
'''
combining_classes = defaultdict(list)
for row in parse_unicode_data():
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
return dict(combining_classes)
unicode_category_aliases = {
'letter': 'L',
'lower': 'Ll',
'lowercase': 'Ll',
'lowercaseletter': 'Ll',
'upper': 'Lu',
'uppercase': 'Lu',
'uppercaseletter': 'Lu',
'title': 'Lt',
'nonspacing mark': 'Mn',
'mark': 'M',
}
COMBINING_CLASS_PROP = 'canonical_combining_class'
BLOCK_PROP = 'block'
GENERAL_CATEGORY_PROP = 'general_category'
SCRIPT_PROP = 'script'
WORD_BREAK_PROP = 'word_break'
def init_unicode_categories():
'''
Initialize module-level dictionaries
'''
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
unicode_categories.update(get_unicode_categories())
unicode_combining_classes.update(get_unicode_combining_classes())
for key in unicode_categories.keys():
unicode_general_categories[key[0]].extend(unicode_categories[key])
script_chars = get_chars_by_script()
for i, script in enumerate(script_chars):
if script:
unicode_scripts[script.lower()].append(wide_unichr(i))
unicode_scripts = dict(unicode_scripts)
unicode_script_ids.update(build_master_scripts_list(script_chars))
unicode_blocks.update(get_unicode_blocks())
unicode_properties.update(get_unicode_properties())
unicode_property_aliases.update(get_property_aliases())
unicode_word_breaks.update(get_word_break_properties())
for key, value in get_property_value_aliases().iteritems():
key = unicode_property_aliases.get(key, key)
if key == GENERAL_CATEGORY_PROP:
for k, v in value.iteritems():
k = k.lower()
unicode_category_aliases[k] = v
if '_' in k:
unicode_category_aliases[k.replace('_', '')] = v
unicode_property_value_aliases[key] = value
regex_chars = re.compile('([\[\]\{\}\-\^])')
def replace_regex_chars(s):
return regex_chars.sub(r'\\\1', s)
def format_regex_char(i):
c = wide_unichr(i)
return replace_regex_chars(c.encode('unicode-escape'))
def make_char_set_regex(chars):
'''
Build a regex character set from a list of characters
'''
group_start = None
group_end = None
last_ord = -2
ords = map(wide_ord, chars)
ords.sort()
ords.append(None)
groups = []
for i, o in enumerate(ords):
if o is not None and o == last_ord + 1:
group_end = o
elif group_start is not None and group_end is not None:
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
group_end = None
group_start = o
elif group_start is not None and group_end is None:
groups.append(format_regex_char(group_start))
group_start = o
else:
group_start = o
last_ord = o
return u'[{}]'.format(u''.join(groups))
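# A quick sanity check with a contiguous run plus an isolated character (illustrative only):
#   >>> make_char_set_regex([u'a', u'b', u'c', u'x'])
#   u'[a-cx]'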
name_category = [
('control_chars', 'Cc'),
('other_format_chars', 'Cf'),
('other_not_assigned_chars', 'Cn'),
('other_private_use_chars', 'Co'),
('other_surrogate_chars', 'Cs'),
('letter_lower_chars', 'Ll'),
('letter_modifier_chars', 'Lm'),
('letter_other_chars', 'Lo'),
('letter_title_chars', 'Lt'),
('letter_upper_chars', 'Lu'),
('mark_spacing_combining_chars', 'Mc'),
('mark_enclosing_chars', 'Me'),
('mark_nonspacing_chars', 'Mn'),
('number_or_digit_chars', 'Nd'),
('number_letter_chars', 'Nl'),
('number_other_chars', 'No'),
('punct_connector_chars', 'Pc'),
('punct_dash_chars', 'Pd'),
('punct_close_chars', 'Pe'),
('punct_final_quote_chars', 'Pf'),
('punct_initial_quote_chars', 'Pi'),
('punct_other_chars', 'Po'),
('punct_open_chars', 'Ps'),
('currency_symbol_chars', 'Sc'),
('symbol_modifier_chars', 'Sk'),
('symbol_math_chars', 'Sm'),
('symbol_other_chars', 'So'),
('separator_line_chars', 'Zl'),
('separator_paragraph_chars', 'Zp'),
('space', 'Zs'),
]
def main():
init_unicode_categories()
for name, cat in name_category:
if cat not in unicode_categories:
continue
chars = unicode_categories[cat]
print u'{} = {};'.format(name, make_char_set_regex(chars))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,11 @@
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')
UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')
CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')

View File

@@ -0,0 +1,463 @@
'''
scripts.py
This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''
import csv
import os
import requests
import re
import sys
import tempfile
import requests
import subprocess
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree
from operator import itemgetter
from zipfile import ZipFile
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import *
from download_cldr import download_cldr
from languages import get_country_languages
from unicode_paths import UNICODE_DATA_DIR
from word_breaks import script_regex, regex_char_range
SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')
BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')
PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')
WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H
#include <stdlib.h>
#define NUM_CODEPOINTS {num_codepoints}
#define MAX_LANGS {max_langs}
typedef enum {{
{script_enum}
NUM_SCRIPTS
}} script_t;
#endif
'''
scripts_c_data_template = u'''
script_t char_scripts[] = {{
{char_scripts}
}};
script_code_t script_codes[] = {{
{script_codes}
}};
script_languages_t script_languages[] = {{
{script_languages}
}};
'''
script_code_template = '{{SCRIPT_{name}, "{code}"}}'
script_language_template = '{{{num_langs}, {languages}}}'
def unicode_to_integer(u):
return int('0x{}'.format(u), 16)
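# e.g. unicode_to_integer('005A') == 90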
def script_name_constant(i, u):
return u'SCRIPT_{} = {}'.format(u.upper(), i)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
def parse_char_range(r):
return [unicode_to_integer(u) for u in r.split('..')]
def get_chars_by_script():
scripts_file = open(LOCAL_SCRIPTS_FILE)
scripts = [None] * NUM_CODEPOINTS
# Lines look like:
# 0041..005A ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
for char_range, script, char_class in script_regex.findall(scripts_file.read()):
script_range = parse_char_range(char_range)
if len(script_range) == 2:
for i in xrange(script_range[0], script_range[1] + 1):
scripts[i] = script
elif script_range:
scripts[script_range[0]] = script
return scripts
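# e.g. get_chars_by_script()[0x0041] == 'Latin' (assuming Scripts.txt has been downloaded)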
COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'
def parse_file(f):
for line in f:
line = line.split(COMMENT_CHAR)[0].strip()
if not line:
continue
tokens = line.split(DELIMITER_CHAR)
if tokens:
yield [t.strip() for t in tokens]
def get_property_aliases():
prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)
aliases = {}
for line in parse_file(prop_aliases_file):
prop = line[1]
prop_aliases = [line[0]] + line[2:]
for alias in prop_aliases:
aliases[alias.lower()] = prop.lower()
return aliases
def get_property_value_aliases():
prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)
value_aliases = defaultdict(dict)
for line in parse_file(prop_value_aliases_file):
prop = line[0]
if prop not in ('ccc', 'gc'):
value = line[2]
aliases = [line[1]] + line[3:]
else:
value = line[1]
aliases = line[2:]
for alias in aliases:
value_aliases[prop.lower()][alias] = value
return dict(value_aliases)
def get_unicode_blocks():
blocks_file = open(LOCAL_BLOCKS_FILE)
blocks = defaultdict(list)
for line in parse_file(blocks_file):
char_range, block = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(wide_unichr(i))
elif char_range:
blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks)
def get_unicode_properties():
props_file = open(LOCAL_PROPS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props)
def get_word_break_properties():
props_file = open(LOCAL_WORD_BREAKS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(wide_unichr(i))
elif char_range:
props[prop].append(wide_unichr(char_range[0]))
return dict(props)
def build_master_scripts_list(chars):
all_scripts = OrderedDict.fromkeys(filter(bool, chars))
for i, script in enumerate(all_scripts.keys()):
all_scripts[script] = i + 1
# Unknown script for all characters not covered
all_scripts[UNKNOWN_SCRIPT] = 0
return all_scripts
SCRIPT_ALIASES_SUPPLEMENTAL = {
'Hant': 'Han',
'Hans': 'Han'
}
def get_script_codes(all_scripts):
if not os.path.exists(LOCAL_ISO_15924_FILE):
temp_dir = tempfile.gettempdir()
script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])
# This comes as a .zip
script_codes_response = requests.get(ISO_15924_URL)
zf = ZipFile(StringIO(script_codes_response.content))
iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
# Strip out the comments, etc.
temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
if line.strip() and not line.strip().startswith('#')])
f = open(LOCAL_ISO_15924_FILE, 'w')
f.write(safe_encode(temp_iso15924_file))
f.close()
script_codes_file = open(LOCAL_ISO_15924_FILE)
script_codes = {}
seen_scripts = set()
# Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
if name in all_scripts:
script_codes[code] = name
seen_scripts.add(name)
else:
normalized_name = name.split('(')[0].strip()
if normalized_name in all_scripts and normalized_name not in seen_scripts:
script_codes[code] = normalized_name
seen_scripts.add(normalized_name)
value_aliases = get_property_value_aliases()
script_aliases = value_aliases['sc']
for code, script in script_aliases.iteritems():
if code not in script_codes and script in all_scripts:
script_codes[code] = script
script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
return script_codes
SCRIPT_CODE_ALIASES = {
'Jpan': ['Hani', 'Hira', 'Kana'],
'Kore': ['Hang', 'Han']
}
def extract_language_scripts(xml):
language_scripts = defaultdict(list)
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
for script in scripts.split():
script_aliases = SCRIPT_CODE_ALIASES.get(script)
if not script_aliases:
language_scripts[language_code].append(script)
else:
language_scripts[language_code].extend(script_aliases)
return language_scripts
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single language scripts to inform
# the language classifier
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml)
country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
if not os.path.exists(country_languages_path):
fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)
country_language_file = open(country_languages_path)
country_language_reader = csv.reader(country_language_file, delimiter='\t')
countries = set([country for country, lang, script, pct, is_official
in country_language_reader])
spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))
script_code_languages = defaultdict(list)
for language, scripts in language_scripts.iteritems():
if language not in spoken_languages:
continue
for script in scripts:
script_code_languages[script].append(language)
script_languages = defaultdict(list)
for script_code, script_name in script_codes.iteritems():
langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages
def main(out_dir=SRC_DIR):
# Output is a C header and data file, see templates
out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
download_file(PROPS_URL, LOCAL_PROPS_FILE)
download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages()
max_langs = 0
for script, langs in script_languages.iteritems():
num_langs = len(langs)
if num_langs > max_langs:
max_langs = num_langs
# Generate C header and constants
script_enum = u'''
'''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])
out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
max_langs=max_langs,
script_enum=script_enum))
out_header.close()
# Generate C data file
char_scripts_data = u''',
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])
sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]
script_language_data = u''',
'''.join([script_language_template.format(num_langs=len(langs),
languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
for langs in sorted_lang_scripts])
out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
char_scripts=char_scripts_data,
script_codes=script_codes_data,
script_languages=script_language_data))
out_file.close()
if __name__ == '__main__':
main(*sys.argv[1:])

View File

@@ -0,0 +1,140 @@
'''
word_breaks.py
This script is used to automatically build ranges of unicode characters
from the unicode spec's word break properties. These ranges help us
build a tokenizer that does the right thing in every language with regard
to word segmentation. The lines outputted by this script can be pasted
into scanner.re before compilation.
'''
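# Each printed line has the shape below (the exact character ranges are illustrative):
#   numeric_chars = [\u0030-\u0039\u0660-\u0669];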
import requests
from collections import defaultdict
import re
# Operate on WordBreakProperty.txt file
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
# Operate on Scripts.txt file
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
'han',
'hiragana',
'hangul',
'tibetan',
'thai',
'lao',
'javanese',
'balinese',
'yi',
])
def regex_char_range(match):
r = match.split('..')
# Wide version
return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])
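# e.g. regex_char_range('0041..005A') returns the literal text u'\u0041-\u005a'
# (backslash escapes intended for pasting into scanner.re, not the decoded characters)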
def get_letter_range(text, *regexes):
char_ranges = []
for line in text.split('\n'):
for regex in regexes:
m = regex.match(line)
if m:
char_ranges.append(regex_char_range(m.group(1)))
return char_ranges
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if script.lower() in scripts and char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
def get_char_class(text, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')
def get_hangul_syllable_ranges(text):
char_ranges = defaultdict(list)
for line in text.split('\n'):
m = hangul_syllable_type_regex.match(line)
if m:
char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
return dict(char_ranges)
name_funcs = [
('hebrew_letter_chars', hebrew_letter_regex),
('format_chars', format_regex),
('extend_chars', extend_regex),
('katakana_chars', katakana_regex),
('letter_other_alpha_chars', other_alpha_letter_regex),
('mid_letter_chars', mid_letter_regex),
('mid_number_chars', mid_number_regex),
('mid_num_letter_chars', mid_num_letter_regex),
('numeric_chars', numeric_regex),
('extend_num_letter_chars', extend_num_letter_regex),
]
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L', re.I)  # match char classes that start with L (letters)
def main():
''' Insert these lines into scanner.re '''
response = requests.get(WORD_BREAK_PROPERTIES_URL)
if response.ok:
for name, reg in name_funcs:
s = get_letter_range(response.content, reg)
print '{} = [{}];'.format(name, ''.join(s))
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
if response.ok:
syllable_ranges = get_hangul_syllable_ranges(response.content)
for name, ranges in syllable_ranges.iteritems():
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
response = requests.get(SCRIPTS_URL)
if response.ok:
s = ''.join(get_char_class(response.content, numbers_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))
s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,18 @@
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.math.sampling import weighted_choice
IntersectionQuery = namedtuple('IntersectionQuery', 'road1, intersection_phrase, road2')
NULL_INTERSECTION_QUERY = IntersectionQuery(None, None, None)
class Intersection(object):
@classmethod
def phrase(cls, language, country=None):
values, probs = address_config.alternative_probabilities('cross_streets.intersection', language, country=country)
if not values:
return None
phrase, props = weighted_choice(values, probs)
return phrase
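# Usage sketch (assumes a 'cross_streets.intersection' entry exists in the address config
# for the requested language; the return value below is purely illustrative):
#   >>> Intersection.phrase('en', country='us')
#   u'and'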

View File

View File

@@ -0,0 +1,100 @@
import argparse
import logging
import os
import subprocess
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME, TOPONYM_LANGUAGE_DATA_FILENAME
LANGUAGES_ALL_FILE = 'languages.all'
LANGUAGES_RANDOM_FILE = 'languages.random'
LANGUAGES_TRAIN_FILE = 'languages.train'
LANGUAGES_CV_FILE = 'languages.cv'
LANGUAGES_TEST_FILE = 'languages.test'
def create_language_training_data(osm_dir, split_data=True, train_split=0.8, cv_split=0.1):
language_all_path = os.path.join(osm_dir, LANGUAGES_ALL_FILE)
ways_path = os.path.join(osm_dir, WAYS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', ways_path, '>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(ways_path))
addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', addresses_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(addresses_path))
formatted_path = os.path.join(osm_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)
if os.system(' '.join(['cat', formatted_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(formatted_path))
toponyms_path = os.path.join(osm_dir, TOPONYM_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', toponyms_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(toponyms_path))
languages_random_path = os.path.join(osm_dir, LANGUAGES_RANDOM_FILE)
if os.system(u' '.join(['shuf', '--random-source=/dev/urandom', language_all_path, '>', languages_random_path])) != 0:
raise SystemError('shuffle failed')
languages_train_path = os.path.join(osm_dir, LANGUAGES_TRAIN_FILE)
if split_data:
languages_test_path = os.path.join(osm_dir, LANGUAGES_TEST_FILE)
num_lines = sum((1 for line in open(languages_random_path)))
train_lines = int(train_split * num_lines)
test_lines = num_lines - train_lines
cv_lines = int(test_lines * (cv_split / (1.0 - train_split))) + 1
subprocess.check_call(['split', '-l', str(train_lines), languages_random_path, os.path.join(osm_dir, 'language-split-')])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_train_path])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
languages_cv_path = os.path.join(osm_dir, LANGUAGES_CV_FILE)
subprocess.check_call(['split', '-l', str(cv_lines), languages_test_path, os.path.join(osm_dir, 'language-split-')])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_cv_path])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
else:
subprocess.check_call(['mv', languages_random_path, languages_train_path])
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--no-split',
action='store_false',
default=True,
help='Do not split data into train/cv/test')
parser.add_argument('-t', '--train-split',
type=float,
default=0.8,
help='Train split percentage as a float (default 0.8)')
parser.add_argument('-c', '--cv-split',
type=float,
default=0.1,
help='Cross-validation split percentage as a float (default 0.1)')
parser.add_argument('-o', '--osm-dir',
default=os.getcwd(),
help='OSM directory')
args = parser.parse_args()
if args.train_split + args.cv_split >= 1.0:
raise ValueError('Train split + cross-validation split must be less than 1.0')
if not os.path.exists(args.osm_dir):
raise ValueError('OSM directory does not exist')
create_language_training_data(args.osm_dir, split_data=args.no_split, train_split=args.train_split, cv_split=args.cv_split)

View File

@@ -0,0 +1,176 @@
import os
import six
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.string_utils import wide_iter, wide_ord
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
# For toponyms, we want to limit the countries we consider to those where
# the place names can themselves be considered training examples of the language
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
'fr': set(['fr']),
'it': set(['it']),
'de': set(['de', 'at']),
'nl': set(['nl']),
'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
'ni', 'hn']),
'pt': set(['pt', 'br']),
}
char_scripts = get_chars_by_script()
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
lang_scripts = defaultdict(set)
for script, langs in six.iteritems(script_languages):
for lang in langs:
lang_scripts[lang].add(script)
lang_scripts = dict(lang_scripts)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
MAX_ASCII = 127
def get_string_script(s):
s = safe_decode(s)
str_len = len(s)
script = last_script = UNKNOWN_SCRIPT
is_ascii = True
script_len = 0
for c in wide_iter(s):
script = char_scripts[wide_ord(c)]
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
script = last_script
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
if (script_len < str_len):
for c in reversed(list(wide_iter(s[:script_len]))):
if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
script_len -= 1
break
is_ascii = is_ascii and ord(c) <= MAX_ASCII
script_len += 1
if script != UNKNOWN_SCRIPT:
last_script = script
return (last_script, script_len, is_ascii)
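# Hedged usage sketch: the function reports the script of the leading run of
# characters, how many characters that run covers, and whether the run was pure
# ASCII. Script names come from get_chars_by_script(), so the outputs below are
# illustrative assumptions rather than guaranteed values:
#
#   get_string_script(u'Main Street')      # -> ('Latin', 11, True)
#   get_string_script(u'Тверская улица')   # -> ('Cyrillic', 14, False)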
LATIN_SCRIPT = 'Latin'
UNKNOWN_LANGUAGE = 'unk'
AMBIGUOUS_LANGUAGE = 'xxx'
def disambiguate_language_script(text, languages):
script_langs = {}
read_len = 0
while read_len < len(text):
script, script_len, is_ascii = get_string_script(text[read_len:])
if script != LATIN_SCRIPT:
script_valid = [l for l, d in languages if l in script_languages.get(script, [])]
script_langs[script] = set(script_valid)
if script_len == len(text) and len(script_valid) == 1:
return script_valid[0], script_langs
read_len += script_len
return UNKNOWN_LANGUAGE, script_langs
LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}
def has_non_latin_script(languages):
for lang, is_default in languages:
scripts = lang_scripts.get(lang, set())
if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
return True
return False
def disambiguate_language(text, languages, scripts_only=False):
text = safe_decode(text)
valid_languages = OrderedDict(languages)
language_script, script_langs = disambiguate_language_script(text, languages)
if language_script != UNKNOWN_LANGUAGE:
return language_script
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
tokens = normalized_tokens(text)
current_lang = None
possible_lang = None
seen_languages = set()
for t, c, l, data in street_types_gazetteer.filter(tokens):
if c == token_types.PHRASE:
valid = OrderedDict()
data = [safe_decode(d).split(u'|') for d in data]
potentials = set([l for l, d, i, c in data if l in valid_languages])
potential_defaults = set([l for l in potentials if valid_languages[l]])
phrase_len = sum((len(t_i[0]) for t_i in t))
for lang, dictionary, is_canonical, canonical in data:
is_canonical = int(is_canonical)
is_stopword = dictionary == 'stopword'
if lang not in valid_languages or (is_stopword and len(potentials) > 1):
continue
is_default = valid_languages[lang]
lang_valid = is_default or not seen_languages or lang in seen_languages
if lang_valid and phrase_len > 1 and ((is_canonical and not is_stopword) or (is_default and (len(potentials) == 1 or len(potential_defaults) == 1))):
valid[lang] = 1
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
return AMBIGUOUS_LANGUAGE
elif is_stopword and is_canonical and not is_default and lang in seen_languages:
valid[lang] = 1
elif not seen_languages and len(potentials) == 1 and phrase_len > 1:
possible_lang = lang if possible_lang is None or possible_lang == lang else None
if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
(not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))):
return AMBIGUOUS_LANGUAGE
valid = valid.keys()
if len(valid) == 1:
current_lang = valid[0]
else:
valid_default = [l for l in valid if valid_languages.get(l)]
if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang:
return AMBIGUOUS_LANGUAGE
elif len(valid_default) == 1:
current_lang = valid_default[0]
if any((current_lang not in langs for script, langs in script_langs.iteritems())):
return AMBIGUOUS_LANGUAGE
seen_languages.update(valid)
if current_lang is not None:
return current_lang
elif possible_lang is not None:
if not any((possible_lang not in langs for script, langs in script_langs.iteritems())):
return possible_lang
else:
return AMBIGUOUS_LANGUAGE
return UNKNOWN_LANGUAGE
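# A hedged sketch of the intended behaviour; the inputs and outputs below are
# assumptions for illustration and depend on the loaded gazetteers and on which
# languages OSM marks as default for the area:
#
#   languages = [('de', True), ('fr', True)]                 # e.g. a bilingual area
#   disambiguate_language(u'Bahnhofstrasse 10', languages)   # -> 'de' (assumed)
#   disambiguate_language(u'Rue de la Gare 10', languages)   # -> 'fr' (assumed)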

View File

@@ -0,0 +1,53 @@
import random
import bisect
from collections import OrderedDict
'''
Top languages on the Interwebs. Not a probability distribution
as it doesn't sum to 1 and websites can be in more than one
language. Reference:
https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
'''
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([
('en', 0.555),
('ru', 0.059),
('de', 0.058),
('ja', 0.05),
('es', 0.046),
('fr', 0.04),
('zh', 0.028),
('pt', 0.025),
('it', 0.019),
('pl', 0.017),
('tr', 0.015),
('nl', 0.013),
('fa', 0.009),
('ar', 0.008),
('ko', 0.007),
])
def cdf(probs):
total = float(sum(probs))
result = []
cumulative = 0.0
for w in probs:
cumulative += w
result.append(cumulative / total)
return result
MOST_COMMON_INTERNET_LANGUAGES = INTERNET_LANGUAGE_DISTRIBUTION.keys()
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
cdf=INTERNET_LANGUAGES_CDF):
assert len(keys) == len(cdf)
sample = random.random()
idx = bisect.bisect(cdf, sample)
return keys[idx]
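# Illustrative check (a sketch): with the CDF above, 'en' should be drawn a
# little over half the time; exact counts vary from run to run.
if __name__ == '__main__':
    from collections import Counter
    counts = Counter(sample_random_language() for _ in range(10000))
    print(counts.most_common(5))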

scripts/geodata/log.py Normal file
View File

@@ -0,0 +1,10 @@
import logging
import sys
def log_to_file(f, level=logging.INFO):
handler = logging.StreamHandler(f)
formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
handler.setFormatter(formatter)
logging.root.addHandler(handler)
logging.root.setLevel(level)

View File

View File

@@ -0,0 +1,5 @@
FLOAT_EPSILON = 1e-09
def isclose(a, b, rel_tol=FLOAT_EPSILON, abs_tol=0.0):
return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

View File

@@ -0,0 +1,42 @@
import bisect
import random
import sys
from geodata.math.floats import isclose, FLOAT_EPSILON
def weighted_choice(values, cdf):
"""Pick one of n values given a discrete cumulative distribution"""
assert values and cdf, 'values and probabilities cannot be empty/None'
assert len(values) == len(cdf), 'len(values) != len(probs)'
assert all(p >= 0.0 and p <= (1.0 + FLOAT_EPSILON) for p in cdf), 'Probabilities not valid: {}'.format(cdf)
x = random.random()
i = bisect.bisect(cdf, x)
return values[i]
def check_probability_distribution(probs):
cumulative = 0.0
for p in probs:
assert p >= 0.0, 'Probabilities cannot be negative'
assert p <= 1.0, 'Probabilities cannot be > 1.0'
cumulative += p
assert isclose(cumulative, 1.0), 'Probabilities must sum to 1: probs={}, cumulative={}'.format(probs, cumulative)
def cdf(probs):
total = 0.0
cumulative = [0.0] * len(probs)
for i, p in enumerate(probs):
total += p
cumulative[i] = total
return cumulative
def zipfian_distribution(n, b=1.0):
"""Distribution where the ith item's frequency is proportional to its rank"""
frequencies = [1. / (i ** b) for i in xrange(1, n + 1)]
total = sum(frequencies)
return [f / total for f in frequencies]
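# Illustrative usage (a sketch; the variant strings are made up): build a
# Zipfian distribution over a few spellings, convert it to a CDF and sample,
# so the first variant is chosen most often.
if __name__ == '__main__':
    variants = ['St', 'St.', 'Str', 'Street']
    probs = zipfian_distribution(len(variants))
    check_probability_distribution(probs)
    samples = [weighted_choice(variants, cdf(probs)) for _ in xrange(10)]
    print(samples)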

View File

@@ -0,0 +1,52 @@
import argparse
import logging
import os
import sys
import six
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.math.floats import isclose
from geodata.osm.extract import parse_osm
from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.encoding import safe_decode
class MetroStationReverseGeocoder(PlaceReverseGeocoder):
GEOHASH_PRECISION = 7
include_property_patterns = PlaceReverseGeocoder.include_property_patterns | set([
'operator',
'network',
'station',
])
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--osm-metro-stations-file',
help='Path to OSM metro stations file')
parser.add_argument('-p', '--precision',
type=int,
default=MetroStationReverseGeocoder.GEOHASH_PRECISION,
help='Geohash precision')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_metro_stations_file:
index = MetroStationReverseGeocoder.create_from_osm_file(args.osm_metro_stations_file, args.out_dir, precision=args.precision)
else:
parser.error('Must specify metro stations file')
index.save()

View File

View File

@@ -0,0 +1,102 @@
from geodata.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity
from collections import Counter
class NameDeduper(object):
'''
Base class for deduping geographic entity names e.g. for matching names
from different databases (concordances).
By default uses Soft TFIDF similarity (see geodata.names.similarity)
for non-ideographic names and Jaccard similarity with word frequencies
for ideographic names.
See class attributes for options.
'''
stopwords = set()
'''Set of words which should not be considered in similarity'''
discriminative_words = set()
'''Set of words which break similarity e.g. North, Heights'''
discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
'''Set of categories which, if not contained in both sets, break similarity'''
content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
'''Set of categories representing content tokens (default setting ignores punctuation)'''
replacements = {}
'''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''
dupe_threshold = 0.9
'''Similarity threshold above which entities are considered dupes'''
ignore_parentheticals = True
'''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
@classmethod
def tokenize(cls, s):
return normalized_tokens(s)
@classmethod
def content_tokens(cls, s):
tokens = cls.tokenize(s)
if cls.ignore_parentheticals:
tokens = remove_parens(tokens)
return [(cls.replacements.get(t, t), c)
for t, c in tokens
if c in cls.content_categories and
t not in cls.stopwords]
@classmethod
def possible_match(cls, tokens1, tokens2):
if not cls.discriminative_categories and not cls.discriminative_words:
return True
intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
invalid = any((True for t, c in tokens1 + tokens2
if t not in intersection and
(c in cls.discriminative_categories or t in cls.discriminative_words)
))
return not invalid
@classmethod
def compare_ideographs(cls, s1, s2):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
# Many Han/Hangul characters are common, shouldn't use IDF
return jaccard_similarity(tokens1_only, tokens2_only)
@classmethod
def compare(cls, s1, s2, idf):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
# Test exact equality, also handles things like Cabbage Town == Cabbagetown
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)
@classmethod
def is_dupe(cls, sim):
return sim >= cls.dupe_threshold
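# A hedged sketch of how this base class is meant to be specialized; the
# attribute values and names below are invented for illustration:
#
#   class StreetNameDeduper(NameDeduper):
#       stopwords = set([u'the'])
#       replacements = {u'saint': u'st'}
#
#   # idf is an IDFIndex built over the names being matched
#   sim = StreetNameDeduper.compare(u'Saint Marks Ave', u'St Marks Ave', idf)
#   StreetNameDeduper.is_dupe(sim)  # True only if sim >= dupe_threshold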

View File

@@ -0,0 +1,119 @@
import os
import re
import six
import yaml
from geodata.encoding import safe_decode
this_dir = os.path.realpath(os.path.dirname(__file__))
AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'names', 'languages')
class NameAffixes(object):
def __init__(self, config_dir=AFFIX_CONFIG_DIR):
self.config_dir = config_dir
self.language_prefixes = {}
self.language_suffixes = {}
self.language_prefix_regexes = {}
self.language_suffix_regexes = {}
self.language_prefix_sim_only_regexes = {}
self.language_suffix_sim_only_regexes = {}
for filename in os.listdir(config_dir):
if not filename.endswith('.yaml'):
continue
lang = filename.rsplit('.yaml')[0]
conf = yaml.load(open(os.path.join(config_dir, filename)))
self.add_affixes(lang, conf)
for country, country_conf in six.iteritems(conf.get('countries', {})):
country_lang = (country, lang)
self.add_affixes(country_lang, country_conf)
def add_affixes(self, lang, *confs):
prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
whitespace_phrase = six.u('[ \-]')
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
if all_prefixes:
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
if all_suffixes:
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
if sim_only_prefixes:
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
if sim_only_suffixes:
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def replace_prefixes(self, name, lang, country=None, sim_only=False):
name = safe_decode(name).strip()
if not sim_only or lang not in self.language_prefix_sim_only_regexes:
d = self.language_prefix_regexes
else:
d = self.language_prefix_sim_only_regexes
regex = None
if country is not None:
regex = d.get((country, lang))
if regex:
name = regex.sub(six.u(''), name)
regex = d.get(lang)
if not regex:
return name
return regex.sub(six.u(''), name)
def replace_suffixes(self, name, lang, country=None, sim_only=False):
name = safe_decode(name).strip()
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
d = self.language_suffix_regexes
else:
d = self.language_suffix_sim_only_regexes
regex = None
if country is not None:
regex = d.get((country, lang))
if regex:
name = regex.sub(six.u(''), name)
regex = d.get(lang)
if not regex:
return name
return regex.sub(six.u(''), name)
def replace_affixes(self, name, lang, country=None, sim_only=False):
return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)
name_affixes = NameAffixes()
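# Illustrative calls (what actually gets stripped depends entirely on the
# per-language YAML configs under resources/boundaries/names/languages, so the
# outputs shown are assumptions):
#
#   name_affixes.replace_suffixes(u'Friedrichshain-Kreuzberg Bezirk', 'de')
#   # -> u'Friedrichshain-Kreuzberg', assuming 'bezirk' is listed as a German suffix
#   name_affixes.replace_prefixes(u'Lake Rotorua', 'en', sim_only=True)
#   # -> u'Rotorua', only if 'lake' is configured as a similarity-only prefix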

View File

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import Levenshtein
from collections import OrderedDict
def ordered_word_count(tokens):
counts = OrderedDict()
for k in tokens:
counts[k] = counts.get(k, 0) + 1
return counts
def soft_tfidf_similarity(tokens1, tokens2, idf,
sim_func=Levenshtein.jaro_winkler, theta=0.95,
common_word_threshold=100):
'''
Soft TFIDF is a hybrid distance function using both global statistics
(inverse document frequency) and local similarity (Jaro-Winkler).
For each token t1 in the first string, find the token t2 which is most
similar to t1 in terms of the local distance function.
The SoftTFIDF similarity is the dot product of the max token similarities
and the cosine similarity of the TF-IDF vectors for all tokens where
the max similarity is >= a given threshold theta.
sim_func should return a number in the range [0, 1] and theta
should be in the same range, i.e. this would _not_ work for a metric like
basic Levenshtein or Damerau-Levenshtein distance where we'd want the
value to be below the threshold. Those metrics can be transformed into
a [0, 1] similarity first.
@param tokens1: normalized tokens of string 1 (list of strings only)
@param tokens2: normalized tokens of string 2 (list of strings only)
@param idf: IDFIndex from geodata.statistics.tf_idf
@param sim_func: similarity function which takes 2 strings and returns
a number between 0 and 1
@param theta: token-level threshold on sim_func's return value at
which point two tokens are considered "close"
Reference:
https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
'''
token1_counts = ordered_word_count(tokens1)
token2_counts = ordered_word_count(tokens2)
tfidf1 = idf.tfidf_vector(token1_counts)
tfidf2 = idf.tfidf_vector(token2_counts)
total_sim = 0.0
t1_len = len(token1_counts)
t2_len = len(token2_counts)
if t2_len < t1_len:
token1_counts, token2_counts = token2_counts, token1_counts
tfidf1, tfidf2 = tfidf2, tfidf1
for i, t1 in enumerate(token1_counts):
sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
if sim >= theta:
total_sim += sim * tfidf1[i] * tfidf2[j]
return total_sim
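# A worked sketch of the loop above with invented tokens and IDF weights:
# comparing [u'st', u'marks', u'ave'] against [u'saint', u'marks', u'avenue'],
# each left token keeps only its best Jaro-Winkler match on the right.
# u'marks' ~ u'marks' scores 1.0 and passes theta, while u'ave' ~ u'avenue'
# scores roughly 0.88 and is dropped at the default theta of 0.95, so only the
# surviving pair contributes sim * tfidf1[i] * tfidf2[j] to the total.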
def jaccard_similarity(tokens1, tokens2):
'''
Traditionally Jaccard similarity is defined for two sets:
Jaccard(A, B) = |A ∩ B| / |A ∪ B|
Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
would be 1.0, which is not ideal for entity name matching.
In this implementation the cardinality of the set intersections/unions
are weighted by term frequencies so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
'''
token1_counts = ordered_word_count(tokens1)
token2_counts = ordered_word_count(tokens2)
intersection = sum((min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts))
return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
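# Checking the weighted-count example from the docstring: the intersection of
# ['a', 'a', 'b'] and ['a', 'b'] counts min(2, 1) + min(1, 1) = 2 and the union
# counts 3 + 2 - 2 = 3, giving 2/3.
if __name__ == '__main__':
    print(jaccard_similarity([u'a', u'a', u'b'], [u'a', u'b']))  # 0.666...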

View File

@@ -0,0 +1,622 @@
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import logging
import operator
import os
import re
import six
import subprocess
import sys
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_formatting.formatter import AddressFormatter
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper
from geodata.osm.admin_boundaries import OSMNeighborhoodPolygonReader
from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex
class NeighborhoodDeduper(NameDeduper):
# Lossless conversions only
replacements = {
u'saint': u'st',
u'and': u'&',
u'〇': u'0',
u'一': u'1',
u'二': u'2',
u'三': u'3',
u'四': u'4',
u'五': u'5',
u'六': u'6',
u'七': u'7',
u'八': u'8',
u'九': u'9',
u'十': u'10',
}
discriminative_words = set([
# Han numbers
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
# Roman numerals
u'i', u'ii',
u'iii', u'iv',
u'v', u'vi',
u'vii', u'viii',
u'ix', u'x',
u'xi', u'xii',
u'xiii', u'xiv',
u'xv', u'xvi',
u'xvii', u'xviii',
u'xix', u'xx',
# English directionals
u'north', u'south',
u'east', u'west',
u'northeast', u'northwest',
u'southeast', u'southwest',
# Spanish, Portuguese and Italian directionals
u'norte', u'nord', u'sur', u'sul', u'sud',
u'est', u'este', u'leste', u'oeste', u'ovest',
# New in various languages
u'new',
u'nova',
u'novo',
u'nuevo',
u'nueva',
u'nuovo',
u'nuova',
# Qualifiers
u'heights',
u'hills',
u'upper', u'lower',
u'little', u'great',
u'park',
u'parque',
u'village',
])
stopwords = set([
u'cp',
u'de',
u'la',
u'urbanizacion',
u'do',
u'da',
u'dos',
u'del',
u'community',
u'bairro',
u'barrio',
u'le',
u'el',
u'mah',
u'раион',
u'vila',
u'villa',
u'kampung',
u'ahupua`a',
])
class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = 0
SCRATCH_DIR = '/tmp'
# Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South"
NEIGHBORHOODS_REPO = 'https://github.com/codeforamerica/click_that_hood'
config_path = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'neighborhoods', 'click_that_hood.yaml')
config = yaml.load(open(config_path))
@classmethod
def clone_repo(cls, path):
subprocess.check_call(['rm', '-rf', path])
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
@classmethod
def create_neighborhoods_index(cls):
scratch_dir = cls.SCRATCH_DIR
repo_path = os.path.join(scratch_dir, 'click_that_hood')
cls.clone_repo(repo_path)
data_path = os.path.join(repo_path, 'public', 'data')
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
ensure_dir(neighborhoods_dir)
index = cls(save_dir=neighborhoods_dir)
for c in cls.config['files']:
filename = c['filename']
component = c['component']
path = os.path.join(data_path, filename)
features = json.load(open(path))['features']
for f in features:
f['properties']['component'] = component
try:
index.add_geojson_like_file(features)
except ValueError:
continue
return index
class OSMNeighborhoodReverseGeocoder(OSMReverseGeocoder):
persistent_polygons = False
cache_size = 10000
simplify_polygons = False
polygon_reader = OSMNeighborhoodPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['postal_code'])
cache_size = 0
SCRATCH_DIR = '/tmp'
@classmethod
def create_neighborhoods_index(cls, osm_neighborhoods_file):
scratch_dir = cls.SCRATCH_DIR
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
ensure_dir(neighborhoods_dir)
return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir)
class NeighborhoodReverseGeocoder(RTreePolygonIndex):
'''
Neighborhoods are very important in cities like NYC, SF, Chicago, London
and many others. We want the address parser to be trained with addresses
that sufficiently capture variations in address patterns, including
neighborhoods. Quattroshapes neighborhood data (in the US at least)
is not great in terms of names, mostly because GeoPlanet has so many
incorrect names. The neighborhoods project, also known as ClickThatHood
has very accurate polygons with correct names, but only for a handful
of cities. OSM usually lists neighborhoods and some other local admin
areas like boroughs as points rather than polygons.
This index merges all of the above data sets in prioritized order
(OSM > ClickThatHood > WhosOnFirst) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source, but each
source has at least a "name" key, which in practice is what we care about.
Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
'''
PRIORITIES_FILENAME = 'priorities.json'
DUPE_THRESHOLD = 0.9
persistent_polygons = True
cache_size = 100000
source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'wof': 4, # Replacement of Quattroshapes
}
level_priorities = {
'neighborhood': 0,
'local_admin': 1,
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
]
quattroshapes_city_district_patterns = [
six.u('Praha [\d]+'),
]
quattroshapes_city_district_regex = re.compile('|'.join([six.u('^\s*{}\s*$').format(p) for p in quattroshapes_city_district_patterns]), re.I | re.U)
@classmethod
def count_words(cls, s):
doc = defaultdict(int)
for t, c in NeighborhoodDeduper.content_tokens(s):
doc[t] += 1
return doc
@classmethod
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
and their dependencies, create an R-tree index for coarse-grained
reverse geocoding.
Note: the input file is expected to have been created using
osmfilter. Use fetch_osm_address_data.sh for planet or copy the
admin borders commands if using other geometries.
'''
index = cls(save_dir=output_dir)
logger = logging.getLogger('neighborhoods')
logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating WhosOnFirst neighborhoods')
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
osm_admin_rtree.cache_size = 1000
logger.info('Creating IDF index')
idf = IDFIndex()
char_scripts = get_chars_by_script()
for idx in (cth, wof, osmn):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
if name is not None:
doc = cls.count_words(name)
idf.update(doc)
for key, attrs, deps in parse_osm(filename):
for k, v in six.iteritems(attrs):
if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
doc = cls.count_words(v)
idf.update(doc)
for i in six.moves.xrange(osmn.i):
props = osmn.get_properties(i)
poly = osmn.get_polygon(i)
props['source'] = 'osm'
props['component'] = AddressFormatter.SUBURB
props['polygon_type'] = 'neighborhood'
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
wof.matched = [False] * wof.i
cth.matched = [False] * cth.i
logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0
for element_id, attrs, deps in parse_osm(filename):
try:
lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
except ValueError:
continue
osm_name = attrs.get('name')
if not osm_name:
continue
id_type, element_id = element_id.split(':')
element_id = long(element_id)
attrs['type'] = id_type
attrs['id'] = element_id
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
country, candidate_languages = country_rtree.country_and_languages(lat, lon)
component_name = None
component_name = osm_address_components.component_from_properties(country, attrs)
ranks = []
osm_names = []
for key in OSM_NAME_TAGS:
name = attrs.get(key)
if name:
osm_names.append(name)
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates:
max_sim = 0.0
arg_max = None
normalized_wof_names = {}
for osm_name in osm_names:
contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
for c in safe_decode(osm_name)))
for i in candidates:
props = idx.get_properties(i)
name = normalized_wof_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_wof_names[i] = name
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue
if not contains_ideographs:
sim = NeighborhoodDeduper.compare(osm_name, name, idf)
else:
# Many Han/Hangul characters are common, shouldn't use IDF
sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
if sim > max_sim:
max_sim = sim
poly = idx.get_polygon(i)
arg_max = (max_sim, props, poly.context, idx, i)
if arg_max:
ranks.append(arg_max)
ranks.sort(key=operator.itemgetter(0), reverse=True)
if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
score, props, poly, idx, i = ranks[0]
existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)
skip_node = False
for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
for poly_index, osm_props in enumerate(boundaries):
containing_component = None
name = osm_props.get('name')
# Only exact name matches here since we're comparing OSM to OSM
if name and name.lower() != attrs.get('name', '').lower():
continue
if boundaries is existing_neighborhood_boundaries:
containing_component = AddressFormatter.SUBURB
skip_node = True
break
else:
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]
containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)
if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
skip_node = True
break
if skip_node:
break
# Skip this element
if skip_node:
continue
if idx is cth:
if props['component'] == AddressFormatter.SUBURB:
attrs['polygon_type'] = 'neighborhood'
elif props['component'] == AddressFormatter.CITY_DISTRICT:
attrs['polygon_type'] = 'local_admin'
else:
continue
source = 'osm_cth'
else:
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
source = 'osm_wof'
if level == 'neighborhood':
attrs['polygon_type'] = 'neighborhood'
else:
attrs['polygon_type'] = 'local_admin'
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
attrs['component'] = component
attrs['source'] = source
index.index_polygon(poly)
index.add_polygon(poly, attrs)
idx.matched[i] = True
num_polys += 1
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))
for idx, source in ((cth, 'clickthathood'), (wof, 'wof')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
if idx.matched[i]:
continue
props['source'] = source
if idx is cth:
component = props['component']
if component == AddressFormatter.SUBURB:
props['polygon_type'] = 'neighborhood'
elif component == AddressFormatter.CITY_DISTRICT:
props['polygon_type'] = 'local_admin'
else:
continue
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
props['name'] = name
if cls.quattroshapes_city_district_regex.match(name):
component = AddressFormatter.CITY_DISTRICT
props['component'] = component
props['polygon_type'] = 'neighborhood'
else:
# We don't actually care about local admin polygons unless they match OSM
continue
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
return index
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append((self.level_priorities[properties['polygon_type']], self.source_priorities[properties['source']]))
def load_polygon_properties(self, d):
self.priorities = [tuple(p) for p in json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))]
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def priority(self, i):
return self.priorities[i]
def get_candidate_polygons(self, lat, lon):
candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.priority)
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = None
NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
@classmethod
def is_valid_neighbourhood(cls, geojson):
validity = not geojson["properties"].get(cls.SUPERSEDED)
for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
validity = validity and bool(geojson["properties"].get(field))
return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
@classmethod
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
index = cls(save_dir=output_dir, index_filename=index_filename)
for root, dirnames, filenames in os.walk(wof_dir):
for fname in fnmatch.filter(filenames, "*.geojson"):
with open(os.path.join(root, fname)) as f:
geojson = json.load(f)
if cls.is_valid_neighbourhood(geojson):
properties = {
"name": safe_decode(geojson["properties"].get(cls.NAME)),
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
}
poly_type = geojson['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(geojson['properties']))
elif poly_type == 'MultiPolygon':
polys = []
for coords in geojson['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)
multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(geojson['properties']))
return index
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--wof-dir',
help='Path to WhosOnFirst dir')
parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
parser.add_argument('-c', '--country-rtree-dir',
help='Path to country rtree dir')
parser.add_argument('-b', '--osm-neighborhood-borders-file',
help='Path to OSM neighborhood borders file (with dependencies, .osm format)')
parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file,
args.wof_dir,
args.country_rtree_dir,
args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir
)
else:
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
index.save()

View File

View File

@@ -0,0 +1,219 @@
import os
import sys
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
class InvalidNumexRuleException(Exception):
pass
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'numex')
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
GENDER_MASCULINE = 'GENDER_MASCULINE'
GENDER_FEMININE = 'GENDER_FEMININE'
GENDER_NEUTER = 'GENDER_NEUTER'
GENDER_NONE = 'GENDER_NONE'
gender_map = {
'm': GENDER_MASCULINE,
'f': GENDER_FEMININE,
'n': GENDER_NEUTER,
None: GENDER_NONE,
}
CATEGORY_PLURAL = 'CATEGORY_PLURAL'
CATEGORY_DEFAULT = 'CATEGORY_DEFAULT'
valid_numex_keys = set(['name', 'value', 'type', 'left', 'right', 'gender', 'category', 'radix',
'multiply_gte', 'exact_multiple_only', 'left_separator', 'right_separator'])
valid_ordinal_keys = set(['suffixes', 'gender', 'category'])
category_map = {
'plural': CATEGORY_PLURAL,
None: CATEGORY_DEFAULT
}
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
left_context_map = {
'add': LEFT_CONTEXT_ADD,
'multiply': LEFT_CONTEXT_MULTIPLY,
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
None: LEFT_CONTEXT_NONE,
}
RIGHT_CONTEXT_MULTIPLY = 'NUMEX_RIGHT_CONTEXT_MULTIPLY'
RIGHT_CONTEXT_ADD = 'NUMEX_RIGHT_CONTEXT_ADD'
RIGHT_CONTEXT_NONE = 'NUMEX_RIGHT_CONTEXT_NONE'
right_context_map = {
'add': RIGHT_CONTEXT_ADD,
'multiply': RIGHT_CONTEXT_MULTIPLY,
None: RIGHT_CONTEXT_NONE,
}
CARDINAL = 'NUMEX_CARDINAL_RULE'
ORDINAL = 'NUMEX_ORDINAL_RULE'
ORDINAL_INDICATOR = 'NUMEX_ORDINAL_INDICATOR_RULE'
rule_type_map = {
'cardinal': CARDINAL,
'ordinal': ORDINAL,
'ordinal_indicator': ORDINAL_INDICATOR,
}
numex_key_template = u'"{key}"'
numex_rule_template = u'{{{left_context_type}, {right_context_type}, {rule_type}, {gender}, {category}, {radix}, {value}LL}}'
stopword_rule = u'NUMEX_STOPWORD_RULE'
ordinal_indicator_template = u'{{"{key}", {gender}, {category}, "{value}"}}'
stopwords_template = u'"{word}"'
language_template = u'{{"{language}", {whole_words_only}, {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
numex_rules_data_template = u'''
char *numex_keys[] = {{
{numex_keys}
}};
numex_rule_t numex_rules[] = {{
{numex_rules}
}};
ordinal_indicator_t ordinal_indicator_rules[] = {{
{ordinal_indicator_rules}
}};
numex_language_source_t numex_languages[] = {{
{languages}
}};
'''
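# For reference, one rendered entry from each template. The choice of an
# English cardinal "twenty" with a right-add context is hypothetical; real
# entries come from the YAML files parsed below.
#
#   numex_key_template.format(key='twenty')
#   # -> u'"twenty"'
#   numex_rule_template.format(left_context_type=LEFT_CONTEXT_NONE,
#                              right_context_type=RIGHT_CONTEXT_ADD,
#                              rule_type=CARDINAL, gender=GENDER_NONE,
#                              category=CATEGORY_DEFAULT, radix=10, value=20)
#   # -> u'{NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE,
#   #      GENDER_NONE, CATEGORY_DEFAULT, 10, 20LL}'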
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
all_keys = []
all_rules = []
all_ordinal_indicators = []
all_stopwords = []
all_languages = []
out = open(outfile, 'w')
for filename in os.listdir(dirname):
path = os.path.join(dirname, filename)
if not os.path.isfile(path) or not filename.endswith('.yaml'):
continue
language = filename.split('.yaml', 1)[0]
data = yaml.load(open(path))
whole_words_only = data.get('whole_words_only', False)
rules = data.get('rules', [])
rule_index = len(all_rules)
for rule in rules:
invalid_keys = set(rule.keys()) - valid_numex_keys
if invalid_keys:
raise InvalidNumexRuleException(u'Invalid keys: ({}) for language {}, rule: {}'.format(u','.join(invalid_keys), language, rule))
gender = gender_map[rule.get('gender')]
rule_type = rule_type_map[rule['type']]
key = rule['name']
value = rule['value']
radix = rule.get('radix', 10)
rule_category = rule.get('category')
category = category_map.get(rule_category)
if category is None:
continue
left_context_type = left_context_map[rule.get('left')]
right_context_type = right_context_map[rule.get('right')]
all_keys.append(unicode(numex_key_template.format(key=key)))
all_rules.append(unicode(numex_rule_template.format(
language=language,
rule_type=rule_type,
gender=gender,
category=category,
left_context_type=left_context_type,
right_context_type=right_context_type,
value=value,
radix=radix
)))
ordinal_indicator_index = len(all_ordinal_indicators)
ordinal_indicators = data.get('ordinal_indicators', [])
num_ordinal_indicators = 0
for rule in ordinal_indicators:
gender = gender_map[rule.get('gender')]
category = category_map[rule.get('category')]
invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
if invalid_ordinal_keys:
raise InvalidNumexRuleException(u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'.format(u','.join(invalid_ordinal_keys), language, rule))
for key, suffixes in rule['suffixes'].iteritems():
for suffix in suffixes:
all_ordinal_indicators.append(unicode(ordinal_indicator_template.format(
key=key,
value=suffix,
gender=gender,
category=category
)))
num_ordinal_indicators += len(suffixes)
stopwords = data.get('stopwords', [])
stopword_index = len(all_stopwords)
num_stopwords = len(stopwords)
for stopword in stopwords:
all_keys.append(numex_key_template.format(key=unicode(stopword)))
all_rules.append(stopword_rule)
num_rules = len(rules) + len(stopwords)
all_languages.append(unicode(language_template.format(
language=language,
whole_words_only=int(whole_words_only),
rule_index=rule_index,
num_rules=num_rules,
ordinal_indicator_index=ordinal_indicator_index,
num_ordinal_indicators=num_ordinal_indicators
)))
out.write(safe_encode(numex_rules_data_template.format(
numex_keys=u''',
'''.join(all_keys),
numex_rules=u''',
'''.join(all_rules),
ordinal_indicator_rules=u''',
'''.join(all_ordinal_indicators),
stopwords=u''',
'''.join(all_stopwords),
languages=u''',
'''.join(all_languages),
)))
out.close()
if __name__ == '__main__':
parse_numex_rules(*sys.argv[1:])

View File

@@ -0,0 +1,108 @@
import bisect
import math
import os
import operator
import random
import six
import sys
import yaml
from collections import defaultdict
from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter
from geodata.encoding import safe_encode, safe_decode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalSuffixTrie(PhraseFilter):
def __init__(self, ordinal_rules):
self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
self.configured = True
def search_substring(self, s):
if len(s) == 0:
return None, 0
for i in xrange(len(s) + 1):
if not self.trie.has_keys_with_prefix(s[:i]):
i -= 1
break
if i > 0:
return (self.trie.get(s[:i]), i)
else:
return None, 0
def search_suffix(self, token):
suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
if suffix_search:
return suffix_search[0].split('|')
else:
return None
class OrdinalExpressions(object):
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_ones = {}
self.ordinal_rules = {}
self.ordinal_suffix_rules = {}
for filename in os.listdir(base_dir):
if filename.endswith('.yaml'):
lang = filename.split('.yaml')[0]
f = open(os.path.join(base_dir, filename))
data = yaml.load(f)
rules = data.get('rules')
if rules is not None and hasattr(rules, '__getslice__'):
cardinals = []
ordinals = defaultdict(list)
for rule in rules:
name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(name)
else:
cardinals.append(rule)
if value == 1:
self.cardinal_rules_ones[(lang, gender, category)] = name
self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
ordinal_indicators = data.get('ordinal_indicators')
if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
for rule_set in ordinal_indicators:
gender = rule_set.get('gender', None)
category = rule_set.get('category', None)
self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])
def get_suffixes(self, num, lang, gender=None, category=None):
trie = self.ordinal_suffix_rules.get((lang, gender, category))
if not trie:
return None
return trie.search_suffix(str(num))
def get_suffix(self, num, lang, gender=None, category=None):
suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
if not suffixes:
return None
return random.choice(suffixes)
def suffixed_number(self, num, lang, gender=None, category=None):
suffix = self.get_suffix(num, lang, gender=gender, category=category)
if not suffix:
return None
return six.u('{}{}').format(safe_decode(num), safe_decode(suffix))
ordinal_expressions = OrdinalExpressions()
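# Illustrative calls; the suffixes come from the per-language YAML rules under
# resources/numex, so the outputs shown are assumptions rather than guarantees:
#
#   ordinal_expressions.get_suffixes(2, 'en')     # -> e.g. [u'nd']
#   ordinal_expressions.suffixed_number(2, 'en')  # -> e.g. u'2nd'
#   ordinal_expressions.suffixed_number(1, 'es', gender='f')  # -> e.g. u'1ª'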

View File

@@ -0,0 +1,449 @@
import bisect
import math
import os
import random
import six
import yaml
from collections import defaultdict
from geodata.numbers.numex import NUMEX_DATA_DIR
class NumericExpressions(object):
default_separator = ' '
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_sorted = {}
self.cardinal_rules_ones = defaultdict(dict)
self.cardinal_rules_ones_sorted = {}
self.default_separators = {}
self.ordinal_rules = {}
self.ordinal_suffix_rules = {}
for filename in os.listdir(base_dir):
if filename.endswith('.yaml'):
lang = filename.split('.yaml')[0]
f = open(os.path.join(base_dir, filename))
data = yaml.load(f)
default_separator = data.get('default_separator')
if default_separator is not None:
self.default_separators[lang] = default_separator
rules = data.get('rules')
if rules is not None and hasattr(rules, '__getslice__'):
cardinals = defaultdict(list)
ordinals = defaultdict(list)
for rule in rules:
name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(rule)
else:
cardinals[(value, gender, category)].append(rule)
if value == 1 and 'multiply_gte' in rule:
self.cardinal_rules_ones[lang][rule['multiply_gte']] = rule
self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
self.cardinal_rules_sorted[lang] = sorted(set([v for v, g, c in cardinals]))
self.cardinal_rules_ones_sorted[lang] = sorted(self.cardinal_rules_ones[lang].keys())
self.cardinal_rules_ones = dict(self.cardinal_rules_ones)
def spellout_cardinal(self, num, lang, gender=None, category=None, random_choice_cardinals=False):
num = int(num)
remainder = 0
if lang not in self.cardinal_rules:
return None
rules = self.cardinal_rules.get(lang)
cardinals = self.cardinal_rules_sorted.get(lang)
if not rules or not cardinals:
return None
default_separator = self.default_separators.get(lang, self.default_separator)
if num == 0:
cardinal = rules.get((num, gender, category))
if cardinal:
if not random_choice_cardinals:
cardinal = cardinal[0]
else:
cardinal = random.choice(cardinal)
return cardinal['name']
else:
return None
cardinal_part = []
last_rule = {}
left_multiply_rules = []
while num:
i = bisect.bisect_left(cardinals, num)
if i > len(cardinals) - 1:
return None
if i > 0 and cardinals[i] > num:
val = cardinals[i - 1]
else:
val = cardinals[i]
multiple = num // val
if val == num:
cardinal = rules.get((num, gender, category))
else:
cardinal = rules.get((val, None, None), [])
multiple_rule = None
if multiple > 1:
multiple_val = rules.get((multiple, None, None))
if multiple_val:
if not random_choice_cardinals:
multiple_rule = multiple_val[0]
else:
multiple_rule = random.choice(multiple_val)
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
ones_rules = self.cardinal_rules_ones_sorted[lang]
j = bisect.bisect_right(ones_rules, val)
if j > 0 and ones_rules[j - 1] <= num:
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
use_multiple = multiple > 1
is_left_multiply = False
did_left_multiply = False
if not use_multiple:
rule = None
if cardinal and not random_choice_cardinals:
rule = cardinal[0]
elif cardinal:
rule = random.choice(cardinal)
else:
for rule in cardinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply:
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
break
else:
rule = None
if rule is not None:
left_add = last_rule.get('left') == 'add'
right_add = last_rule.get('right') == 'add'
if multiple_rule:
if right_add and cardinal_part:
cardinal_part.append(last_rule.get('left_separator', default_separator))
cardinal_part.append(multiple_rule['name'])
cardinal_part.append(rule.get('left_separator', default_separator))
if right_add:
if not multiple_rule and cardinal_part:
right_separator = last_rule.get('right_separator', default_separator)
cardinal_part.append(right_separator)
cardinal_part.append(rule['name'])
elif left_add and cardinal_part:
last = cardinal_part.pop()
cardinal_part.append(rule['name'])
left_separator = last_rule.get('left_separator', default_separator)
cardinal_part.append(left_separator)
cardinal_part.append(last)
elif not left_add and not right_add:
cardinal_part.append(rule['name'])
last_rule = rule
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
left_multiply_rule = left_multiply_rules.pop()
left_separator = left_multiply_rule.get('left_separator', default_separator)
cardinal_part.append(left_separator)
cardinal_part.append(left_multiply_rule['name'])
did_left_multiply = True
last_rule = left_multiply_rule
if not is_left_multiply and not did_left_multiply:
num -= (multiple * val)
elif not did_left_multiply:
remainder = num % val
num /= val
else:
num = remainder
did_left_multiply = False
return six.u('').join(cardinal_part)
def roman_numeral(self, num):
numeral = self.spellout_cardinal(num, 'la')
if numeral is None:
return None
return numeral.upper()
def spellout_ordinal(self, num, lang, gender=None, category=None,
random_choice_cardinals=False, random_choice_ordinals=False):
num = int(num)
remainder = 0
if lang not in self.cardinal_rules:
return None
rules = self.ordinal_rules.get(lang)
cardinal_rules = self.cardinal_rules.get(lang)
cardinals = self.cardinal_rules_sorted.get(lang)
if not rules or not cardinal_rules or not cardinals:
return None
default_separator = self.default_separators.get(lang, self.default_separator)
expression = []
last_rule = {}
left_multiply_rules = []
if num == 0 or (num, gender, category) in rules:
ordinals = rules.get((num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
return ordinal['name']
else:
return None
while num:
i = bisect.bisect_left(cardinals, num)
if i > len(cardinals) - 1:
return None
if i > 0 and cardinals[i] > num:
val = cardinals[i - 1]
else:
val = cardinals[i]
if val == num and not remainder:
if last_rule.get('right') == 'add':
ordinals = rules.get((num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
right_separator = last_rule.get('right_separator', default_separator)
return right_separator.join([six.u('').join(expression), ordinal['name']])
else:
return None
elif last_rule.get('left') == 'add':
last_num = last_rule['value']
ordinals = rules.get((last_num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
last_rule = ordinal
expression.pop()
cardinals = cardinal_rules.get((num, None, None))
if cardinals:
if not random_choice_cardinals:
rule = cardinals[0]
else:
rule = random.choice(cardinals)
expression.append(rule['name'])
else:
return None
last = ordinal['name']
left_separator = last_rule.get('left_separator', default_separator)
return left_separator.join([six.u('').join(expression), ordinal['name']])
else:
return None
else:
return None
else:
ordinal = rules.get((val, None, None), [])
cardinal = cardinal_rules.get((val, None, None), [])
multiple = num // val
multiple_rule = None
if multiple > 1:
multiple_val = cardinal_rules.get((multiple, None, None))
if multiple_val:
if not random_choice_cardinals:
multiple_rule = multiple_val[0]
else:
multiple_rule = random.choice(multiple_val)
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
ones_rules = self.cardinal_rules_ones_sorted[lang]
j = bisect.bisect_right(ones_rules, val)
if j > 0 and ones_rules[j - 1] <= num:
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
use_multiple = multiple > 1
is_left_multiply = False
did_left_multiply = False
if not use_multiple:
rule = None
if ordinal and not remainder:
for rule in ordinal:
if rule.get('right') == 'add':
break
else:
rule = None
if not rule and cardinal and not random_choice_cardinals:
rule = cardinal[0]
elif not rule and cardinal:
rule = random.choice(cardinal)
else:
rule = None
have_ordinal = False
if ordinal:
for rule in ordinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply and rule.get('right') == 'add':
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
have_ordinal = True
break
else:
rule = None
if not have_ordinal:
for rule in cardinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply:
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
break
else:
rule = None
if rule is not None:
left_add = last_rule.get('left') == 'add'
right_add = last_rule.get('right') == 'add'
if multiple_rule:
if right_add and expression:
expression.append(last_rule.get('left_separator', default_separator))
expression.append(multiple_rule['name'])
expression.append(rule.get('left_separator', default_separator))
if right_add:
if not multiple_rule and expression:
right_separator = last_rule.get('right_separator', default_separator)
expression.append(right_separator)
expression.append(rule['name'])
elif left_add and expression:
last = expression.pop()
expression.append(rule['name'])
left_separator = last_rule.get('left_separator', default_separator)
expression.append(left_separator)
expression.append(last)
elif not left_add and not right_add:
expression.append(rule['name'])
last_rule = rule
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
left_multiply_rule = left_multiply_rules.pop()
left_separator = left_multiply_rule.get('left_separator', default_separator)
expression.append(left_separator)
expression.append(left_multiply_rule['name'])
did_left_multiply = True
last_rule = left_multiply_rule
if not is_left_multiply and not did_left_multiply:
num -= (multiple * val)
elif not did_left_multiply:
remainder = num % val
num /= val
else:
num = remainder
remainder = 0
did_left_multiply = False
def spellout_cardinal_hundreds(self, num, lang, gender=None, category=None, splitter=six.u(' ')):
if num % 100 >= 10:
first_hundred = self.spellout_cardinal(num % 100, lang, gender=gender, category=category)
elif num % 100 == 0:
rules = self.cardinal_rules.get(lang)
if not rules:
return None
cardinals = rules.get((100, gender, category))
if not cardinals:
return None
for rule in cardinals:
if rule.get('left') == 'multiply' and not rule.get('exact_multiple_only'):
break
else:
rule = None
if not rule:
return None
first_hundred = rule['name']
else:
rules = self.cardinal_rules.get(lang)
if not rules:
return None
ones_place = num % 10
zero_rules = rules.get((0, gender, category))
if not zero_rules:
return None
ones_place_rules = rules.get((ones_place, gender, category))
if not ones_place_rules:
return None
zero_rule = random.choice(zero_rules)
ones_rule = random.choice(ones_place_rules)
first_hundred = splitter.join([zero_rule['name'], ones_rule['name']])
if not first_hundred:
return None
parts = [first_hundred]
for i in xrange(1, int(math.ceil(math.log(num, 100)))):
part = self.spellout_cardinal(num / 100 ** i, lang, gender=gender, category=category)
if not part:
return None
parts.append(part)
return splitter.join(reversed(parts))
numeric_expressions = NumericExpressions()
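# Illustrative calls; spellings are driven entirely by the per-language YAML
# rules, so the outputs shown are assumptions rather than guarantees:
#
#   numeric_expressions.spellout_cardinal(21, 'en')  # -> e.g. u'twenty one'
#   numeric_expressions.spellout_ordinal(3, 'en')    # -> e.g. u'third'
#   numeric_expressions.roman_numeral(9)             # -> e.g. u'IX'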

View File

@@ -0,0 +1,33 @@
import os
import six
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'parser', 'data_sets', 'openaddresses.yaml')
class OpenAddressesConfig(object):
def __init__(self, path=OPENADDRESSES_PARSER_DATA_CONFIG):
self.path = path
config = yaml.load(open(path))
self.config = config['global']
self.country_configs = config['countries']
@property
def sources(self):
for country, config in six.iteritems(self.country_configs):
for file_config in config.get('files', []):
filename = file_config['filename'].rsplit('.', 1)[0]
yield country, filename
for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
for file_config in subdir_config.get('files', []):
filename = file_config['filename'].rsplit('.', 1)[0]
yield country, subdir, filename
openaddresses_config = OpenAddressesConfig()
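# Illustrative usage (assumed, not part of the original file):
#
#     for path in openaddresses_config.sources:
#         # path is (country, filename) for top-level files, or
#         # (country, subdir, filename) for files under a configured subdirectory
#         print('/'.join(path))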

View File

@@ -0,0 +1,114 @@
import argparse
import os
import requests
import six
import subprocess
import sys
import tempfile
import yaml
from six.moves.urllib_parse import urljoin, quote_plus, unquote_plus
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.openaddresses.config import openaddresses_config
from geodata.csv_utils import unicode_csv_reader
from geodata.file_utils import ensure_dir, download_file, unzip_file, cd, remove_file
from geodata.encoding import safe_encode, safe_decode
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')
OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)
def download_and_unzip_file(url, out_dir):
zip_filename = url.rsplit('/', 1)[-1].strip()
zip_local_path = os.path.join(out_dir, zip_filename)
success = download_file(url, zip_local_path) and unzip_file(zip_local_path, out_dir)
if os.path.exists(zip_local_path):
remove_file(zip_local_path)
return success
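# Note: the downloaded archive is removed after every attempt, even when
# extraction fails, so a retry starts from a fresh download.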
def download_pre_release_downloads(out_dir):
for url in openaddresses_config.config.get('pre_release_downloads', []):
print(six.u('doing pre_release {}').format(safe_decode(url)))
success = download_and_unzip_file(url, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(safe_decode(url)))
return False
return True
def openaddresses_download_all_files(out_dir):
temp_dir = tempfile.gettempdir()
local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
sys.exit('Could not download state.txt file')
reader = unicode_csv_reader(open(local_state_file_path), delimiter='\t')
headers = reader.next()
source_index = headers.index('source')
url_index = headers.index('processed')
download_pre_release_downloads(out_dir)
for row in reader:
source = row[source_index].rsplit('.')[0]
processed = row[url_index]
if not processed or not processed.strip():
continue
print(six.u('doing {}').format(source))
success = download_and_unzip_file(processed, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(source))
remove_file(local_state_file_path)
def openaddresses_download_configured_files(out_dir):
# Fetch any configured pre-release archives once, up front, rather than once per source
download_pre_release_downloads(out_dir)
for path in openaddresses_config.sources:
source = six.b('/').join([safe_encode(p) for p in path])
filename = safe_encode(path[-1]) + six.b('.zip')
zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])
url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)
print(six.u('doing {}').format(safe_decode(source)))
success = download_and_unzip_file(url, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(source))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out-dir',
required=True,
help='Output directory')
parser.add_argument('--all', action='store_true',
default=False, help='Download all completed OpenAddresses files')
args = parser.parse_args()
ensure_dir(args.out_dir)
if args.all:
openaddresses_download_all_files(args.out_dir)
else:
openaddresses_download_configured_files(args.out_dir)
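# Example invocations (illustrative; the script filename is an assumption):
#
#     python download_openaddresses.py --out-dir /data/openaddresses        # configured sources only
#     python download_openaddresses.py --out-dir /data/openaddresses --all  # every completed OpenAddresses source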

View File

@@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-
import csv
import ftfy
import itertools
import os
import random
import re
import six
import yaml
from geodata.addresses.units import Unit
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer, toponym_abbreviations_gazetteer
from geodata.address_formatting.formatter import AddressFormatter
from geodata.addresses.components import AddressComponents
from geodata.countries.constants import Countries
from geodata.countries.names import country_names
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.languages import get_country_languages
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE, get_string_script
from geodata.math.sampling import cdf, weighted_choice
from geodata.openaddresses.config import openaddresses_config
from geodata.places.config import place_config
from geodata.postal_codes.phrases import PostalCodes
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
from geodata.text.utils import is_numeric, is_numeric_strict
from geodata.csv_utils import tsv_string, unicode_csv_reader
OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
OPENADDRESSES_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
unknown_regex = re.compile(r'\bunknown\b', re.I)
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
sin_numero_regex = re.compile('^\s*s\s*/\s*n\s*$', re.I)
russian_number_regex_str = safe_decode(r'(?:№\s*)?(?:(?:[\d]+\w?(?:[\-/](?:(?:[\d]+\w?)|\w))*)|(?:[\d]+\s*\w?)|(?:\b\w\b))')
dom_korpus_stroyeniye_regex = re.compile(safe_decode('(?:(?:дом(?=\s)|д\.?)\s*)?{}(?:(?:\s*,|\s+)\s*(?:(?:корпус(?=\s)|к\.?)\s*){})?(?:(?:\s*,|\s+)\s*(?:(?:строение(?=\s)|с\.?)\s*){})?\s*$').format(russian_number_regex_str, russian_number_regex_str, russian_number_regex_str), re.I | re.U)
uchastok_regex = re.compile(safe_decode('{}\s*(?:,?\s*участок\s+{}\s*)?$').format(russian_number_regex_str, russian_number_regex_str), re.I | re.U)
bea_nomera_regex = re.compile(safe_decode('^\s*б\s*/\s*н\s*$'), re.I)
fraction_regex = re.compile('^\s*[\d]+[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*$', re.I)
number_space_letter_regex = re.compile('^[\d]+\s+[a-z]$', re.I)
number_slash_number_regex = re.compile('^(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)$', re.I)
number_fraction_regex = re.compile('^(?:[\d]+\s+)?(?:1[\s]*/[\s]*[234]|2[\s]*/[\s]*3)$')
colombian_standard_house_number_regex = re.compile('^(\d+[\s]*[a-z]?)\s+([a-z]?[\d]+[\s]*[a-z]?)?', re.I)
dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I)
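# Illustrative matches for the patterns above (not exhaustive):
#   fraction_regex:                u'1/2', u'3/A'
#   number_space_letter_regex:     u'12 B'
#   number_slash_number_regex:     u'12/3', u'4B/7'
#   number_fraction_regex:         u'12 1/2', u'2/3'
#   colombian_standard_house_number_regex: u'12A 34' (cross street + building number)
#   dutch_house_number_regex:      u'12 a 3'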
SPANISH = 'es'
PORTUGUESE = 'pt'
RUSSIAN = 'ru'
CHINESE = 'zh'
class OpenAddressesFormatter(object):
field_regex_replacements = {
# All fields
None: [
(re.compile('<\s*null\s*>', re.I), u''),
(re.compile('[\s]{2,}'), six.u(' ')),
(re.compile('\`'), u"'"),
(re.compile('\-?\*'), u""),
],
AddressFormatter.HOUSE_NUMBER: [
# Most of the house numbers in Montreal start with "#"
(re.compile('^#', re.UNICODE), u''),
# Some house numbers have multiple hyphens
(re.compile('[\-]{2,}'), u'-'),
# Some house number ranges are split up like "12 -14"
(re.compile('[\s]*\-[\s]*'), u'-'),
]
}
unit_type_regexes = {}
for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases):
if dictionary_type == 'unit_types_numbered':
unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2]
pattern = re.compile(r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(safe_encode('|').join(unit_phrases)),
re.I | re.UNICODE)
unit_type_regexes[lang] = pattern
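# Illustrative: for a language whose unit_types_numbered dictionary includes
# phrases like "apt"/"unit"/"suite", the compiled pattern matches trailing
# numbered-unit phrases such as u'apt 2b', u'unit # 3' or u'suite 401' at the
# end of a street string (the exact phrases are data-dependent).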
def __init__(self, components, country_rtree, debug=False):
self.components = components
self.country_rtree = country_rtree
self.debug = debug
self.formatter = AddressFormatter()
class validators:
@classmethod
def validate_postcode(cls, postcode):
'''
Postcodes that are all zeros are improperly-formatted NULL values
'''
return not all((c in ('0', '-', '.', ' ', ',') for c in postcode))
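# Illustrative: u'00000', u'0-0' and u'00.00' are rejected as NULL-like;
# any postcode containing a nonzero digit or a letter passes.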
@classmethod
def validate_street(cls, street):
'''
Streets should not be simple numbers. If they are it's probably a
copy/paste error and should be the house number.
'''
return not is_numeric(street)
@classmethod
def validate_house_number(cls, house_number):
'''
House numbers don't necessarily have to be numeric, but in some of the
OpenAddresses data sets the house number field is equal to the capitalized
street name, so this check at least guards against nonsensical values at the
cost of possibly missing a few houses legitimately numbered "A", etc.
OpenAddresses also comes primarily from county GIS servers and similar sources
with a variety of database schemas that don't always handle NULLs well. While
a single zero can be a valid house number, in OpenAddresses it is more likely
an error, and a value of more than one zero is almost certainly one.
'''
try:
house_number = int(house_number.strip())
return house_number > 0
except (ValueError, TypeError):
house_number = house_number.strip()
return house_number and (is_numeric(house_number) or fraction_regex.match(house_number) or number_space_letter_regex.match(house_number) or
number_slash_number_regex.match(house_number) or number_fraction_regex.match(house_number)) and not all((c == '0' for c in house_number if c.isdigit()))
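# Illustrative outcomes: u'123', u'12 B', u'1/2' validate; u'0', u'000' and
# non-numeric values like u'MAIN ST' are rejected (falsy return).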
@classmethod
def validate_house_number_sin_numero(cls, house_number):
if sin_numero_regex.match(house_number):
return True
return cls.validate_house_number(house_number)
@classmethod
def validate_russian_house_number(cls, house_number):
if dom_korpus_stroyeniye_regex.match(house_number):
return True
elif uchastok_regex.match(house_number):
return True
elif bea_nomera_regex.match(house_number):
return True
return cls.validate_house_number(house_number)
@classmethod
def validate_colombian_house_number(cls, house_number):
return True
@classmethod
def validate_chinese_house_number(cls, house_number):
if not house_number:
return False
tokens = tokenize(house_number)
# The literal characters below were lost in encoding; they are assumed to be
# the common house-number/building markers 号, 栋 and 楼
if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋', u'楼')) for t, c in tokens):
return True
return cls.validate_house_number(house_number)
component_validators = {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
AddressFormatter.ROAD: validators.validate_street,
AddressFormatter.POSTCODE: validators.validate_postcode,
}
language_validators = {
SPANISH: {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
},
PORTUGUESE: {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
},
RUSSIAN: {
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
},
CHINESE: {
AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
}
}
country_validators = {
Countries.COLOMBIA: {
AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number
}
}
chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)
@classmethod
def format_chinese_house_number(cls, house_number):
if not house_number:
return house_number
return cls.chinese_annex_regex.sub(u'\\1号', house_number)
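# Illustrative: u'12' becomes u'12号'; values already ending in 号 or 栋 are
# left unchanged by the negative lookahead.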
@classmethod
def format_colombian_house_number(cls, house_number):
house_number = house_number.strip()
match = colombian_standard_house_number_regex.match(house_number)
if match:
separator = random.choice((u'-', u' - ', u' '))
cross_street, building_number = match.groups()
numbers = []
if cross_street and u' ' in cross_street and random.choice((True, False)):
cross_street = cross_street.replace(u' ', u'')
if cross_street:
numbers.append(cross_street)
if building_number and u' ' in building_number and random.choice((True, False)):
building_number = building_number.replace(u' ', u'')
if building_number:
numbers.append(building_number)
if numbers:
house_number = separator.join(numbers)
house_number_prefixes = (u'#', u'no.', u'no', u'')
if random.choice((True, False)) and not any((house_number.lower().startswith(p) for p in house_number_prefixes)):
house_number = u' '.join([random.choice(house_number_prefixes), house_number])
return house_number
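# Illustrative: u'12A 34' may be emitted as u'12A-34', u'# 12A - 34',
# u'no. 12A 34', etc., depending on the random separator/prefix choices above.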
def get_property(self, key, *configs):
for config in configs:
value = config.get(key, None)
if value is not None:
return value
return None
def cldr_country_name(self, country_code, language, configs):
cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
country_name = None
if random.random() < cldr_country_prob:
localized, iso_3166, alpha2, alpha3 = values = range(4)
localized_prob = float(self.get_property('localized_name_probability', *configs))
iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))
probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])
country_type = weighted_choice(values, probs)
country_name = country_code.upper()
if country_type == localized:
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
elif country_type == iso_3166:
country_name = country_names.iso3166_name(country_code)
elif country_type == alpha3:
country_name = country_names.alpha3_code(country_code) or country_name
return country_name
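# Note: no explicit alpha2 branch is needed above; country_name is initialized
# to country_code.upper(), which is already the ISO alpha-2 code, so that case
# falls through to the default.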
@classmethod
def cleanup_number(cls, num, strip_commas=False):
num = num.strip()
if strip_commas:
num = num.replace(six.u(','), six.u(''))
try:
num_int = int(num)
except (ValueError, TypeError):
try:
num_float = float(num)
leading_zeros = 0
for c in num:
if c == six.u('0'):
leading_zeros += 1
else:
break
num = safe_decode(int(num_float))
if leading_zeros:
num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
except (ValueError, TypeError):
pass
return num
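# Illustrative: cleanup_number(u'1,234', strip_commas=True) -> u'1234';
# cleanup_number(u'00123.0') -> u'00123' (the float is truncated to an int and
# leading zeros are preserved); values that parse as neither are returned
# stripped but otherwise unchanged.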
@classmethod
def fix_component_encodings(cls, components):
return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))
add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
address_only_probability = float(self.get_property('address_only_probability', *configs))
place_only_probability = float(self.get_property('place_only_probability', *configs))
place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))
city_replacements = self.get_property('city_replacements', *configs)
override_country_dir = self.get_property('override_country_dir', *configs)
postcode_length = int(self.get_property('postcode_length', *configs) or 0)
drop_address_probability = place_only_probability + place_and_postcode_probability
ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}
alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))}
config_language = self.get_property('language', *configs)
add_components = self.get_property('add', *configs)
fields = self.get_property('fields', *configs)
if not fields:
return
field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields) if hasattr(f.get('value_map'), 'get')}
f = open(path)
reader = unicode_csv_reader(f)
headers = reader.next()
header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
latitude_index = headers.index('LAT')
longitude_index = headers.index('LON')
# Clear cached polygons
self.components.osm_admin_rtree.clear_cache()
self.components.neighborhoods_rtree.clear_cache()
for row in reader:
try:
latitude = float(row[latitude_index])
longitude = float(row[longitude_index])
except (ValueError, TypeError):
continue
language = config_language
components = {}
skip_record = False
for i, key in six.iteritems(header_indices):
value = row[i].strip()
if not value and key in ignore_rows_missing_fields:
skip_record = True
break
elif not value:
continue
if key in mapped_values:
value = mapped_values[key].get(value, value)
if key == AddressFormatter.ROAD and language == SPANISH:
value = self.components.spanish_street_name(value)
if key == AddressFormatter.POSTCODE:
value = self.cleanup_number(value)
if postcode_strip_non_digit_chars:
value = six.u('').join((c for c in value if c.isdigit()))
if value and not is_numeric(value) and numeric_postcodes_only:
continue
else:
if postcode_length:
value = value.zfill(postcode_length)[:postcode_length]
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
if add_osm_boundaries:
continue
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)):
continue
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
continue
for exp, sub_val in self.field_regex_replacements.get(key, []):
value = exp.sub(sub_val, value)
for exp, sub_val in self.field_regex_replacements.get(None, []):
value = exp.sub(sub_val, value)
value = value.strip(', -')
validator = self.country_validators.get(country_dir, {}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)))
if validator is not None and not validator(value):
continue
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
continue
for (pattern, alias) in alias_fields_containing.get(key, []):
if pattern.search(value):
if 'component' in alias:
key = alias['component']
if value:
components[key] = value
if skip_record:
continue
if components:
country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
country = country_dir
candidate_languages = get_country_languages(country)
if not candidate_languages:
continue
candidate_languages = candidate_languages.items()
components = self.fix_component_encodings(components)
if language is None:
language = AddressComponents.address_language(components, candidate_languages)
street = components.get(AddressFormatter.ROAD, None)
if street is not None:
street = street.strip()
street = AddressComponents.cleaned_name(street)
if language == UNKNOWN_LANGUAGE:
strip_unit_language = candidate_languages[0][0] if candidate_languages else None
else:
strip_unit_language = language
street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)
street = abbreviate(street_types_gazetteer, street, language,
abbreviate_prob=abbreviate_street_prob,
separate_prob=separate_street_prob)
components[AddressFormatter.ROAD] = street
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number:
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
if language == CHINESE:
house_number = self.format_chinese_house_number(house_number)
if country_dir == Countries.COLOMBIA:
house_number = self.format_colombian_house_number(house_number)
if house_number is not None:
components[AddressFormatter.HOUSE_NUMBER] = house_number
unit = components.get(AddressFormatter.UNIT, None)
street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES
postcode = components.get(AddressFormatter.POSTCODE, None)
if postcode:
components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)
# If there's a postcode, we can still use just the city/state/postcode, otherwise discard
if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
if not postcode:
continue
components = self.components.drop_address(components)
# Now that validation checks are complete, fetch the unit, add phrases, abbreviate, etc.
unit = components.get(AddressFormatter.UNIT, None)
if unit is not None:
if is_numeric_strict(unit):
unit = Unit.phrase(unit, language, country=country)
elif non_numeric_units:
unit = abbreviate(unit_types_gazetteer, unit, language,
abbreviate_prob=abbreviate_unit_prob,
separate_prob=separate_unit_prob)
else:
unit = None
if unit is not None:
components[AddressFormatter.UNIT] = unit
else:
components.pop(AddressFormatter.UNIT)
unit = None
# CLDR country name
country_name = self.cldr_country_name(country, language, configs)
if country_name:
components[AddressFormatter.COUNTRY] = country_name
for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
component = components.get(component_key, None)
if component is not None:
component = abbreviate(toponym_abbreviations_gazetteer, component, language,
abbreviate_prob=abbreviate_toponym_prob)
component = self.components.name_hyphens(component)
components[component_key] = component
# Any components specified to be added by the config (usually state)
if add_components:
for k, v in six.iteritems(add_components):
if k not in components:
components[k] = v
# Get named states occasionally, added component is usually a state code
address_state = self.components.state_name(components, country, language)
if address_state:
components[AddressFormatter.STATE] = address_state
state = components.get(AddressFormatter.STATE)
if state:
state = self.components.abbreviated_state(state, country, language)
if state:
components[AddressFormatter.STATE] = state
# This is expensive, so only turn on for files that don't supply their own city names
# or for which those names are flawed
osm_components = []
# Using population=0 instead of None means if there's no known population or
# we don't need to add OSM components, we assume the population of the town is
# very small and the place name shouldn't be used unqualified (i.e. needs information
# like state name to disambiguate it)
population = 0
unambiguous_city = False
if add_osm_boundaries or AddressFormatter.CITY not in components:
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
categorized = self.components.categorized_osm_components(country, osm_components)
for component, label in categorized:
if label == AddressFormatter.CITY:
unambiguous_city = self.components.unambiguous_wikipedia(component, language)
if 'population' in component:
population = component['population']
break
if AddressFormatter.CITY not in components and city_replacements:
components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})
# The neighborhood index is cheaper so can turn on for whole countries
neighborhood_components = []
if add_osm_neighborhoods:
neighborhood_components = self.components.neighborhood_components(latitude, longitude)
self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city)
self.components.cleanup_boundary_names(components)
self.components.country_specific_cleanup(components, country)
self.components.replace_name_affixes(components, language, country=country)
self.components.replace_names(components)
self.components.prune_duplicate_names(components)
self.components.remove_numeric_boundary_names(components)
self.components.add_house_number_phrase(components, language, country=country)
self.components.add_postcode_phrase(components, language, country=country)
# Component dropout
all_osm_components = osm_components + neighborhood_components
components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
self.components.add_genitives(components, language)
formatted = self.formatter.format_address(components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
if random.random() < address_only_probability and street:
address_only_components = self.components.drop_places(components)
address_only_components = self.components.drop_postcode(address_only_components)
formatted = self.formatter.format_address(address_only_components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
rand_val = random.random()
if street and house_number and rand_val < drop_address_probability:
components = self.components.drop_address(components)
if rand_val < place_and_postcode_probability:
components = self.components.drop_postcode(components)
if components and (len(components) > 1 or add_osm_boundaries):
formatted = self.formatter.format_address(components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
all_sources_valid = sources_only is None
valid_sources = set()
if not all_sources_valid:
for source in sources_only:
if source.startswith(base_dir):
source = os.path.relpath(source, base_dir)
parts = source.strip('/ ').split('/')
if len(parts) > 3:
raise AssertionError('Sources may only have at maximum 3 parts')
valid_sources.add(tuple(parts))
if tag_components:
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
else:
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
i = 0
for country_dir in sorted(openaddresses_config.country_configs.keys()):
country_config = openaddresses_config.country_configs[country_dir]
# Clear country cache for each new country
self.country_rtree.clear_cache()
for file_config in country_config.get('files', []):
filename = file_config['filename']
if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources):
continue
print(six.u('doing {}/{}').format(country_dir, filename))
path = os.path.join(base_dir, country_dir, filename)
configs = (file_config, country_config, openaddresses_config.config)
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if self.debug:
break
for subdir in sorted(country_config.get('subdirs', {}).keys()):
subdir_config = country_config['subdirs'][subdir]
subdir = safe_decode(subdir)
for file_config in subdir_config.get('files', []):
filename = file_config['filename']
if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources):
continue
print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))
path = os.path.join(base_dir, country_dir, subdir, filename)
configs = (file_config, subdir_config, country_config, openaddresses_config.config)
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if self.debug:
break
