Initial fork commit
0
scripts/geodata/__init__.py
Normal file
0
scripts/geodata/address_expansions/__init__.py
Normal file
233
scripts/geodata/address_expansions/abbreviations.py
Normal file
@@ -0,0 +1,233 @@
import random
import re
import six

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


LOWER, UPPER, TITLE, MIXED = range(4)


def token_capitalization(s):
    if s.istitle():
        return TITLE
    elif s.islower():
        return LOWER
    elif s.isupper():
        return UPPER
    else:
        return MIXED


expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')


def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
    expansion_tokens = expansion_token_regex.findall(expansion)

    if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
        expansion_tokenized = tokenize(expansion)
        is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
        if len(expansion) <= 3 or is_acronym:
            return expansion.upper()
        else:
            return expansion.title()
    elif len(tokens) == len(expansion_tokens):
        strings = []
        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
            cap = token_capitalization(t)
            if suf == six.u(' '):
                suf = space_token
            if cap == LOWER:
                strings.append(six.u('').join((e.lower(), suf)))
            elif cap == UPPER:
                strings.append(six.u('').join((e.upper(), suf)))
            elif cap == TITLE:
                strings.append(six.u('').join((e.title(), suf)))
            elif t.lower() == e.lower():
                strings.append(t)
            else:
                strings.append(six.u('').join((e.title(), suf)))
        return six.u('').join(strings)
    else:

        strings = []
        for e, suf in expansion_tokens:
            strings.append(e.title())
            if suf == six.u(' '):
                strings.append(space_token)
            else:
                strings.append(suf)
        return six.u('').join(strings)


def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])

                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))

                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))

            i += len(t)

        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
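A minimal usage sketch of the function above (not part of the commit; assumes the dictionaries under resources/dictionaries are present so that street_types_gazetteer can be built at import time):

    from geodata.address_expansions.gazetteers import street_types_gazetteer
    from geodata.address_expansions.abbreviations import abbreviate

    # Output is randomized by design: for the same input this may return
    # e.g. u'W Main St', u'West Main St' or the unchanged u'West Main Street'.
    print(abbreviate(street_types_gazetteer, u'West Main Street', 'en'))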
254
scripts/geodata/address_expansions/address_dictionaries.py
Normal file
@@ -0,0 +1,254 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_encode, safe_decode
|
||||
|
||||
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'dictionaries')
|
||||
|
||||
ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
|
||||
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
|
||||
|
||||
address_language_index_template = u'{{{language}, {index}, {length}}}'
|
||||
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'
|
||||
|
||||
|
||||
address_expansion_rule_header_template = u'''
|
||||
#ifndef ADDRESS_EXPANSION_RULE_H
|
||||
#define ADDRESS_EXPANSION_RULE_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "constants.h"
|
||||
#include "gazetteers.h"
|
||||
|
||||
#define MAX_DICTIONARY_TYPES {max_dictionary_types}
|
||||
|
||||
typedef struct address_expansion_rule {{
|
||||
char *phrase;
|
||||
uint32_t num_dictionaries;
|
||||
dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
|
||||
int32_t canonical_index;
|
||||
}} address_expansion_rule_t;
|
||||
|
||||
typedef struct address_language_index {{
|
||||
char language[MAX_LANGUAGE_LEN];
|
||||
uint32_t index;
|
||||
size_t len;
|
||||
}} address_language_index_t;
|
||||
|
||||
|
||||
#endif
|
||||
'''
|
||||
|
||||
address_expansion_data_file_template = u'''
|
||||
char *canonical_strings[] = {{
|
||||
{canonical_strings}
|
||||
}};
|
||||
|
||||
address_expansion_rule_t expansion_rules[] = {{
|
||||
{expansion_rules}
|
||||
}};
|
||||
|
||||
address_language_index_t expansion_languages[] = {{
|
||||
{address_languages}
|
||||
}};
|
||||
'''
|
||||
|
||||
|
||||
gazetteer_types = {
|
||||
'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
|
||||
'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
|
||||
'building_types': 'DICTIONARY_BUILDING_TYPE',
|
||||
'categories': 'DICTIONARY_CATEGORY',
|
||||
'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
|
||||
'chains': 'DICTIONARY_CHAIN',
|
||||
'company_types': 'DICTIONARY_COMPANY_TYPE',
|
||||
'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
|
||||
'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
|
||||
'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
|
||||
'cross_streets': 'DICTIONARY_CROSS_STREET',
|
||||
'directionals': 'DICTIONARY_DIRECTIONAL',
|
||||
'elisions': 'DICTIONARY_ELISION',
|
||||
'entrances': 'DICTIONARY_ENTRANCE',
|
||||
'given_names': 'DICTIONARY_GIVEN_NAME',
|
||||
'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
|
||||
'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
|
||||
'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
|
||||
'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
|
||||
'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
|
||||
'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
|
||||
'near': 'DICTIONARY_NEAR',
|
||||
'no_number': 'DICTIONARY_NO_NUMBER',
|
||||
'number': 'DICTIONARY_NUMBER',
|
||||
'nulls': 'DICTIONARY_NULL',
|
||||
'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
|
||||
'people': 'DICTIONARY_NAMED_PERSON',
|
||||
'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
|
||||
'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
|
||||
'place_names': 'DICTIONARY_PLACE_NAME',
|
||||
'post_office': 'DICTIONARY_POST_OFFICE',
|
||||
'postcodes': 'DICTIONARY_POSTAL_CODE',
|
||||
'qualifiers': 'DICTIONARY_QUALIFIER',
|
||||
'staircases': 'DICTIONARY_STAIRCASE',
|
||||
'stopwords': 'DICTIONARY_STOPWORD',
|
||||
'street_names': 'DICTIONARY_STREET_NAME',
|
||||
'street_types': 'DICTIONARY_STREET_TYPE',
|
||||
'surnames': 'DICTIONARY_SURNAME',
|
||||
'synonyms': 'DICTIONARY_SYNONYM',
|
||||
'toponyms': 'DICTIONARY_TOPONYM',
|
||||
'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
|
||||
'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
|
||||
'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
|
||||
|
||||
}
|
||||
|
||||
|
||||
class InvalidAddressFileException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def read_dictionary_file(path):
|
||||
for i, line in enumerate(open(path)):
|
||||
line = safe_decode(line.rstrip())
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
if u'}' in line:
|
||||
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
|
||||
phrases = line.split(u'|')
|
||||
|
||||
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
|
||||
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
|
||||
|
||||
yield phrases
|
||||
|
||||
|
||||
def quote_string(s):
|
||||
return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))
|
||||
|
||||
|
||||
class AddressPhraseDictionaries(object):
|
||||
def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
|
||||
self.base_dir = base_dir
|
||||
self.languages = []
|
||||
|
||||
self.language_dictionaries = defaultdict(list)
|
||||
self.phrases = defaultdict(list)
|
||||
|
||||
for language in os.listdir(base_dir):
|
||||
language_dir = os.path.join(base_dir, language)
|
||||
if not os.path.isdir(language_dir):
|
||||
continue
|
||||
|
||||
self.languages.append(language)
|
||||
|
||||
for filename in os.listdir(language_dir):
|
||||
if not filename.endswith('.txt'):
|
||||
raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
|
||||
dictionary_name = filename.split('.')[0].lower()
|
||||
|
||||
if dictionary_name not in gazetteer_types:
|
||||
raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
|
||||
self.language_dictionaries[language].append(dictionary_name)
|
||||
|
||||
path = os.path.join(language_dir, filename)
|
||||
for i, line in enumerate(open(path)):
|
||||
line = safe_decode(line.rstrip())
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
if u'}' in line:
|
||||
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
|
||||
phrases = line.split(u'|')
|
||||
|
||||
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
|
||||
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
|
||||
|
||||
self.phrases[(language, dictionary_name)].append(phrases)
|
||||
|
||||
self.language_dictionaries = dict(self.language_dictionaries)
|
||||
self.phrases = dict(self.phrases)
|
||||
|
||||
|
||||
address_phrase_dictionaries = AddressPhraseDictionaries()
|
||||
|
||||
|
||||
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
|
||||
address_languages = []
|
||||
expansion_rules = []
|
||||
canonical_strings = []
|
||||
|
||||
max_dictionary_types = 0
|
||||
|
||||
for language in address_phrase_dictionaries.languages:
|
||||
num_language_rules = 0
|
||||
language_index = len(expansion_rules)
|
||||
|
||||
language_canonical_dictionaries = defaultdict(list)
|
||||
canonical_indices = {}
|
||||
|
||||
for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
|
||||
dictionary_type = gazetteer_types[dictionary_name]
|
||||
|
||||
for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
|
||||
canonical = phrases[0]
|
||||
if len(phrases) > 1:
|
||||
canonical_index = canonical_indices.get(canonical, None)
|
||||
if canonical_index is None:
|
||||
canonical_index = len(canonical_strings)
|
||||
canonical_strings.append(quote_string(canonical))
|
||||
canonical_indices[canonical] = canonical_index
|
||||
else:
|
||||
canonical_index = -1
|
||||
|
||||
for i, p in enumerate(phrases):
|
||||
language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)
|
||||
|
||||
for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
|
||||
max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
|
||||
rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
|
||||
num_dictionaries=str(len(dictionary_types)),
|
||||
dictionaries=', '.join(dictionary_types),
|
||||
canonical_index=canonical_index)
|
||||
expansion_rules.append(rule_template)
|
||||
num_language_rules += 1
|
||||
|
||||
address_languages.append(address_language_index_template.format(language=quote_string(language),
|
||||
index=language_index,
|
||||
length=num_language_rules))
|
||||
|
||||
header = address_expansion_rule_header_template.format(
|
||||
max_dictionary_types=str(max_dictionary_types)
|
||||
)
|
||||
out = open(header_file, 'w')
|
||||
out.write(safe_encode(header))
|
||||
out.close()
|
||||
|
||||
data_file = address_expansion_data_file_template.format(
|
||||
canonical_strings=u''',
|
||||
'''.join(canonical_strings),
|
||||
expansion_rules=u''',
|
||||
'''.join(expansion_rules),
|
||||
address_languages=u''',
|
||||
'''.join(address_languages),
|
||||
)
|
||||
|
||||
out = open(output_file, 'w')
|
||||
out.write(safe_encode(data_file))
|
||||
out.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) > 1:
|
||||
input_dir = sys.argv[1]
|
||||
else:
|
||||
input_dir = ADDRESS_EXPANSIONS_DIR
|
||||
|
||||
create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
|
||||
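An illustrative sketch of the input these helpers consume (file contents invented; the real data lives under resources/dictionaries/<language>/<dictionary>.txt, and importing the module assumes that tree is present): each line is pipe-delimited with the canonical phrase first, followed by its abbreviations.

    # street_types.txt (hypothetical excerpt):
    #   street|st|str
    #   avenue|ave|av
    from geodata.address_expansions.address_dictionaries import read_dictionary_file, quote_string

    for phrases in read_dictionary_file('/path/to/street_types.txt'):
        canonical, synonyms = phrases[0], phrases[1:]
        print(u'{} -> {}'.format(quote_string(canonical), u', '.join(synonyms)))  # e.g. "street" -> st, str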
56
scripts/geodata/address_expansions/equivalence.py
Normal file
@@ -0,0 +1,56 @@
import random
import re
import six

from itertools import izip

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


def canonicals_for_language(data, language):
    canonicals = set()

    for d in data:
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals

def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    Two strings are considered equivalent if, after normalization and
    dictionary phrase matching, their non-phrase tokens are identical and
    each pair of matched phrases shares at least one canonical form,
    e.g. "Main St" and "Main Street"
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            if t1 != t2:
                return False
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            return False

    return True
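A hypothetical usage sketch (assumes the English dictionaries are loaded; actual results depend on the dictionary contents):

    from geodata.address_expansions.gazetteers import abbreviations_gazetteer
    from geodata.address_expansions.equivalence import equivalent

    print(equivalent(u'Main Street', u'Main St', abbreviations_gazetteer, 'en'))        # expected True
    print(equivalent(u'Main Street', u'Market Street', abbreviations_gazetteer, 'en'))  # expected False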
260
scripts/geodata/address_expansions/gazetteers.py
Normal file
@@ -0,0 +1,260 @@
|
||||
import os
|
||||
import six
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
from geodata.i18n.unicode_paths import DATA_DIR
|
||||
from geodata.text.normalize import normalized_tokens, normalize_string
|
||||
from geodata.text.tokenize import tokenize, token_types
|
||||
from geodata.text.phrases import PhraseFilter
|
||||
from geodata.enum import EnumValue
|
||||
|
||||
from marisa_trie import BytesTrie
|
||||
|
||||
|
||||
DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
|
||||
|
||||
PREFIX_KEY = u'\x02'
|
||||
SUFFIX_KEY = u'\x03'
|
||||
|
||||
POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
|
||||
'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
|
||||
'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
|
||||
'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
|
||||
'm', 'mm', 'mmm', 'mmmm'])
|
||||
|
||||
|
||||
class DictionaryPhraseFilter(PhraseFilter):
|
||||
serialize = safe_encode
|
||||
deserialize = safe_decode
|
||||
|
||||
def __init__(self, *dictionaries):
|
||||
self.dictionaries = dictionaries
|
||||
self.canonicals = {}
|
||||
|
||||
kvs = defaultdict(OrderedDict)
|
||||
|
||||
for language in address_phrase_dictionaries.languages:
|
||||
for dictionary_name in self.dictionaries:
|
||||
is_suffix_dictionary = 'suffixes' in dictionary_name
|
||||
is_prefix_dictionary = 'prefixes' in dictionary_name
|
||||
|
||||
for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
|
||||
canonical = phrases[0]
|
||||
canonical_normalized = normalize_string(canonical)
|
||||
|
||||
self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]
|
||||
|
||||
for i, phrase in enumerate(phrases):
|
||||
|
||||
if phrase in POSSIBLE_ROMAN_NUMERALS:
|
||||
continue
|
||||
|
||||
is_canonical = normalize_string(phrase) == canonical_normalized
|
||||
|
||||
if is_suffix_dictionary:
|
||||
phrase = SUFFIX_KEY + phrase[::-1]
|
||||
elif is_prefix_dictionary:
|
||||
phrase = PREFIX_KEY + phrase
|
||||
|
||||
kvs[phrase][(language, dictionary_name, canonical)] = is_canonical
|
||||
|
||||
kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
|
||||
|
||||
self.trie = BytesTrie(kvs)
|
||||
|
||||
def serialize(self, s):
|
||||
return s
|
||||
|
||||
def deserialize(self, s):
|
||||
return s
|
||||
|
||||
def search_substring(self, s):
|
||||
if len(s) == 0:
|
||||
return None, 0
|
||||
|
||||
for i in xrange(len(s) + 1):
|
||||
if not self.trie.has_keys_with_prefix(s[:i]):
|
||||
i -= 1
|
||||
break
|
||||
if i > 0:
|
||||
return (self.trie.get(s[:i]), i)
|
||||
else:
|
||||
return None, 0
|
||||
|
||||
def search_suffix(self, token):
|
||||
suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
|
||||
if suffix_len > 0:
|
||||
suffix_len -= len(SUFFIX_KEY)
|
||||
return suffix_search, suffix_len
|
||||
|
||||
def search_prefix(self, token):
|
||||
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
|
||||
if prefix_len > 0:
|
||||
prefix_len -= len(PREFIX_KEY)
|
||||
return prefix_search, prefix_len
|
||||
|
||||
def basic_filter(self, tokens):
|
||||
return super(DictionaryPhraseFilter, self).filter(tokens)
|
||||
|
||||
def filter(self, tokens):
|
||||
for p, t, data in self.basic_filter(tokens):
|
||||
if not p:
|
||||
t, c = t
|
||||
token = t
|
||||
token_len = len(token)
|
||||
|
||||
suffix_search, suffix_len = self.search_suffix(token)
|
||||
if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
|
||||
yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
|
||||
continue
|
||||
prefix_search, prefix_len = self.search_prefix(token)
|
||||
if prefix_search and self.trie.get(token[:prefix_len]):
|
||||
yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
|
||||
continue
|
||||
else:
|
||||
c = token_types.PHRASE
|
||||
yield t, c, len(t), map(safe_decode, data)
|
||||
|
||||
def gen_phrases(self, s, canonical_only=False, languages=None):
|
||||
tokens = tokenize(s)
|
||||
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
|
||||
|
||||
if not languages:
|
||||
languages = None
|
||||
elif not hasattr(languages, '__iter__'):
|
||||
languages = [languages]
|
||||
|
||||
if not hasattr(languages, '__contains__'):
|
||||
languages = set(languages)
|
||||
|
||||
for t, c, length, data in self.filter(norm_tokens):
|
||||
if c == token_types.PHRASE:
|
||||
if not canonical_only and languages is None:
|
||||
yield six.u(' ').join([t_i for t_i, c_i in t])
|
||||
else:
|
||||
phrase = None
|
||||
for d in data:
|
||||
lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
|
||||
|
||||
if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
|
||||
phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
|
||||
yield phrase
|
||||
|
||||
def string_contains_phrases(self, s, canonical_only=False, languages=None):
|
||||
phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
|
||||
try:
|
||||
phrases.next()
|
||||
return True
|
||||
except StopIteration:
|
||||
return False
|
||||
|
||||
def extract_phrases(self, s, canonical_only=False, languages=None):
|
||||
return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))
|
||||
|
||||
|
||||
STREET_TYPES_ONLY_DICTIONARIES = ('street_types',
|
||||
'directionals',
|
||||
'concatenated_suffixes_separable',
|
||||
'concatenated_suffixes_inseparable',
|
||||
'people',
|
||||
'personal_suffixes',
|
||||
'personal_titles',
|
||||
)
|
||||
|
||||
STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + ('concatenated_prefixes_separable',
|
||||
'organizations',
|
||||
'qualifiers',
|
||||
'stopwords',
|
||||
)
|
||||
|
||||
GIVEN_NAME_DICTIONARY = 'given_names'
|
||||
SURNAME_DICTIONARY = 'surnames'
|
||||
|
||||
CHAIN_DICTIONARY = 'chains'
|
||||
|
||||
SYNONYM_DICTIONARY = 'synonyms'
|
||||
|
||||
PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
|
||||
SURNAME_DICTIONARY,)
|
||||
|
||||
|
||||
NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
|
||||
'building_types',
|
||||
'company_types',
|
||||
'place_names',
|
||||
'qualifiers',
|
||||
'synonyms',
|
||||
'toponyms',
|
||||
)
|
||||
|
||||
QUALIFIERS_DICTIONARY = 'qualifiers'
|
||||
|
||||
HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')
|
||||
|
||||
POSTCODE_DICTIONARIES = ('postcode',)
|
||||
|
||||
TOPONYMS_DICTIONARY = 'toponyms'
|
||||
|
||||
TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers',
|
||||
'directionals',
|
||||
'personal_titles',
|
||||
'synonyms',
|
||||
)
|
||||
|
||||
|
||||
UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
|
||||
'level_types_mezzanine',
|
||||
'level_types_numbered',
|
||||
'level_types_standalone',
|
||||
'level_types_sub_basement',
|
||||
'number',
|
||||
'post_office',
|
||||
'unit_types_numbered',
|
||||
'unit_types_standalone',
|
||||
)
|
||||
|
||||
VENUE_NAME_DICTIONARIES = ('academic_degrees',
|
||||
'building_types',
|
||||
'chains',
|
||||
'company_types',
|
||||
'directionals',
|
||||
'given_names',
|
||||
'organizations',
|
||||
'people',
|
||||
'personal_suffixes',
|
||||
'personal_titles',
|
||||
'place_names',
|
||||
'stopwords',
|
||||
'surnames',
|
||||
)
|
||||
|
||||
ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
|
||||
NAME_DICTIONARIES + \
|
||||
UNIT_ABBREVIATION_DICTIONARIES + \
|
||||
('no_number', 'nulls',)
|
||||
|
||||
|
||||
_gazetteers = []
|
||||
|
||||
|
||||
def create_gazetteer(*dictionaries):
|
||||
g = DictionaryPhraseFilter(*dictionaries)
|
||||
_gazetteers.append(g)
|
||||
return g
|
||||
|
||||
|
||||
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
|
||||
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
|
||||
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
|
||||
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
|
||||
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
|
||||
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
|
||||
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
|
||||
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
|
||||
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
|
||||
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
|
||||
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
|
||||
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
|
||||
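A short usage sketch for the filters defined above (illustrative only; output depends on the compiled dictionaries):

    from geodata.address_expansions.gazetteers import street_types_gazetteer

    # Extract the dictionary phrases contained in a string
    print(street_types_gazetteer.extract_phrases(u'100 West 20th Street', languages='en'))
    # e.g. set([u'west', u'street'])

    # Or just test whether a string contains any known phrase at all
    print(street_types_gazetteer.string_contains_phrases(u'100 West 20th Street', languages='en'))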
0
scripts/geodata/address_formatting/__init__.py
Normal file
29
scripts/geodata/address_formatting/aliases.py
Normal file
@@ -0,0 +1,29 @@
import six
from collections import defaultdict


class Aliases(object):
    def __init__(self, aliases):
        self.aliases = aliases
        self.priorities = {k: i for i, k in enumerate(aliases)}

    def key_priority(self, key):
        return self.priorities.get(key, len(self.priorities))

    def get(self, key, default=None):
        return self.aliases.get(key, default)

    def replace(self, components):
        replacements = defaultdict(list)
        values = {}
        for k in list(components):
            new_key = self.aliases.get(k)
            if new_key and new_key not in components:
                value = components.pop(k)
                values[k] = value
                replacements[new_key].append(k)

        for key, source_keys in six.iteritems(replacements):
            source_keys.sort(key=self.key_priority)
            value = values[source_keys[0]]
            components[key] = value
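A brief usage sketch (component values invented): keys are renamed in place to their canonical field names, and when several source keys map to the same target the alias listed earliest wins.

    from collections import OrderedDict
    from geodata.address_formatting.aliases import Aliases

    aliases = Aliases(OrderedDict([('street', 'road'), ('street_name', 'road')]))
    components = {'street_name': u'Calle de la Unión', 'city': u'Madrid'}
    aliases.replace(components)
    # components == {'road': u'Calle de la Unión', 'city': u'Madrid'}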
924
scripts/geodata/address_formatting/formatter.py
Normal file
@@ -0,0 +1,924 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import copy
|
||||
import os
|
||||
import pystache
|
||||
import random
|
||||
import re
|
||||
import six
|
||||
import subprocess
|
||||
import yaml
|
||||
|
||||
from collections import OrderedDict, defaultdict
|
||||
from itertools import ifilter
|
||||
|
||||
from geodata.address_formatting.aliases import Aliases
|
||||
from geodata.configs.utils import nested_get, recursive_merge
|
||||
from geodata.math.floats import isclose
|
||||
from geodata.math.sampling import weighted_choice, cdf
|
||||
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'formatting', 'global.yaml')
|
||||
|
||||
|
||||
class AddressFormatter(object):
|
||||
'''
|
||||
Approximate Python port of lokku's Geo::Address::Formatter
|
||||
|
||||
Usage:
|
||||
address_formatter = AddressFormatter()
|
||||
components = {
|
||||
'house': u'Anticafé',
|
||||
'house_number': '2',
|
||||
'road': u'Calle de la Unión',
|
||||
'postcode': '28013',
|
||||
'city': u'Madrid',
|
||||
}
|
||||
country = 'es'
|
||||
language = 'es'
|
||||
address_formatter.format_address(components, country, language)
|
||||
'''
|
||||
|
||||
whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
|
||||
|
||||
splitter = ' | '
|
||||
|
||||
separator_tag = 'SEP'
|
||||
field_separator_tag = 'FSEP'
|
||||
|
||||
CATEGORY = 'category'
|
||||
NEAR = 'near'
|
||||
ATTENTION = 'attention'
|
||||
CARE_OF = 'care_of'
|
||||
HOUSE = 'house'
|
||||
HOUSE_NUMBER = 'house_number'
|
||||
PO_BOX = 'po_box'
|
||||
ROAD = 'road'
|
||||
BUILDING = 'building'
|
||||
ENTRANCE = 'entrance'
|
||||
STAIRCASE = 'staircase'
|
||||
LEVEL = 'level'
|
||||
UNIT = 'unit'
|
||||
INTERSECTION = 'intersection'
|
||||
SUBDIVISION = 'subdivision'
|
||||
METRO_STATION = 'metro_station'
|
||||
SUBURB = 'suburb'
|
||||
CITY_DISTRICT = 'city_district'
|
||||
CITY = 'city'
|
||||
ISLAND = 'island'
|
||||
STATE = 'state'
|
||||
STATE_DISTRICT = 'state_district'
|
||||
POSTCODE = 'postcode'
|
||||
COUNTRY_REGION = 'country_region'
|
||||
COUNTRY = 'country'
|
||||
WORLD_REGION = 'world_region'
|
||||
|
||||
component_order = {k: i for i, k in enumerate([
|
||||
CATEGORY,
|
||||
NEAR,
|
||||
ATTENTION,
|
||||
CARE_OF,
|
||||
HOUSE,
|
||||
PO_BOX,
|
||||
HOUSE_NUMBER,
|
||||
BUILDING,
|
||||
ENTRANCE,
|
||||
STAIRCASE,
|
||||
LEVEL,
|
||||
UNIT,
|
||||
ROAD,
|
||||
INTERSECTION,
|
||||
SUBDIVISION,
|
||||
METRO_STATION,
|
||||
SUBURB,
|
||||
CITY,
|
||||
CITY_DISTRICT,
|
||||
ISLAND,
|
||||
STATE,
|
||||
STATE_DISTRICT,
|
||||
POSTCODE,
|
||||
COUNTRY_REGION,
|
||||
COUNTRY,
|
||||
WORLD_REGION,
|
||||
])}
|
||||
|
||||
BOUNDARY_COMPONENTS_ORDERED = [
|
||||
SUBDIVISION,
|
||||
METRO_STATION,
|
||||
SUBURB,
|
||||
CITY_DISTRICT,
|
||||
CITY,
|
||||
ISLAND,
|
||||
STATE_DISTRICT,
|
||||
STATE,
|
||||
COUNTRY_REGION,
|
||||
COUNTRY,
|
||||
WORLD_REGION,
|
||||
]
|
||||
|
||||
BOUNDARY_COMPONENTS = set(BOUNDARY_COMPONENTS_ORDERED)
|
||||
|
||||
SUB_BUILDING_COMPONENTS = {
|
||||
ENTRANCE,
|
||||
STAIRCASE,
|
||||
LEVEL,
|
||||
UNIT,
|
||||
}
|
||||
|
||||
STREET_COMPONENTS = {
|
||||
HOUSE_NUMBER,
|
||||
ROAD,
|
||||
}
|
||||
|
||||
ADDRESS_LEVEL_COMPONENTS = STREET_COMPONENTS | SUB_BUILDING_COMPONENTS
|
||||
|
||||
NAME_COMPONENTS = {
|
||||
ATTENTION,
|
||||
CARE_OF,
|
||||
HOUSE,
|
||||
}
|
||||
|
||||
address_formatter_fields = set(component_order)
|
||||
|
||||
aliases = Aliases(
|
||||
OrderedDict([
|
||||
('street', ROAD),
|
||||
('street_name', ROAD),
|
||||
('hamlet', CITY),
|
||||
('village', CITY),
|
||||
('neighborhood', SUBURB),
|
||||
('neighbourhood', SUBURB),
|
||||
('city_district', CITY_DISTRICT),
|
||||
('county', STATE_DISTRICT),
|
||||
('state_code', STATE),
|
||||
('country_name', COUNTRY),
|
||||
('continent', WORLD_REGION),
|
||||
('postal_code', POSTCODE),
|
||||
('post_code', POSTCODE),
|
||||
])
|
||||
)
|
||||
|
||||
category_template = '{{{category}}} {{{near}}} {{{place}}}'
|
||||
chain_template = '{{{house}}} {{{near}}} {{{place}}}'
|
||||
intersection_template = '{{{road1}}} {{{intersection}}} {{{road2}}} {{{place}}}'
|
||||
|
||||
template_address_parts = [HOUSE, HOUSE_NUMBER, ROAD]
|
||||
template_admin_parts = [CITY, STATE, COUNTRY]
|
||||
|
||||
template_address_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_address_parts]))
|
||||
template_admin_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_admin_parts]))
|
||||
|
||||
MINIMAL_COMPONENT_KEYS = [
|
||||
(ROAD, HOUSE_NUMBER),
|
||||
(ROAD, HOUSE),
|
||||
(ROAD, POSTCODE)
|
||||
]
|
||||
|
||||
FIRST, BEFORE, AFTER, LAST = range(4)
|
||||
|
||||
def __init__(self, scratch_dir='/tmp', splitter=None):
|
||||
if splitter is not None:
|
||||
self.splitter = splitter
|
||||
|
||||
self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
|
||||
self.clone_repo()
|
||||
|
||||
self.load_config()
|
||||
self.load_country_formats()
|
||||
|
||||
self.language_code_replacements = self.config['language_code_replacements']
|
||||
|
||||
self.setup_insertion_probabilities()
|
||||
self.setup_no_name_templates()
|
||||
self.setup_place_only_templates()
|
||||
|
||||
self.template_cache = {}
|
||||
self.parsed_cache = {}
|
||||
|
||||
def clone_repo(self):
|
||||
subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
|
||||
subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
|
||||
|
||||
def load_country_formats(self):
|
||||
config = yaml.load(open(os.path.join(self.formatter_repo_path,
|
||||
'conf', 'countries', 'worldwide.yaml')))
|
||||
self.country_aliases = {}
|
||||
self.house_number_ordering = {}
|
||||
|
||||
for key in list(config):
|
||||
country = key
|
||||
language = None
|
||||
if '_' in key:
|
||||
country, language = country.split('_', 1)
|
||||
value = config[key]
|
||||
if hasattr(value, 'items'):
|
||||
address_template = value.get('address_template')
|
||||
if not address_template and 'use_country' in value:
|
||||
# Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references
|
||||
if value['use_country'] in (country, False):
|
||||
continue
|
||||
self.country_aliases[country] = value['use_country']
|
||||
address_template = config[value['use_country']]['address_template']
|
||||
|
||||
if address_template:
|
||||
value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language)
|
||||
|
||||
post_format_replacements = value.get('postformat_replace')
|
||||
if post_format_replacements:
|
||||
value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')] for pattern, replacement in post_format_replacements]
|
||||
else:
|
||||
address_template = value
|
||||
config[country] = self.add_postprocessing_tags(value, country, language=language)
|
||||
|
||||
try:
|
||||
house_number_index = address_template.index(self.tag_token(self.HOUSE_NUMBER))
|
||||
road_index = address_template.index(self.tag_token(self.ROAD))
|
||||
|
||||
if house_number_index < road_index:
|
||||
self.house_number_ordering[key.lower()] = -1
|
||||
else:
|
||||
self.house_number_ordering[key.lower()] = 1
|
||||
except ValueError:
|
||||
self.house_number_ordering[key.lower()] = 0
|
||||
|
||||
self.country_formats = config
|
||||
|
||||
def load_config(self):
|
||||
config = yaml.load(open(FORMATTER_CONFIG))
|
||||
self.config = config.get('global', {})
|
||||
language_configs = config.get('languages', {})
|
||||
|
||||
self.language_configs = {}
|
||||
for language in language_configs:
|
||||
language_config = language_configs[language]
|
||||
config_copy = copy.deepcopy(self.config)
|
||||
self.language_configs[language] = recursive_merge(config_copy, language_config)
|
||||
|
||||
country_configs = config.get('countries', {})
|
||||
|
||||
self.country_configs = {}
|
||||
for country in country_configs:
|
||||
country_config = country_configs[country]
|
||||
config_copy = copy.deepcopy(self.config)
|
||||
self.country_configs[country] = recursive_merge(config_copy, country_config)
|
||||
|
||||
def get_property(self, keys, country, language=None, default=None):
|
||||
if isinstance(keys, six.string_types):
|
||||
keys = keys.split('.')
|
||||
keys = tuple(keys)
|
||||
value = nested_get(self.language_configs, (language,) + keys, default=default)
|
||||
if not value:
|
||||
value = nested_get(self.country_configs, (country,) + keys, default=default)
|
||||
if not value:
|
||||
value = nested_get(self.config, keys, default=default)
|
||||
return value
|
||||
|
||||
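# Illustrative note (key path invented): get_property() resolves a setting
# most-specific-first -- the per-language config, then the per-country config,
# then the global defaults -- e.g.
#
#     self.get_property('insertions.level.probability', 'de', language='de')
#
# returns the first non-empty value found along that chain.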
def insertion_distribution(self, insertions):
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for k, v in six.iteritems(insertions):
|
||||
if k == 'conditional' or not v:
|
||||
continue
|
||||
|
||||
if 'before' in v:
|
||||
val = (self.BEFORE, v['before'])
|
||||
elif 'after' in v:
|
||||
val = (self.AFTER, v['after'])
|
||||
elif 'last' in v:
|
||||
val = (self.LAST, None)
|
||||
elif 'first' in v:
|
||||
val = (self.FIRST, None)
|
||||
else:
|
||||
raise ValueError('Insertions must contain one of {{first, before, after, last}}. Value was: {}'.format(v))
|
||||
|
||||
prob = v['probability']
|
||||
values.append(val)
|
||||
probs.append(prob)
|
||||
|
||||
# If the probabilities don't sum to 1, add a "do nothing" action
|
||||
if not isclose(sum(probs), 1.0):
|
||||
probs.append(1.0 - sum(probs))
|
||||
values.append((None, None, False))
|
||||
|
||||
return values, cdf(probs)
|
||||
|
||||
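# Illustrative sketch of the structure consumed by insertion_distribution()
# (numbers invented; the real values live in resources/formatting/global.yaml):
#
#     {'common': {'after': 'house_number', 'probability': 0.6},
#      'rare':   {'last': True, 'probability': 0.1}}
#
# would yield values [(AFTER, 'house_number'), (LAST, None)] plus an implicit
# "do nothing" outcome carrying the remaining 0.3, with cdf([0.6, 0.1, 0.3])
# as the sampling distribution.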
def insertion_probs(self, config):
|
||||
component_insertions = {}
|
||||
for component, insertions in six.iteritems(config):
|
||||
component_insertions[component] = self.insertion_distribution(insertions)
|
||||
|
||||
return component_insertions
|
||||
|
||||
def inverted(self, template):
|
||||
lines = template.split(six.u('\n'))
|
||||
return six.u('\n').join(reversed(lines))
|
||||
|
||||
def house_number_before_road(self, country, language=None):
|
||||
key = value = None
|
||||
if language is not None:
|
||||
key = six.u('_').join((country.lower(), language.lower()))
|
||||
if key in self.house_number_ordering:
|
||||
value = self.house_number_ordering[key]
|
||||
|
||||
if value is None:
|
||||
key = country
|
||||
if key in self.house_number_ordering:
|
||||
value = self.house_number_ordering[key]
|
||||
|
||||
if value is None:
|
||||
value = 0
|
||||
|
||||
if value <= 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def conditional_insertion_probs(self, conditionals):
|
||||
conditional_insertions = defaultdict(OrderedDict)
|
||||
for component, value in six.iteritems(conditionals):
|
||||
if 'conditional' in value:
|
||||
conditionals = value['conditional']
|
||||
|
||||
for c in conditionals:
|
||||
other = c['component']
|
||||
conditional_insertions[component][other] = self.insertion_distribution(c['probabilities'])
|
||||
return conditional_insertions
|
||||
|
||||
def setup_insertion_probabilities(self):
|
||||
config = self.config['insertions']
|
||||
self.global_insertions = self.insertion_probs(config)
|
||||
self.global_conditionals = self.conditional_insertion_probs(config)
|
||||
|
||||
self.global_invert_probability = self.config.get('invert_probability', 0.0)
|
||||
|
||||
self.country_insertions = {}
|
||||
self.country_conditionals = {}
|
||||
|
||||
self.country_invert_probabilities = {}
|
||||
|
||||
for country, config in six.iteritems(self.country_configs):
|
||||
if 'insertions' in config:
|
||||
self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
|
||||
self.country_conditionals[country.lower()] = self.conditional_insertion_probs(config['insertions'])
|
||||
|
||||
if 'invert_probability' in config:
|
||||
self.country_invert_probabilities[country] = config['invert_probability']
|
||||
|
||||
self.language_insertions = {}
|
||||
self.language_conditionals = {}
|
||||
|
||||
for language, config in six.iteritems(self.language_configs):
|
||||
if 'insertions' in config:
|
||||
self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])
|
||||
self.language_conditionals[language.lower()] = self.conditional_insertion_probs(config['insertions'])
|
||||
|
||||
def setup_no_name_templates(self):
|
||||
self.templates_no_name = {}
|
||||
|
||||
for country, config in six.iteritems(self.country_formats):
|
||||
if hasattr(config, 'items') and 'address_template' in config:
|
||||
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS)
|
||||
self.templates_no_name[country] = address_template
|
||||
|
||||
def setup_place_only_templates(self):
|
||||
self.templates_place_only = {}
|
||||
|
||||
for country, config in six.iteritems(self.country_formats):
|
||||
if hasattr(config, 'items') and 'address_template' in config:
|
||||
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS | self.ADDRESS_LEVEL_COMPONENTS)
|
||||
self.templates_place_only[country] = address_template
|
||||
|
||||
def country_template(self, c):
|
||||
return self.country_formats.get(c, self.country_formats['default'])
|
||||
|
||||
def is_reverse(self, template):
|
||||
address_parts_match = self.template_address_parts_re.search(template)
|
||||
admin_parts_match = list(self.template_admin_parts_re.finditer(template))
|
||||
|
||||
# last instance of city/state/country occurs before the first instance of house_number/road
|
||||
return admin_parts_match[-1].start() < address_parts_match.start()
|
||||
|
||||
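# Illustrative example (template invented): is_reverse() returns True for a
# big-endian layout such as
#
#     {{{country}}}
#     {{{state}}}
#     {{{city}}}
#     {{{road}}} {{{house_number}}}
#
# where the last city/state/country tag appears before the first
# house_number/road tag, and False for the usual Western ordering.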
def build_first_of_template(self, keys):
|
||||
""" For constructing """
|
||||
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
|
||||
|
||||
def tag_token(self, key):
|
||||
return '{{{{{{{key}}}}}}}'.format(key=key)
|
||||
|
||||
def remove_components(self, template, tags):
|
||||
new_components = []
|
||||
tags = set(tags)
|
||||
|
||||
parsed = pystache.parse(safe_decode(template))
|
||||
|
||||
last_removed = False
|
||||
for i, el in enumerate(parsed._parse_tree):
|
||||
if hasattr(el, 'parsed'):
|
||||
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key') and e.key not in tags]
|
||||
if keys:
|
||||
new_components.append(self.build_first_of_template(keys))
|
||||
last_removed = False
|
||||
else:
|
||||
last_removed = True
|
||||
elif hasattr(el, 'key'):
|
||||
if el.key not in tags:
|
||||
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
|
||||
last_removed = False
|
||||
else:
|
||||
last_removed = True
|
||||
|
||||
elif not last_removed:
|
||||
new_components.append(el)
|
||||
else:
|
||||
last_removed = False
|
||||
return ''.join(new_components).strip()
|
||||
|
||||
def insert_component(self, template, tag, before=None, after=None, first=False, last=False,
|
||||
separate=True, is_reverse=False, exact_order=True):
|
||||
if not before and not after and not first and not last:
|
||||
return
|
||||
|
||||
template = template.rstrip()
|
||||
|
||||
if not exact_order:
|
||||
first_template_regex = re.compile(six.u('{{#first}}.*?{{/first}}'), re.UNICODE)
|
||||
sans_firsts = first_template_regex.sub(six.u(''), template)
|
||||
|
||||
tag_match = re.compile(self.tag_token(tag)).search(sans_firsts)
|
||||
|
||||
if before:
|
||||
before_match = re.compile(self.tag_token(before)).search(sans_firsts)
|
||||
if before_match and tag_match and before_match.start() > tag_match.start():
|
||||
return template
|
||||
|
||||
if after:
|
||||
after_match = re.compile(self.tag_token(after)).search(sans_firsts)
|
||||
if after_match and tag_match and tag_match.start() > after_match.start():
|
||||
return template
|
||||
|
||||
key_added = False
|
||||
skip_next_non_token = False
|
||||
new_components = []
|
||||
|
||||
tag_token = self.tag_token(tag)
|
||||
|
||||
parsed = pystache.parse(safe_decode(template))
|
||||
num_tokens = len(parsed._parse_tree)
|
||||
for i, el in enumerate(parsed._parse_tree):
|
||||
|
||||
if hasattr(el, 'parsed'):
|
||||
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
|
||||
if (before in set(keys) or first) and not key_added:
|
||||
token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
|
||||
new_components.extend([tag_token, token])
|
||||
key_added = True
|
||||
|
||||
keys = [k for k in keys if self.aliases.get(k, k) != tag]
|
||||
if keys:
|
||||
new_components.append(self.build_first_of_template(keys))
|
||||
else:
|
||||
while new_components and '{' not in new_components[-1]:
|
||||
new_components.pop()
|
||||
continue
|
||||
|
||||
if (after in set(keys) or i == num_tokens - 1) and not key_added:
|
||||
token = '\n'
|
||||
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
||||
token = parsed._parse_tree[i + 1]
|
||||
new_components.extend([token, tag_token])
|
||||
key_added = True
|
||||
|
||||
elif hasattr(el, 'key'):
|
||||
if el.key == tag:
|
||||
if i == num_tokens - 1 and last:
|
||||
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
|
||||
|
||||
skip_next_non_token = True
|
||||
continue
|
||||
|
||||
if (el.key == before or first) and not key_added:
|
||||
token = '\n'
|
||||
if new_components and '{' not in new_components[-1]:
|
||||
token = new_components[-1]
|
||||
new_components.extend([tag_token, token])
|
||||
key_added = True
|
||||
|
||||
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
|
||||
|
||||
if (el.key == after or i == num_tokens - 1) and not key_added:
|
||||
token = '\n'
|
||||
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
||||
token = parsed._parse_tree[i + 1]
|
||||
new_components.extend([token, tag_token])
|
||||
key_added = True
|
||||
elif not skip_next_non_token:
|
||||
new_components.append(el)
|
||||
|
||||
if i == num_tokens - 1 and not key_added:
|
||||
key_added = True
|
||||
new_components.append(tag_token)
|
||||
|
||||
skip_next_non_token = False
|
||||
|
||||
return ''.join(new_components)
|
||||
|
||||
def add_postprocessing_tags(self, template, country, language=None):
|
||||
is_reverse = self.is_reverse(template)
|
||||
|
||||
i = None
|
||||
pivot = None
|
||||
|
||||
pivot_keys = (AddressFormatter.CITY, AddressFormatter.STATE, AddressFormatter.COUNTRY)
|
||||
|
||||
for component in pivot_keys:
|
||||
token = self.tag_token(component)
|
||||
if token in template:
|
||||
i = self.BOUNDARY_COMPONENTS_ORDERED.index(component)
|
||||
pivot = component
|
||||
break
|
||||
|
||||
if i is None:
|
||||
raise ValueError('Template {} does not contain one of {{{}}}'.format(country, ','.join(pivot_keys)))
|
||||
|
||||
prev = pivot
|
||||
|
||||
if i > 1:
|
||||
for component in self.BOUNDARY_COMPONENTS_ORDERED[i - 1:0:-1]:
|
||||
kw = {'before': prev} if not is_reverse else {'after': prev}
|
||||
template = self.insert_component(template, component, exact_order=False, **kw)
|
||||
prev = component
|
||||
|
||||
prev = pivot
|
||||
|
||||
if i < len(self.BOUNDARY_COMPONENTS_ORDERED) - 1:
|
||||
for component in self.BOUNDARY_COMPONENTS_ORDERED[i + 1:]:
|
||||
kw = {'after': prev} if not is_reverse else {'before': prev}
|
||||
template = self.insert_component(template, component, exact_order=False, **kw)
|
||||
prev = component
|
||||
|
||||
return template
|
||||
|
||||
def render_template(self, template, components, tagged=False):
|
||||
def render_first(text):
|
||||
text = pystache.render(text, **components)
|
||||
splits = (e.strip() for e in text.split('||'))
|
||||
selected = next(ifilter(bool, splits), '')
|
||||
return selected
|
||||
|
||||
output = pystache.render(template, first=render_first,
|
||||
**components).strip()
|
||||
|
||||
values = self.whitespace_component_regex.split(output)
|
||||
|
||||
splitter = self.splitter if not tagged else ' {}/{} '.format(self.splitter.strip(), self.field_separator_tag)
|
||||
|
||||
values = [self.strip_component(val, tagged=tagged) for val in values]
|
||||
|
||||
output = splitter.join([
|
||||
val for val in values if val.strip()
|
||||
])
|
||||
|
||||
return output
|
||||
|
||||
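# Illustrative sketch (components invented): with the default ' | ' splitter,
#
#     components = {'road': u'Calle de la Unión', 'house_number': u'2', 'city': u'Madrid'}
#
# and a template whose lines are
#
#     {{{road}}} {{{house_number}}}
#     {{#first}} {{{city}}} || {{{town}}} {{/first}}
#
# render_template() produces u'Calle de la Unión 2 | Madrid'; the {{#first}}
# block keeps the first non-empty alternative.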
def minimal_components(self, components):
|
||||
for component_list in self.MINIMAL_COMPONENT_KEYS:
|
||||
if all((c in components for c in component_list)):
|
||||
return True
|
||||
return False
|
||||
|
||||
def post_replacements(self, template, text):
|
||||
components = []
|
||||
seen = set()
|
||||
for component in text.split(self.splitter):
|
||||
component = component.strip()
|
||||
if component not in seen:
|
||||
components.append(component)
|
||||
seen.add(component)
|
||||
text = self.splitter.join(components)
|
||||
post_format_replacements = template.get('postformat_replace')
|
||||
if post_format_replacements:
|
||||
for regex, replacement in post_format_replacements:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
def revised_template(self, template, components, country, language=None):
|
||||
if not template:
|
||||
return None
|
||||
|
||||
country_language = None
|
||||
if language:
|
||||
country_language = '{}_{}'.format(country, language)
|
||||
|
||||
alias_country = self.country_aliases.get(country.upper(), country).lower()
|
||||
for term in (country, country_language):
|
||||
if term in self.country_insertions or term in self.country_conditionals:
|
||||
break
|
||||
else:
|
||||
country = alias_country
|
||||
|
||||
cache_keys = []
|
||||
|
||||
invert_probability = self.country_invert_probabilities.get(country, self.global_invert_probability)
|
||||
if random.random() < invert_probability:
|
||||
cache_keys.append('inverted')
|
||||
cache_key = tuple(sorted(cache_keys))
|
||||
if cache_key in self.template_cache:
|
||||
template = self.template_cache[cache_key]
|
||||
else:
|
||||
template = self.inverted(template)
|
||||
self.template_cache[cache_key] = template
|
||||
|
||||
for component in sorted(components, key=self.component_order.get):
|
||||
scope = country
|
||||
insertions = nested_get(self.country_insertions, (country, component), default=None)
|
||||
conditionals = nested_get(self.country_conditionals, (country, component), default=None)
|
||||
|
||||
if insertions is None and language:
|
||||
insertions = nested_get(self.country_insertions, (country_language, component), default=None)
|
||||
scope = country_language
|
||||
|
||||
if conditionals is None and language:
|
||||
conditionals = nested_get(self.country_conditionals, (country_language, component), default=None)
|
||||
|
||||
if insertions is None and language:
|
||||
insertions = nested_get(self.language_insertions, (language, component), default=None)
|
||||
scope = 'lang:{}'.format(language)
|
||||
|
||||
if conditionals is None and language:
|
||||
conditionals = nested_get(self.language_conditionals, (language, component), default=None)
|
||||
|
||||
if insertions is None:
|
||||
insertions = nested_get(self.global_insertions, (component,), default=None)
|
||||
scope = None
|
||||
|
||||
if conditionals is None:
|
||||
conditionals = nested_get(self.global_conditionals, (component,), default=None)
|
||||
|
||||
if insertions is not None:
|
||||
conditional_insertions = None
|
||||
if conditionals is not None:
|
||||
for k, v in six.iteritems(conditionals):
|
||||
if k in components:
|
||||
conditional_insertions = v
|
||||
break
|
||||
|
||||
order, other = None, None
|
||||
|
||||
# Check the conditional probabilities first
|
||||
if conditional_insertions is not None:
|
||||
values, probs = conditional_insertions
|
||||
order, other = weighted_choice(values, probs)
|
||||
|
||||
# If there are no conditional probabilites or the "default" value was chosen, sample from the marginals
|
||||
if other is None:
|
||||
values, probs = insertions
|
||||
order, other = weighted_choice(values, probs)
|
||||
|
||||
# Even though we may change the value of "other" below, use
|
||||
# the original cache key because changes from here on are
|
||||
# deterministic and should be cached.
|
||||
insertion_id = (scope, component, order, other)
|
||||
cache_keys.append(insertion_id)
|
||||
|
||||
cache_key = tuple(sorted(cache_keys))
|
||||
|
||||
if cache_key in self.template_cache:
|
||||
template = self.template_cache[cache_key]
|
||||
continue
|
||||
|
||||
other_token = self.tag_token(other)
|
||||
|
||||
# Don't allow insertions between road and house_number
|
||||
# This can happen if e.g. "level" is supposed to be inserted
|
||||
# after house number assuming that it's a continental European
|
||||
# address where house number comes after road. If in a previous
|
||||
# insertion we were to swap house_number and road to create an
|
||||
# English-style address, the final ordering would be
|
||||
# house_number, unit, road, which we don't want. So effectively
|
||||
# treat house_number and road as an atomic unit.
|
||||
|
||||
if other == self.HOUSE_NUMBER and component != self.ROAD:
|
||||
road_tag = self.tag_token(self.ROAD)
|
||||
house_number_tag = other_token
|
||||
|
||||
if house_number_tag in template and road_tag in template:
|
||||
road_after_house_number = template.index(road_tag) > template.index(house_number_tag)
|
||||
|
||||
if road_after_house_number and order == self.AFTER:
|
||||
other = self.ROAD
|
||||
elif not road_after_house_number and order == self.BEFORE:
|
||||
other = self.ROAD
|
||||
elif other == self.ROAD and component != self.HOUSE_NUMBER:
|
||||
house_number_tag = self.tag_token(self.HOUSE_NUMBER)
|
||||
road_tag = other_token
|
||||
|
||||
if house_number_tag in template and road_tag in template:
|
||||
road_before_house_number = template.index(road_tag) < template.index(house_number_tag)
|
||||
|
||||
if road_before_house_number and order == self.AFTER:
|
||||
other = self.HOUSE_NUMBER
|
||||
elif not road_before_house_number and order == self.BEFORE:
|
||||
other = self.HOUSE_NUMBER
|
||||
|
||||
if order == self.BEFORE and other_token in template:
|
||||
template = self.insert_component(template, component, before=other)
|
||||
elif order == self.AFTER and other_token in template:
|
||||
template = self.insert_component(template, component, after=other)
|
||||
elif order == self.LAST:
|
||||
template = self.insert_component(template, component, last=True)
|
||||
elif order == self.FIRST:
|
||||
template = self.insert_component(template, component, first=True)
|
||||
else:
|
||||
continue
|
||||
|
||||
self.template_cache[cache_key] = template
|
||||
|
||||
return template
|
||||
|
||||
def remove_repeat_template_separators(self, template):
|
||||
return re.sub('(?:[\s]*([,;\-]/{})[\s]*){{2,}}'.format(self.separator_tag), r' \1 ', template)
|
||||
|
||||
def tag_template_separators(self, template):
|
||||
template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template)
|
||||
return template
|
||||
|
||||
def strip_component(self, value, tagged=False):
|
||||
if not tagged:
|
||||
comma = token_types.COMMA.value
|
||||
hyphen = token_types.HYPHEN.value
|
||||
|
||||
start = end = 0
|
||||
tokens = tokenize_raw(value.strip())
|
||||
for token_start, token_length, token_type in tokens:
|
||||
start = token_start
|
||||
if token_type not in (comma, hyphen):
|
||||
break
|
||||
else:
|
||||
start = token_start + token_length
|
||||
|
||||
for token_start, token_length, token_type in reversed(tokens):
|
||||
end = token_start + token_length
|
||||
if token_type not in (comma, hyphen):
|
||||
break
|
||||
else:
|
||||
end = token_start
|
||||
|
||||
return value[start:end]
|
||||
else:
|
||||
start = end = 0
|
||||
tokens = value.split()
|
||||
|
||||
separator_tag = self.separator_tag
|
||||
|
||||
for i, t in enumerate(tokens):
|
||||
t, c = t.rsplit('/', 1)
|
||||
start = i
|
||||
if c != separator_tag:
|
||||
break
|
||||
else:
|
||||
start = i + 1
|
||||
|
||||
num_tokens = len(tokens)
|
||||
|
||||
for j, t in enumerate(reversed(tokens)):
|
||||
t, c = t.rsplit('/', 1)
|
||||
end = num_tokens - j
|
||||
if c != separator_tag:
|
||||
break
|
||||
else:
|
||||
end = num_tokens - j - 1
|
||||
|
||||
return six.u(' ').join(tokens[start:end])
|
||||
|
||||
def get_template_from_config(self, config, country, language=None):
|
||||
template = None
|
||||
if language:
|
||||
language = self.language_code_replacements.get(language, language.split('_')[0])
|
||||
# For countries like China and Japan where the country format varies
|
||||
# based on which language is being used
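# e.g. the config might contain both a "CN_zh" and a "CN_en" key (key names shown
# here are illustrative); the language-qualified key is tried first and we fall back
# to the bare country code below.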
|
||||
template = config.get('{}_{}'.format(country.upper(), language.lower()), None)
|
||||
|
||||
if not template:
|
||||
template = config.get(country.upper())
|
||||
|
||||
if not template:
|
||||
return None
|
||||
|
||||
return template
|
||||
|
||||
def get_template(self, country, language=None):
|
||||
return self.get_template_from_config(self.country_formats, country, language=language)
|
||||
|
||||
def get_no_name_template(self, country, language=None):
|
||||
return self.get_template_from_config(self.templates_no_name, country, language=language)
|
||||
|
||||
def get_place_template(self, country, language=None):
|
||||
return self.get_template_from_config(self.templates_place_only, country, language=language)
|
||||
|
||||
def tagged_tokens(self, name, label):
|
||||
return six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), label if t != ',' else self.separator_tag) for t, c in tokenize(name)])
|
||||
|
||||
def template_language_matters(self, country, language):
|
||||
return '{}_{}'.format(country.upper(), language) in self.country_formats or '{}_{}'.format(country, language) in self.country_formats
|
||||
|
||||
def format_category_query(self, category_query, address_components, country, language, tag_components=True):
|
||||
if tag_components:
|
||||
components = {self.CATEGORY: self.tagged_tokens(category_query.category, self.CATEGORY)}
|
||||
if category_query.prep is not None:
|
||||
components[self.NEAR] = self.tagged_tokens(category_query.prep, self.NEAR)
|
||||
else:
|
||||
components = {self.CATEGORY: category_query.category}
|
||||
if category_query.prep is not None:
|
||||
components[self.NEAR] = category_query.prep
|
||||
|
||||
if category_query.add_place_name or category_query.add_address:
|
||||
place_formatted = self.format_address(address_components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
if not place_formatted:
|
||||
return None
|
||||
components['place'] = place_formatted
|
||||
|
||||
return self.render_template(self.category_template, components, tagged=tag_components)
|
||||
|
||||
def format_chain_query(self, chain_query, address_components, country, language, tag_components=True):
|
||||
if tag_components:
|
||||
components = {self.HOUSE: self.tagged_tokens(chain_query.name, self.HOUSE)}
|
||||
if chain_query.prep is not None:
|
||||
components[self.NEAR] = self.tagged_tokens(chain_query.prep, self.NEAR)
|
||||
else:
|
||||
components = {self.HOUSE: chain_query.name}
|
||||
if chain_query.prep is not None:
|
||||
components[self.NEAR] = chain_query.prep
|
||||
|
||||
if chain_query.add_place_name or chain_query.add_address:
|
||||
place_formatted = self.format_address(address_components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
if not place_formatted:
|
||||
return None
|
||||
components['place'] = place_formatted
|
||||
|
||||
return self.render_template(self.chain_template, components, tagged=tag_components)
|
||||
|
||||
def format_intersection(self, intersection_query, place_components, country, language, tag_components=True):
|
||||
components = {}
|
||||
if tag_components:
|
||||
components = {'road1': self.tagged_tokens(intersection_query.road1, self.ROAD),
|
||||
'intersection': self.tagged_tokens(intersection_query.intersection_phrase, self.INTERSECTION),
|
||||
'road2': self.tagged_tokens(intersection_query.road2, self.ROAD),
|
||||
}
|
||||
else:
|
||||
components = {'road1': intersection_query.road1,
|
||||
'intersection': intersection_query.intersection_phrase,
|
||||
'road2': intersection_query.road2}
|
||||
|
||||
if place_components:
|
||||
place_formatted = self.format_address(place_components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
|
||||
if place_formatted:
|
||||
components['place'] = place_formatted
|
||||
return self.render_template(self.intersection_template, components, tagged=tag_components)
|
||||
|
||||
def format_address(self, components, country, language,
|
||||
minimal_only=True, tag_components=True, replace_aliases=True):
|
||||
if minimal_only and not self.minimal_components(components):
|
||||
return None
|
||||
|
||||
template = self.get_template(country, language=language)
|
||||
if not template:
|
||||
return None
|
||||
|
||||
if not template or 'address_template' not in template:
|
||||
return None
|
||||
template_text = template['address_template']
|
||||
|
||||
template_text = self.revised_template(template_text, components, country, language=language)
|
||||
if template_text is None:
|
||||
return None
|
||||
|
||||
if tag_components:
|
||||
template_text = self.tag_template_separators(template_text)
|
||||
|
||||
if template_text in self.parsed_cache:
|
||||
template = self.parsed_cache[template_text]
|
||||
else:
|
||||
template = pystache.parse(template_text)
|
||||
self.parsed_cache[template_text] = template
|
||||
|
||||
if replace_aliases:
|
||||
self.aliases.replace(components)
|
||||
|
||||
if tag_components:
|
||||
components = {k: self.tagged_tokens(v, k) for k, v in six.iteritems(components)}
|
||||
|
||||
text = self.render_template(template, components, tagged=tag_components)
|
||||
|
||||
text = self.remove_repeat_template_separators(text)
|
||||
|
||||
return text
|
||||
0
scripts/geodata/addresses/__init__.py
Normal file
59
scripts/geodata/addresses/blocks.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Block(NumberedComponent):
|
||||
max_blocks = 10
|
||||
|
||||
block_range = range(1, max_blocks + 1)
|
||||
block_range_probs = zipfian_distribution(len(block_range), 2.0)
|
||||
block_range_cdf = cdf(block_range_probs)
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
number = weighted_choice(cls.block_range, cls.block_range_cdf)
|
||||
return safe_decode(number)
|
||||
else:
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
|
||||
if alphabet_probability is not None and random.random() >= alphabet_probability:
|
||||
alphabet = latin_alphabet
|
||||
letter = sample_alphabet(alphabet, 2.0)
|
||||
if num_type == cls.ALPHA:
|
||||
return safe_decode(letter)
|
||||
else:
|
||||
number = weighted_choice(cls.block_range, cls.block_range_cdf)
|
||||
|
||||
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
|
||||
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, block, language, country=None):
|
||||
if block is None:
|
||||
return None
|
||||
|
||||
phrase_prob = address_config.get_property('blocks.alphanumeric_phrase_probability', language, country=country, default=0.0)
|
||||
if random.random() < phrase_prob:
|
||||
return cls.numeric_phrase('blocks.alphanumeric', block, language,
|
||||
dictionaries=['qualifiers'], country=country)
|
||||
else:
|
||||
return None
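# Example usage (a sketch; the result depends on the per-language YAML config and on
# random sampling, so the values shown are purely illustrative):
#
#   >>> block = Block.random('ja')      # e.g. u'3', or None if blocks aren't configured
#   >>> Block.phrase(block, 'ja')       # e.g. the number wrapped in a block phrase, or None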
|
||||
2022
scripts/geodata/addresses/components.py
Normal file
File diff suppressed because it is too large
152
scripts/geodata/addresses/config.py
Normal file
@@ -0,0 +1,152 @@
|
||||
|
||||
import copy
|
||||
import os
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
|
||||
from geodata.math.sampling import cdf, check_probability_distribution
|
||||
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'addresses')
|
||||
|
||||
DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'dictionaries')
|
||||
|
||||
|
||||
class AddressConfig(object):
|
||||
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
|
||||
self.address_configs = {}
|
||||
self.cache = {}
|
||||
|
||||
for filename in os.listdir(config_dir):
|
||||
if not filename.endswith('.yaml'):
|
||||
continue
|
||||
config = yaml.load(open(os.path.join(config_dir, filename)))
|
||||
countries = config.pop('countries', {})
|
||||
|
||||
for k in countries.keys():
|
||||
country_config = countries[k]
|
||||
config_copy = copy.deepcopy(config)
|
||||
countries[k] = recursive_merge(config_copy, country_config)
|
||||
|
||||
config['countries'] = countries
|
||||
|
||||
lang = filename.rsplit('.yaml')[0]
|
||||
self.address_configs[lang] = config
|
||||
|
||||
self.sample_phrases = {}
|
||||
|
||||
for language in address_phrase_dictionaries.languages:
|
||||
for dictionary in address_phrase_dictionaries.language_dictionaries[language]:
|
||||
self.sample_phrases[(language, dictionary)] = {}
|
||||
for phrases in address_phrase_dictionaries.phrases[(language, dictionary)]:
|
||||
self.sample_phrases[(language, dictionary)][phrases[0]] = phrases[1:]
|
||||
|
||||
def get_property(self, key, language, country=None, default=None):
|
||||
keys = key.split('.')
|
||||
config = self.address_configs.get(language, {})
|
||||
|
||||
if country:
|
||||
country_config = config.get('countries', {}).get(country, {})
|
||||
if country_config:
|
||||
config = country_config
|
||||
|
||||
value = nested_get(config, keys)
|
||||
if value is not DoesNotExist:
|
||||
return value
|
||||
|
||||
return default
|
||||
|
||||
def cache_key(self, prop, language, dictionaries=(), country=None):
|
||||
return (prop, language, country, tuple(dictionaries))
|
||||
|
||||
def alternative_probabilities(self, prop, language, dictionaries=(), country=None):
|
||||
'''Get a probability distribution over alternatives'''
|
||||
key = self.cache_key(prop, language, dictionaries, country=country)
|
||||
if key not in self.cache:
|
||||
properties = self.get_property(prop, language, country=country, default=None)
|
||||
|
||||
if properties is None:
|
||||
return None, None
|
||||
|
||||
alternatives, probs = alternative_probabilities(properties)
|
||||
if alternatives is None:
|
||||
return None, None
|
||||
|
||||
forms = []
|
||||
form_probs = []
|
||||
|
||||
for props, prob in zip(alternatives, probs):
|
||||
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
|
||||
forms.extend([(p, props) for p in phrases])
|
||||
form_probs.extend([prob * p for p in phrase_probs])
|
||||
|
||||
sample_probability = properties.get('sample_probability')
|
||||
if sample_probability is not None:
|
||||
sample_phrases = []
|
||||
for dictionary in dictionaries:
|
||||
phrases = self.sample_phrases.get((language, dictionary), [])
|
||||
for canonical, surface_forms in six.iteritems(phrases):
|
||||
sample_phrases.append(canonical)
|
||||
sample_phrases.extend(surface_forms)
|
||||
# Note: use the outer properties dictionary e.g. units.alphanumeric
|
||||
forms.extend([(p, properties) for p in sample_phrases])
|
||||
form_probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))
|
||||
|
||||
try:
|
||||
check_probability_distribution(form_probs)
|
||||
except AssertionError:
|
||||
print 'values were: {}'.format(forms)
|
||||
raise
|
||||
|
||||
form_probs_cdf = cdf(form_probs)
|
||||
self.cache[key] = (forms, form_probs_cdf)
|
||||
return self.cache[key]
|
||||
|
||||
def form_probabilities(self, properties, language, dictionaries=()):
|
||||
probs = []
|
||||
alternatives = []
|
||||
canonical_prob = properties.get('canonical_probability', 1.0)
|
||||
canonical = properties['canonical']
|
||||
|
||||
alternatives.append(canonical)
|
||||
probs.append(canonical_prob)
|
||||
|
||||
if 'abbreviated_probability' in properties:
|
||||
probs.append(properties['abbreviated_probability'])
|
||||
abbreviated = properties['abbreviated']
|
||||
assert isinstance(abbreviated, basestring)
|
||||
alternatives.append(abbreviated)
|
||||
|
||||
if properties.get('sample', False) and 'sample_probability' in properties:
|
||||
sample_prob = properties['sample_probability']
|
||||
samples = set()
|
||||
for dictionary in dictionaries:
|
||||
phrases = self.sample_phrases.get((language, dictionary), {})
|
||||
samples |= set(phrases.get(canonical, []))
|
||||
if 'sample_exclude' in properties:
|
||||
samples -= set(properties['sample_exclude'])
|
||||
if samples:
|
||||
for phrase in samples:
|
||||
probs.append(sample_prob / float(len(samples)))
|
||||
alternatives.append(phrase)
|
||||
else:
|
||||
total = sum(probs)
|
||||
probs = [p / total for p in probs]
|
||||
|
||||
try:
|
||||
check_probability_distribution(probs)
|
||||
except AssertionError:
|
||||
print 'values were: {}'.format(alternatives)
|
||||
raise
|
||||
|
||||
return alternatives, probs
|
||||
|
||||
address_config = AddressConfig()
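# Example usage (a sketch; the key names below appear elsewhere in this codebase, but
# the returned values depend entirely on the YAML files under resources/addresses):
#
#   >>> address_config.get_property('levels.numbering_starts_at', 'en', country='us', default=0)
#   >>> address_config.alternative_probabilities('levels.alphanumeric', 'en', dictionaries=['level_types_numbered'])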
|
||||
37
scripts/geodata/addresses/conjunctions.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import six
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
|
||||
class Conjunction(object):
|
||||
DEFAULT_WHITESPACE_JOIN = ', '
|
||||
DEFAULT_NON_WHITESPACE_JOIN = ''
|
||||
key = 'and'
|
||||
|
||||
@classmethod
|
||||
def join(cls, phrases, language, country=None):
|
||||
|
||||
if not hasattr(phrases, '__iter__'):
|
||||
raise ValueError('Param phrases must be iterable')
|
||||
|
||||
values, probs = address_config.alternative_probabilities(cls.key, language, country=country)
|
||||
phrase, props = weighted_choice(values, probs)
|
||||
|
||||
whitespace = props.get('whitespace', True)
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
|
||||
phrases = [safe_decode(p) for p in phrases]
|
||||
|
||||
max_phrase_join = props.get('max_phrase_join', 2)
|
||||
if len(phrases) > max_phrase_join:
|
||||
default_join = safe_decode(props.get('default_join', cls.DEFAULT_WHITESPACE_JOIN if whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN))
|
||||
prefix = default_join.join(phrases[:-max_phrase_join] + [six.u('')])
|
||||
else:
|
||||
prefix = six.u('')
|
||||
|
||||
if whitespace:
|
||||
phrase = six.u('{}{}{}').format(whitespace_phrase, phrase, whitespace_phrase)
|
||||
joined_phrase = phrase.join(phrases[-max_phrase_join:])
|
||||
|
||||
return six.u('').join([prefix, joined_phrase])
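# Example (a sketch; the joining phrase and whitespace behaviour come from the
# language's "and" config entry, so the exact strings vary):
#
#   >>> Conjunction.join([u'2', u'3'], 'en')        # might yield u'2 and 3'
#   >>> Conjunction.join([u'1', u'2', u'3'], 'en')  # with max_phrase_join=2, e.g. u'1, 2 and 3'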
|
||||
19
scripts/geodata/addresses/conscription_numbers.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class ConscriptionNumber(NumberedComponent):
|
||||
@classmethod
|
||||
def phrase(cls, number, language, country=None):
|
||||
if number is None:
|
||||
return number
|
||||
|
||||
key = 'conscription_numbers.alphanumeric'
|
||||
dictionaries = ['house_numbers']
|
||||
default = safe_decode(number)
|
||||
|
||||
return cls.numeric_phrase(key, safe_decode(number), language,
|
||||
dictionaries=dictionaries, country=country)
|
||||
42
scripts/geodata/addresses/dependencies.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import operator
|
||||
import six
|
||||
|
||||
from geodata.graph.topsort import topsort
|
||||
|
||||
|
||||
class ComponentDependencies(object):
|
||||
'''
|
||||
Declare an address component and its dependencies e.g.
|
||||
a house_number cannot be used in the absence of a road name.
|
||||
'''
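# A minimal sketch of the expected input (component names are illustrative):
#
#   graph = {'road': [], 'house_number': ['road'], 'unit': ['house_number']}
#   deps = ComponentDependencies(graph)
#   deps['house_number']   # bitmask requiring the 'road' bit to be set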
|
||||
|
||||
component_bit_values = {}
|
||||
|
||||
def __init__(self, graph):
|
||||
self.dependencies = {}
|
||||
|
||||
self.all_values = long('1' * len(graph), 2)
|
||||
|
||||
self.dependency_order = [c for c in topsort(graph)]
|
||||
|
||||
for component, deps in six.iteritems(graph):
|
||||
self.dependencies[component] = self.component_bitset(deps) if deps else self.all_values
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.dependencies.__getitem__(key)
|
||||
|
||||
def __contains__(self, key):
|
||||
return self.dependencies.__contains__(key)
|
||||
|
||||
@classmethod
|
||||
def get_component_bit_value(cls, name):
|
||||
val = cls.component_bit_values.get(name)
|
||||
if val is None:
|
||||
num_values = len(cls.component_bit_values)
|
||||
val = 1 << num_values
|
||||
cls.component_bit_values[name] = val
|
||||
return val
|
||||
|
||||
@classmethod
|
||||
def component_bitset(cls, components):
|
||||
return reduce(operator.or_, [cls.get_component_bit_value(name) for name in components])
|
||||
37
scripts/geodata/addresses/directions.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumericPhrase
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
|
||||
class RelativeDirection(NumericPhrase):
|
||||
key = 'directions'
|
||||
dictionaries = ['unit_directions']
|
||||
|
||||
|
||||
class AnteroposteriorDirection(RelativeDirection):
|
||||
key = 'directions.anteroposterior'
|
||||
|
||||
|
||||
class LateralDirection(RelativeDirection):
|
||||
key = 'directions.lateral'
|
||||
|
||||
|
||||
class CardinalDirection(NumericPhrase):
|
||||
key = 'cardinal_directions'
|
||||
dictionaries = ['cardinal_directions']
|
||||
|
||||
|
||||
class Direction(object):
|
||||
CARDINAL = 'cardinal'
|
||||
RELATIVE = 'relative'
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None, cardinal_probability=0.5):
|
||||
values = [cls.CARDINAL, cls.RELATIVE]
|
||||
probs_cdf = [cardinal_probability, 1.0]
|
||||
|
||||
choice = weighted_choice(values, probs_cdf)
|
||||
if choice == cls.CARDINAL:
|
||||
return CardinalDirection.phrase(None, language, country=country)
|
||||
else:
|
||||
return RelativeDirection.phrase(None, language, country=country)
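# Example (a sketch; the actual phrases are sampled from the language's
# cardinal_directions / unit_directions dictionaries, so these strings are illustrative):
#
#   >>> Direction.random('en')   # roughly half the time a cardinal like u'North',
#                                # otherwise a relative direction such as u'Rear'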
|
||||
66
scripts/geodata/addresses/entrances.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.directions import RelativeDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Entrance(NumberedComponent):
|
||||
max_entrances = 10
|
||||
|
||||
entrance_range = range(1, max_entrances + 1)
|
||||
entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
|
||||
entrance_range_cdf = cdf(entrance_range_probs)
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
|
||||
return safe_decode(number)
|
||||
elif num_type == cls.HYPHENATED_NUMBER:
|
||||
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
|
||||
number2 = number + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
|
||||
return u'{}-{}'.format(number, number2)
|
||||
else:
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
|
||||
if alphabet_probability is not None and random.random() >= alphabet_probability:
|
||||
alphabet = latin_alphabet
|
||||
letter = sample_alphabet(alphabet, 2.0)
|
||||
if num_type == cls.ALPHA:
|
||||
return safe_decode(letter)
|
||||
else:
|
||||
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
|
||||
|
||||
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
|
||||
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
|
||||
whitespace_phrase = u''
|
||||
r = random.random()
|
||||
if r < whitespace_probability:
|
||||
whitespace_phrase = u' '
|
||||
elif r < (whitespace_probability + hyphen_probability):
|
||||
whitespace_phrase = u'-'
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, entrance, language, country=None):
|
||||
if entrance is None:
|
||||
return None
|
||||
return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
|
||||
dictionaries=['entrances'], country=country)
|
||||
165
scripts/geodata/addresses/floors.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.numbers.spellout import numeric_expressions
|
||||
|
||||
|
||||
class Floor(NumberedComponent):
|
||||
# When we don't know the number of floors, use a Zipfian distribution
|
||||
# to choose randomly between 1 and max_floors with 1 being much more
|
||||
# likely than 2, etc.
|
||||
max_floors = 10
|
||||
max_basements = 2
|
||||
numbered_floors = range(max_floors + 1) + range(-1, -max_basements - 1, -1)
|
||||
floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
|
||||
floor_probs_cdf = cdf(floor_probs)
|
||||
|
||||
# For use with letters e.g. A0 is probably not as common
|
||||
floors_letters = range(1, max_floors + 1) + [0]
|
||||
floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
|
||||
floors_letters_cdf = cdf(floors_letters_probs)
|
||||
|
||||
@classmethod
|
||||
def sample_floors(cls, num_floors, num_basements=0):
|
||||
num_floors = int(num_floors)
|
||||
return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)
|
||||
|
||||
@classmethod
|
||||
def sample_floors_range(cls, min_floor, max_floor):
|
||||
return random.randint(min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)
|
||||
|
||||
@classmethod
|
||||
def random_int(cls, language, country=None, num_floors=None, num_basements=None):
|
||||
number = None
|
||||
if num_floors is not None:
|
||||
try:
|
||||
num_floors = int(num_floors)
|
||||
except (ValueError, TypeError):
|
||||
return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
|
||||
|
||||
if num_floors <= cls.max_floors:
|
||||
number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
|
||||
else:
|
||||
number = cls.sample_floors_range(cls.max_floors + 1, num_floors)
|
||||
|
||||
else:
|
||||
number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
|
||||
|
||||
return number
|
||||
|
||||
@classmethod
|
||||
def random_from_int(cls, number, language, country=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
|
||||
|
||||
if number >= 0:
|
||||
number += numbering_starts_at
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
return safe_decode(number)
|
||||
elif num_type == cls.ROMAN_NUMERAL:
|
||||
roman_numeral = numeric_expressions.roman_numeral(number)
|
||||
if roman_numeral is not None:
|
||||
return roman_numeral
|
||||
else:
|
||||
return safe_decode(number)
|
||||
elif num_type == cls.HYPHENATED_NUMBER:
|
||||
number2 = number + cls.sample_floors_range(1, cls.max_floors)
|
||||
return u'{}-{}'.format(number, number2)
|
||||
else:
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
|
||||
if alphabet_probability is not None and random.random() >= alphabet_probability:
|
||||
alphabet = latin_alphabet
|
||||
letter = sample_alphabet(alphabet)
|
||||
if num_type == cls.ALPHA:
|
||||
return letter
|
||||
else:
|
||||
number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}').format(letter, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}').format(number, letter)
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None, num_floors=None, num_basements=None):
|
||||
number = cls.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
|
||||
return cls.random_from_int(number, language, country=country)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, floor, language, country=None, num_floors=None):
|
||||
if floor is None:
|
||||
return None
|
||||
|
||||
integer_floor = False
|
||||
floor = safe_decode(floor)
|
||||
try:
|
||||
floor = int(floor)
|
||||
integer_floor = True
|
||||
except (ValueError, TypeError):
|
||||
try:
|
||||
floor = float(floor)
|
||||
integer_floor = int(floor) == floor
|
||||
except (ValueError, TypeError):
|
||||
return cls.numeric_phrase('levels.alphanumeric', floor, language,
|
||||
dictionaries=['level_types_numbered'], country=country)
|
||||
|
||||
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
|
||||
try:
|
||||
num_floors = int(num_floors)
|
||||
top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
|
||||
is_top = num_floors and floor == top_floor
|
||||
except (ValueError, TypeError):
|
||||
is_top = False
|
||||
|
||||
alias_prefix = 'levels.aliases'
|
||||
aliases = address_config.get_property(alias_prefix, language, country=country)
|
||||
if aliases:
|
||||
alias = None
|
||||
|
||||
if not integer_floor and floor >= 0 and 'half_floors' in aliases:
|
||||
floor = int(floor)
|
||||
alias = 'half_floors'
|
||||
elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
|
||||
floor = int(floor)
|
||||
alias = 'half_floors_negative'
|
||||
elif floor < -1 and '<-1' in aliases:
|
||||
alias = '<-1'
|
||||
elif is_top and 'top' in aliases:
|
||||
alias = 'top'
|
||||
elif safe_decode(floor) in aliases:
|
||||
alias = safe_decode(floor)
|
||||
|
||||
floor = safe_decode(floor)
|
||||
|
||||
if alias:
|
||||
alias_props = aliases.get(alias)
|
||||
|
||||
# Aliases upon aliases, e.g. for something like "Upper Mezzanine"
|
||||
# where it's an alias for "1" under the half_floors key
|
||||
if safe_decode(floor) in alias_props.get('aliases', {}):
|
||||
alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
|
||||
alias = safe_decode(floor)
|
||||
|
||||
if alias:
|
||||
return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
|
||||
dictionaries=['level_types_basement',
|
||||
'level_types_mezzanine',
|
||||
'level_types_numbered',
|
||||
'level_types_standalone',
|
||||
'level_types_sub_basement'],
|
||||
country=country)
|
||||
|
||||
return cls.numeric_phrase('levels.alphanumeric', floor, language,
|
||||
dictionaries=['level_types_numbered'], country=country)
|
||||
26
scripts/geodata/addresses/house_numbers.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class HouseNumber(NumberedComponent):
|
||||
@classmethod
|
||||
def phrase(cls, number, language, country=None):
|
||||
if number is not None:
|
||||
prob_key = 'house_numbers.alphanumeric_phrase_probability'
|
||||
key = 'house_numbers.alphanumeric'
|
||||
dictionaries = ['house_numbers', 'number']
|
||||
default = safe_decode(number)
|
||||
else:
|
||||
prob_key = 'house_numbers.no_number_probability'
|
||||
key = 'house_numbers.no_number'
|
||||
dictionaries = ['no_number']
|
||||
default = None
|
||||
|
||||
phrase_prob = address_config.get_property(prob_key, language, country=country, default=0.0)
|
||||
if random.random() < phrase_prob:
|
||||
return cls.numeric_phrase(key, safe_decode(number), language,
|
||||
dictionaries=dictionaries, country=country)
|
||||
return default
|
||||
24
scripts/geodata/addresses/metro_stations.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from geodata.addresses.config import address_config
|
||||
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumericPhrase
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class MetroStationPhrase(NumericPhrase):
|
||||
key = 'metro_stations.alphanumeric'
|
||||
dictionaries = ['qualifiers']
|
||||
|
||||
|
||||
class MetroStation(object):
|
||||
@classmethod
|
||||
def phrase(cls, station, language, country=None):
|
||||
if station is None:
|
||||
return None
|
||||
phrase_prob = address_config.get_property('metro_stations.alphanumeric_phrase_probability', language, country=country, default=0.0)
|
||||
if random.random() < phrase_prob:
|
||||
return MetroStationPhrase.phrase(station, language, country=country)
|
||||
|
||||
return None
|
||||
434
scripts/geodata/addresses/numbering.py
Normal file
@@ -0,0 +1,434 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.math.floats import isclose
|
||||
from geodata.numbers.ordinals import ordinal_expressions
|
||||
from geodata.numbers.spellout import numeric_expressions
|
||||
from geodata.text.tokenize import tokenize, token_types
|
||||
|
||||
alphabets = {}
|
||||
|
||||
|
||||
def sample_alphabet(alphabet, b=1.5):
|
||||
'''
|
||||
Sample an "alphabet" using a Zipfian distribution (frequent items are very
|
||||
frequent, long tail of infrequent items). If we look at something like
|
||||
unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
|
||||
"Unit Z" simply because most dwellings only have a few units. Sampling
|
||||
letters from a Zipfian distribution rather than uniformly means that instead
|
||||
of every letter having the same likelihood (1/26), letters toward the beginning
|
||||
of the alphabet are much more likely to be selected. Letters toward the end can
|
||||
still be selected sometimes, but are not very likely.
|
||||
|
||||
Note letters don't necessarily need to be sorted alphabetically, just in order
|
||||
of frequency.
|
||||
'''
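# Rough intuition (probabilities are illustrative, not exact): with b=1.5 over the
# Latin alphabet, 'A' is returned far more often than 'B', 'B' more often than 'C',
# and so on, while 'Z' remains possible but rare.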
|
||||
global alphabets
|
||||
alphabet = tuple(alphabet)
|
||||
if alphabet not in alphabets:
|
||||
probs = zipfian_distribution(len(alphabet), b)
|
||||
probs_cdf = cdf(probs)
|
||||
|
||||
alphabets[alphabet] = probs_cdf
|
||||
|
||||
probs_cdf = alphabets[alphabet]
|
||||
return weighted_choice(alphabet, probs_cdf)
|
||||
|
||||
latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
|
||||
|
||||
|
||||
class Digits(object):
|
||||
ASCII = 'ascii'
|
||||
SPELLOUT = 'spellout'
|
||||
UNICODE_FULL_WIDTH = 'unicode_full_width'
|
||||
ROMAN_NUMERAL = 'roman_numeral'
|
||||
|
||||
CARDINAL = 'cardinal'
|
||||
ORDINAL = 'ordinal'
|
||||
|
||||
unicode_full_width_map = {
|
||||
'0': safe_decode('０'),
|
||||
'1': safe_decode('１'),
|
||||
'2': safe_decode('２'),
|
||||
'3': safe_decode('３'),
|
||||
'4': safe_decode('４'),
|
||||
'5': safe_decode('５'),
|
||||
'6': safe_decode('６'),
|
||||
'7': safe_decode('７'),
|
||||
'8': safe_decode('８'),
|
||||
'9': safe_decode('９'),
|
||||
}
|
||||
|
||||
full_width_digit_map = {
|
||||
v: k for k, v in six.iteritems(unicode_full_width_map)
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def rewrite_full_width(cls, s):
|
||||
return six.u('').join([cls.unicode_full_width_map.get(c, c) for c in s])
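# e.g. rewrite_full_width(u'12A') gives u'１２A': only the ASCII digits are mapped,
# any other character passes through unchanged.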
|
||||
|
||||
@classmethod
|
||||
def rewrite_standard_width(cls, s):
|
||||
return six.u('').join([cls.full_width_digit_map.get(c, c) for c in s])
|
||||
|
||||
@classmethod
|
||||
def rewrite_roman_numeral(cls, s):
|
||||
roman_numeral = None
|
||||
if s.isdigit():
|
||||
roman_numeral = numeric_expressions.roman_numeral(s)
|
||||
|
||||
if roman_numeral:
|
||||
return roman_numeral
|
||||
else:
|
||||
return s
|
||||
|
||||
@classmethod
|
||||
def rewrite_spellout(cls, s, lang, num_type, props):
|
||||
if s.isdigit():
|
||||
num = int(s)
|
||||
spellout = None
|
||||
gender = props.get('gender')
|
||||
category = props.get('category')
|
||||
|
||||
if num_type == cls.CARDINAL:
|
||||
spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
|
||||
elif num_type == cls.ORDINAL:
|
||||
spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)
|
||||
|
||||
if spellout:
|
||||
return spellout.title()
|
||||
return s
|
||||
else:
|
||||
return s
|
||||
|
||||
@classmethod
|
||||
def rewrite(cls, d, lang, props, num_type=CARDINAL):
|
||||
if not props:
|
||||
return d
|
||||
|
||||
d = safe_decode(d)
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
|
||||
key = '{}_probability'.format(digit_type)
|
||||
if key in props:
|
||||
values.append(digit_type)
|
||||
probs.append(props[key])
|
||||
|
||||
if not isclose(sum(probs), 1.0):
|
||||
values.append(cls.ASCII)
|
||||
probs.append(1.0 - sum(probs))
|
||||
|
||||
probs = cdf(probs)
|
||||
digit_type = weighted_choice(values, probs)
|
||||
|
||||
if digit_type == cls.ASCII:
|
||||
return d
|
||||
elif digit_type == cls.SPELLOUT:
|
||||
return cls.rewrite_spellout(d, lang, num_type, props)
|
||||
elif digit_type == cls.ROMAN_NUMERAL:
|
||||
roman_numeral = cls.rewrite_roman_numeral(d)
|
||||
if random.random() < props.get('ordinal_suffix_probability', 0.0):
|
||||
ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
|
||||
if ordinal_suffix:
|
||||
roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
|
||||
return roman_numeral
|
||||
elif digit_type == cls.UNICODE_FULL_WIDTH:
|
||||
return cls.rewrite_full_width(d)
|
||||
else:
|
||||
return d
|
||||
|
||||
|
||||
class NumericPhrase(object):
|
||||
key = None
|
||||
|
||||
NUMERIC = 'numeric'
|
||||
NUMERIC_AFFIX = 'numeric_affix'
|
||||
|
||||
@classmethod
|
||||
def pick_phrase_and_type(cls, number, language, country=None):
|
||||
values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
|
||||
if not values:
|
||||
return None, safe_decode(number) if number is not None else None, None
|
||||
|
||||
phrase, phrase_props = weighted_choice(values, probs)
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = phrase_props.get(key, None)
|
||||
if prob is not None:
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
|
||||
if not probs:
|
||||
num_type = cls.NUMERIC
|
||||
else:
|
||||
probs = cdf(probs)
|
||||
num_type = weighted_choice(values, probs)
|
||||
|
||||
return num_type, phrase, phrase_props[num_type]
|
||||
|
||||
@classmethod
|
||||
def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
|
||||
|
||||
if num_type == cls.NUMERIC_AFFIX:
|
||||
phrase = props['affix']
|
||||
if 'zero_pad' in props and number is not None and number.isdigit():
|
||||
number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
|
||||
|
||||
direction = props['direction']
|
||||
whitespace = props.get('whitespace', whitespace_default)
|
||||
whitespace_probability = props.get('whitespace_probability')
|
||||
if whitespace_probability is not None:
|
||||
whitespace = random.random() < whitespace_probability
|
||||
|
||||
if props.get('title_case', True):
|
||||
# Title case unless the config specifies otherwise
|
||||
phrase = phrase.title()
|
||||
|
||||
if number is None:
|
||||
return phrase
|
||||
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
# Phrase goes to the left of the number
|
||||
if direction == 'left':
|
||||
return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
|
||||
# Phrase goes to the right of the number
|
||||
elif direction == 'right':
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
|
||||
# Need to specify a direction, otherwise return naked number
|
||||
else:
|
||||
return safe_decode(number)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, number, language, country=None):
|
||||
num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
|
||||
whitespace_default = num_type == cls.NUMERIC
|
||||
return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
|
||||
|
||||
|
||||
class Number(NumericPhrase):
|
||||
key = 'numbers'
|
||||
dictionaries = ['number']
|
||||
|
||||
|
||||
class NumberedComponent(object):
|
||||
NUMERIC = 'numeric'
|
||||
ALPHA = 'alpha'
|
||||
ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
|
||||
NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
|
||||
HYPHENATED_NUMBER = 'hyphenated_number'
|
||||
ROMAN_NUMERAL = 'roman_numeral'
|
||||
|
||||
@classmethod
|
||||
def choose_alphanumeric_type(cls, key, language, country=None):
|
||||
alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
|
||||
if alphanumeric_props is None:
|
||||
return None, None
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER, cls.ROMAN_NUMERAL):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = alphanumeric_props.get(key)
|
||||
if prob is not None:
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
|
||||
if not values:
|
||||
return None, None
|
||||
|
||||
probs = cdf(probs)
|
||||
num_type = weighted_choice(values, probs)
|
||||
num_type_props = alphanumeric_props.get(num_type, {})
|
||||
|
||||
return num_type, num_type_props
|
||||
|
||||
@classmethod
|
||||
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
|
||||
has_alpha = False
|
||||
has_numeric = True
|
||||
is_integer = False
|
||||
is_none = False
|
||||
if num is not None:
|
||||
try:
|
||||
num_int = int(num)
|
||||
is_integer = True
|
||||
except ValueError:
|
||||
try:
|
||||
num_float = float(num)
|
||||
except ValueError:
|
||||
tokens = tokenize(safe_decode(num))
|
||||
has_numeric = False
|
||||
for t, c in tokens:
|
||||
if c == token_types.NUMERIC:
|
||||
has_numeric = True
|
||||
if any((ch.isalpha() for ch in t)):
|
||||
has_alpha = True
|
||||
|
||||
if strict_numeric and has_alpha:
|
||||
return safe_decode(num)
|
||||
|
||||
else:
|
||||
is_none = True
|
||||
|
||||
values, probs = None, None
|
||||
|
||||
if is_alpha:
|
||||
values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)
|
||||
|
||||
# Pick a phrase given the probability distribution from the config
|
||||
if values is None:
|
||||
values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
|
||||
|
||||
if not values:
|
||||
return safe_decode(num) if not is_none else None
|
||||
|
||||
phrase, phrase_props = weighted_choice(values, probs)
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
# Dictionaries are lowercased, so title case here
|
||||
if phrase_props.get('title_case', True):
|
||||
phrase = phrase.title()
|
||||
|
||||
'''
|
||||
There are a few ways we can express the number itself
|
||||
|
||||
1. Alias it as some standalone word like basement (for floor "-1")
|
||||
2. Use the number itself, so "Floor 2"
|
||||
3. Append/prepend an affix e.g. 2/F for second floor
|
||||
4. As an ordinal expression e.g. "2nd Floor"
|
||||
'''
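# A rough illustration of those four cases for floors (the exact strings depend on
# the language config, these are hypothetical outputs):
#   1. standalone:     u'Basement'
#   2. numeric:        u'Floor 2'
#   3. numeric_affix:  u'2/F'
#   4. ordinal:        u'2nd Floor'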
|
||||
have_standalone = False
|
||||
have_null = False
|
||||
for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = phrase_props.get(key)
|
||||
if prob is not None:
|
||||
if num_type == 'standalone':
|
||||
have_standalone = True
|
||||
elif num_type == 'null':
|
||||
have_null = True
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
elif num_type in phrase_props:
|
||||
values.append(num_type)
|
||||
probs.append(1.0)
|
||||
break
|
||||
|
||||
if not probs or is_none:
|
||||
return phrase
|
||||
|
||||
# If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
|
||||
if has_alpha:
|
||||
values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')])
|
||||
total = float(sum(probs))
|
||||
if isclose(total, 0.0):
|
||||
return None
|
||||
|
||||
probs = [p / total for p in probs]
|
||||
|
||||
probs = cdf(probs)
|
||||
|
||||
if len(values) < 2:
|
||||
if have_standalone:
|
||||
num_type = 'standalone'
|
||||
elif have_null:
|
||||
num_type = 'null'
|
||||
else:
|
||||
num_type = 'numeric'
|
||||
else:
|
||||
num_type = weighted_choice(values, probs)
|
||||
|
||||
if num_type == 'standalone':
|
||||
return phrase
|
||||
elif num_type == 'null':
|
||||
return safe_decode(num)
|
||||
|
||||
props = phrase_props[num_type]
|
||||
|
||||
if is_integer:
|
||||
num_int = int(num)
|
||||
if phrase_props.get('number_abs_value', False):
|
||||
num_int = abs(num_int)
|
||||
num = num_int
|
||||
|
||||
if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
|
||||
return None
|
||||
|
||||
if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
|
||||
return None
|
||||
|
||||
if phrase_props.get('number_subtract_abs_value'):
|
||||
num_int -= phrase_props['number_subtract_abs_value']
|
||||
num = num_int
|
||||
|
||||
num = safe_decode(num)
|
||||
digits_props = props.get('digits')
|
||||
if digits_props:
|
||||
# Inherit the gender and category e.g. for ordinals
|
||||
for k in ('gender', 'category'):
|
||||
if k in props:
|
||||
digits_props[k] = props[k]
|
||||
num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)
|
||||
|
||||
# Do we add the numeric phrase e.g. Floor No 1
|
||||
add_number_phrase = props.get('add_number_phrase', False)
|
||||
if add_number_phrase and random.random() < props['add_number_phrase_probability']:
|
||||
num = Number.phrase(num, language, country=country)
|
||||
|
||||
whitespace_default = True
|
||||
|
||||
if num_type == 'numeric_affix':
|
||||
phrase = props['affix']
|
||||
if props.get('upper_case', True):
|
||||
phrase = phrase.upper()
|
||||
if 'zero_pad' in props and num.isdigit():
|
||||
num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
|
||||
whitespace_default = False
|
||||
elif num_type == 'ordinal' and safe_decode(num).isdigit():
|
||||
ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))
|
||||
|
||||
if ordinal_expression is not None:
|
||||
num = ordinal_expression
|
||||
|
||||
if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
|
||||
if random.random() < props['null_phrase_probability']:
|
||||
return num
|
||||
|
||||
direction = props['direction']
|
||||
whitespace = props.get('whitespace', whitespace_default)
|
||||
|
||||
whitespace_probability = props.get('whitespace_probability')
|
||||
if whitespace_probability is not None:
|
||||
whitespace = random.random() < whitespace_probability
|
||||
|
||||
# Occasionally switch up if direction_probability is specified
|
||||
if random.random() > props.get('direction_probability', 1.0):
|
||||
if direction == 'left':
|
||||
direction = 'right'
|
||||
elif direction == 'right':
|
||||
direction = 'left'
|
||||
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
# Phrase goes to the left of the number
|
||||
if direction == 'left':
|
||||
return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
|
||||
# Phrase goes to the right of the number
|
||||
elif direction == 'right':
|
||||
return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
|
||||
# Need to specify a direction, otherwise return naked number
|
||||
else:
|
||||
return safe_decode(num)
|
||||
76
scripts/geodata/addresses/po_boxes.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
|
||||
|
||||
class POBox(NumberedComponent):
|
||||
@classmethod
|
||||
def random_digits(cls, num_digits):
|
||||
# Note: PO Boxes can have leading zeros, but that's not important for the parser
|
||||
# since it only cares about how many digits there are in a number
|
||||
low = 10 ** (num_digits - 1)
|
||||
high = (10 ** num_digits) - 1
|
||||
|
||||
return random.randint(low, high)
|
||||
|
||||
@classmethod
|
||||
def random_digits_with_prefix(cls, num_digits, prefix=six.u('')):
|
||||
return six.u('').join([prefix, safe_decode(cls.random_digits(num_digits))])
|
||||
|
||||
@classmethod
|
||||
def random_digits_with_suffix(cls, num_digits, suffix=six.u('')):
|
||||
return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix])
|
||||
|
||||
@classmethod
|
||||
def random_letter(cls, language, country=None):
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
return sample_alphabet(alphabet)
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
if num_type != cls.ALPHA:
|
||||
digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for val in digit_config:
|
||||
values.append(val['length'])
|
||||
probs.append(val['probability'])
|
||||
|
||||
probs = cdf(probs)
|
||||
|
||||
num_digits = weighted_choice(values, probs)
|
||||
|
||||
digits = cls.random_digits(num_digits)
|
||||
number = Digits.rewrite(digits, language, num_type_props)
|
||||
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
return safe_decode(number)
|
||||
else:
|
||||
letter = cls.random_letter(language, country=country)
|
||||
|
||||
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
|
||||
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
|
||||
else:
|
||||
return cls.random_letter(language, country=country)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, box_number, language, country=None):
|
||||
if box_number is None:
|
||||
return None
|
||||
return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language,
|
||||
dictionaries=['post_office'], country=country)
|
||||
11
scripts/geodata/addresses/postcodes.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class PostCode(NumberedComponent):
|
||||
@classmethod
|
||||
def phrase(cls, postcode, language, country=None):
|
||||
if postcode is None:
|
||||
return None
|
||||
return cls.numeric_phrase('postcodes.alphanumeric', postcode, language,
|
||||
dictionaries=['postcodes'], country=country)
|
||||
66
scripts/geodata/addresses/staircases.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.directions import RelativeDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Staircase(NumberedComponent):
|
||||
max_staircases = 10
|
||||
|
||||
staircase_range = range(1, max_staircases + 1)
|
||||
staircase_range_probs = zipfian_distribution(len(staircase_range), 2.0)
|
||||
staircase_range_cdf = cdf(staircase_range_probs)
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
|
||||
return safe_decode(number)
|
||||
elif num_type == cls.HYPHENATED_NUMBER:
|
||||
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
|
||||
number2 = number + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
|
||||
return u'{}-{}'.format(number, number2)
|
||||
else:
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
|
||||
if alphabet_probability is not None and random.random() >= alphabet_probability:
|
||||
alphabet = latin_alphabet
|
||||
letter = sample_alphabet(alphabet, 2.0)
|
||||
if num_type == cls.ALPHA:
|
||||
return safe_decode(letter)
|
||||
else:
|
||||
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
|
||||
|
||||
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
|
||||
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
|
||||
whitespace_phrase = u''
|
||||
r = random.random()
|
||||
if r < whitespace_probability:
|
||||
whitespace_phrase = u' '
|
||||
elif r < (whitespace_probability + hyphen_probability):
|
||||
whitespace_phrase = u'-'
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, staircase, language, country=None):
|
||||
if staircase is None:
|
||||
return None
|
||||
return cls.numeric_phrase('staircases.alphanumeric', staircase, language,
|
||||
dictionaries=['staircases'], country=country)
|
||||
285
scripts/geodata/addresses/units.py
Normal file
285
scripts/geodata/addresses/units.py
Normal file
@@ -0,0 +1,285 @@
|
||||
import itertools
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.directions import RelativeDirection, LateralDirection, AnteroposteriorDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.text.utils import is_numeric_strict
|
||||
|
||||
|
||||
class Unit(NumberedComponent):
|
||||
# When we don't know the number of units, use a Zipfian distribution
|
||||
# to choose randomly between 1 and max_units with 1 being much more
|
||||
# likely than 2, etc.
|
||||
max_units = 99
|
||||
max_basements = 2
|
||||
|
||||
hundreds_numbered_units_tens = [range(101, 110) + [100],
|
||||
range(201, 210) + [200],
|
||||
range(301, 310) + [300],
|
||||
range(401, 410) + [400],
|
||||
range(501, 510) + [500],
|
||||
]
|
||||
|
||||
hundreds_numbered_units = [range(110, 200),
|
||||
range(210, 300),
|
||||
range(310, 400),
|
||||
range(410, 500),
|
||||
range(510, 600),
|
||||
]
|
||||
|
||||
thousands_numbered_units = [range(1001, 1030) + [1000],
|
||||
range(2001, 2030) + [2000],
|
||||
range(3001, 3030) + [3000],
|
||||
range(4001, 4030) + [4000],
|
||||
range(5001, 5030) + [5000]
|
||||
]
|
||||
|
||||
numbered_units = range(1, 10)
|
||||
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units_tens)))
|
||||
numbered_units.extend(range(10, 100))
|
||||
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units)))
|
||||
numbered_units.extend(itertools.chain(*itertools.izip(*thousands_numbered_units)))
|
||||
numbered_units.extend(range(10001, 10100) + [10000])
|
||||
numbered_units.append(0)
|
||||
numbered_units.extend(range(0, -max_basements - 1, -1))
|
||||
|
||||
unit_probs = zipfian_distribution(len(numbered_units), 0.7)
|
||||
unit_probs_cdf = cdf(unit_probs)
|
||||
|
||||
num_digits = [2, 3, 4]
|
||||
num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
|
||||
num_digits_cdf = cdf(num_digits_probs)
|
||||
|
||||
# For use with floors e.g. #301 more common than #389
|
||||
positive_units_floors = range(1, 10) + [0] + range(10, max_units + 1)
|
||||
positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
|
||||
positive_units_floors_cdf = cdf(positive_units_floors_probs)
|
||||
|
||||
# For basic positive units
|
||||
positive_units = range(1, max_units + 1)
|
||||
positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
|
||||
positive_units_cdf = cdf(positive_units_probs)
|
||||
|
||||
# For use with letters e.g. A0 less common
|
||||
positive_units_letters = range(1, max_units + 1) + [0]
|
||||
positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
|
||||
positive_units_letters_cdf = cdf(positive_units_letters_probs)
|
||||
|
||||
RESIDENTIAL = 'residential'
|
||||
COMMERCIAL = 'commercial'
|
||||
INDUSTRIAL = 'industrial'
|
||||
UNIVERSITY = 'university'
|
||||
|
||||
@classmethod
|
||||
def sample_num_digits(cls):
|
||||
return weighted_choice(cls.num_digits, cls.num_digits_cdf)
|
||||
|
||||
@classmethod
|
||||
def for_floor(cls, floor_number, num_digits=None):
|
||||
num_digits = num_digits if num_digits is not None else cls.sample_num_digits()
|
||||
unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
|
||||
return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
|
||||
num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
|
||||
if num_type is None:
|
||||
return None
|
||||
|
||||
use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)
|
||||
|
||||
use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)
|
||||
|
||||
if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
|
||||
if random.random() >= use_positive_numbers_prob:
|
||||
number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
|
||||
else:
|
||||
number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
|
||||
else:
|
||||
if floor is None or not floor.isdigit():
|
||||
floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
|
||||
|
||||
floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
|
||||
ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)
|
||||
|
||||
if ground_floor_starts_at is not None:
|
||||
try:
|
||||
floor = int(floor)
|
||||
if floor >= floor_numbering_starts_at:
|
||||
floor -= floor_numbering_starts_at
|
||||
floor += ground_floor_starts_at
|
||||
floor = safe_decode(floor)
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
|
||||
if use_floor_affix_prob and random.random() < use_floor_affix_prob:
|
||||
floor_phrase = Floor.phrase(floor, language, country=country)
|
||||
# Only works if the floor phrase is strictly numeric e.g. "1" (not something like "H1")
|
||||
if is_numeric_strict(floor_phrase):
|
||||
unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)
|
||||
|
||||
unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
|
||||
if unit_num_digits is not None:
|
||||
unit = safe_decode(unit).zfill(unit_num_digits)
|
||||
|
||||
return six.u('{}{}').format(floor_phrase, unit)
|
||||
|
||||
floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
|
||||
if floor_num_digits is not None and floor.isdigit():
|
||||
floor = floor.zfill(floor_num_digits)
|
||||
|
||||
number = cls.for_floor(floor)
|
||||
|
||||
if num_type == cls.NUMERIC:
|
||||
return safe_decode(number)
|
||||
elif num_type == cls.HYPHENATED_NUMBER:
|
||||
number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
|
||||
range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
|
||||
direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
|
||||
direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))
|
||||
|
||||
if random.random() < direction_prob:
|
||||
direction = 'left' if direction == 'right' else 'right'
|
||||
|
||||
direction_right = direction == 'right'
|
||||
|
||||
if random.random() < range_prob:
|
||||
if direction_right:
|
||||
number2 += number
|
||||
else:
|
||||
number2 = max(0, number - number2)
|
||||
if direction == 'right':
|
||||
return u'{}-{}'.format(number, number2)
|
||||
else:
|
||||
return u'{}-{}'.format(number2, number)
|
||||
else:
|
||||
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
|
||||
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
|
||||
if alphabet_probability is not None and random.random() >= alphabet_probability:
|
||||
alphabet = latin_alphabet
|
||||
letter = sample_alphabet(alphabet)
|
||||
if num_type == cls.ALPHA:
|
||||
return safe_decode(letter)
|
||||
else:
|
||||
if num_floors is None:
|
||||
number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)
|
||||
|
||||
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
|
||||
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
|
||||
whitespace_phrase = u''
|
||||
r = random.random()
|
||||
if r < whitespace_probability:
|
||||
whitespace_phrase = u' '
|
||||
elif r < (whitespace_probability + hyphen_probability):
|
||||
whitespace_phrase = u'-'
|
||||
|
||||
if num_type == cls.ALPHA_PLUS_NUMERIC:
|
||||
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
|
||||
elif num_type == cls.NUMERIC_PLUS_ALPHA:
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
|
||||
|
||||
@classmethod
|
||||
def add_direction(cls, key, unit, language, country=None):
|
||||
add_direction_probability = address_config.get_property('{}.add_direction_probability'.format(key),
|
||||
language, country=country, default=0.0)
|
||||
if not random.random() < add_direction_probability:
|
||||
return unit
|
||||
add_direction_numeric = address_config.get_property('{}.add_direction_numeric'.format(key),
|
||||
language, country=country)
|
||||
try:
|
||||
unit = int(unit)
|
||||
integer_unit = True
|
||||
except (ValueError, TypeError):
|
||||
integer_unit = False
|
||||
|
||||
if add_direction_numeric and integer_unit:
|
||||
return RelativeDirection.phrase(unit, language, country=country)
|
||||
elif not integer_unit:
|
||||
add_direction_standalone = address_config.get_property('{}.add_direction_standalone'.format(key),
|
||||
language, country=country)
|
||||
if add_direction_standalone:
|
||||
return RelativeDirection.phrase(None, language, country=country)
|
||||
|
||||
@classmethod
|
||||
def add_quadrant(cls, key, unit, language, country=None):
|
||||
add_quadrant_probability = address_config.get_property('{}.add_quadrant_probability'.format(key),
|
||||
language, country=country, default=0.0)
|
||||
if not random.random() < add_quadrant_probability:
|
||||
return unit
|
||||
add_quadrant_numeric = address_config.get_property('{}.add_quadrant_numeric'.format(key),
|
||||
language, country=country)
|
||||
try:
|
||||
unit = int(unit)
|
||||
integer_unit = True
|
||||
except (ValueError, TypeError):
|
||||
integer_unit = False
|
||||
|
||||
first_direction = address_config.get_property('{}.add_quadrant_first_direction'.format(key),
|
||||
language, country=country)
|
||||
|
||||
if first_direction == 'lateral':
|
||||
ordering = (LateralDirection, AnteroposteriorDirection)
|
||||
elif first_direction == 'anteroposterior':
|
||||
ordering = (AnteroposteriorDirection, LateralDirection)
|
||||
else:
|
||||
return unit
|
||||
|
||||
if not integer_unit:
|
||||
add_quadrant_standalone = address_config.get_property('{}.add_quadrant_standalone'.format(key),
|
||||
language, country=country)
|
||||
if add_quadrant_standalone:
|
||||
unit = None
|
||||
else:
|
||||
return None
|
||||
|
||||
last_num_type = None
|
||||
for i, c in enumerate(ordering):
|
||||
num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
|
||||
whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
|
||||
unit = c.combine_with_number(unit, phrase, num_type, props, whitespace_default=whitespace_default)
|
||||
last_num_type = num_type
|
||||
|
||||
return unit
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, unit, language, country=None, zone=None):
|
||||
if unit is not None:
|
||||
key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
|
||||
|
||||
if not address_config.get_property(key, language, country=country):
|
||||
return None
|
||||
|
||||
is_alpha = safe_decode(unit).isalpha()
|
||||
|
||||
direction_unit = None
|
||||
add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
|
||||
if add_direction:
|
||||
direction_unit = cls.add_direction(key, unit, language, country=country)
|
||||
|
||||
if direction_unit and direction_unit != unit:
|
||||
unit = direction_unit
|
||||
is_alpha = False
|
||||
else:
|
||||
add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
|
||||
if add_quadrant:
|
||||
unit = cls.add_quadrant(key, unit, language, country=country)
|
||||
is_alpha = False
|
||||
|
||||
return cls.numeric_phrase(key, safe_decode(unit), language,
|
||||
dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
|
||||
else:
|
||||
key = 'units.standalone'
|
||||
values, probs = address_config.alternative_probabilities(key, language,
|
||||
dictionaries=['unit_types_standalone'],
|
||||
country=country)
|
||||
if values is None:
|
||||
return None
|
||||
phrase, phrase_props = weighted_choice(values, probs)
|
||||
return phrase.title()
|
||||
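
Both Unit above and Staircase earlier draw their "random" numbers from a Zipf-like distribution, so that unit 1 comes up far more often than unit 89. The helpers doing the work (zipfian_distribution, cdf, weighted_choice) live in geodata.math.sampling, outside this listing; the following is a minimal illustrative sketch of how that sampling plausibly fits together, with stand-in implementations whose behaviour is assumed from their names.

import bisect
import random

def zipfian_distribution(n, alpha):
    # Weight rank i by 1 / i**alpha, then normalize to probabilities (assumed behaviour)
    weights = [1.0 / (i ** alpha) for i in range(1, n + 1)]
    total = sum(weights)
    return [w / total for w in weights]

def cdf(probs):
    # Running totals, so a uniform draw can be bisected into the distribution
    out, running = [], 0.0
    for p in probs:
        running += p
        out.append(running)
    return out

def weighted_choice(values, probs_cdf):
    # Uniform draw in [0, 1); pick the first bucket whose cumulative probability exceeds it
    idx = bisect.bisect_left(probs_cdf, random.random())
    return values[min(idx, len(values) - 1)]

positive_units = range(1, 100)                                  # mirrors Unit.positive_units
positive_units_cdf = cdf(zipfian_distribution(len(positive_units), 0.6))
print(weighted_choice(positive_units, positive_units_cdf))      # small unit numbers dominate
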
0
scripts/geodata/boundaries/__init__.py
Normal file
0
scripts/geodata/boundaries/__init__.py
Normal file
167
scripts/geodata/boundaries/names.py
Normal file
167
scripts/geodata/boundaries/names.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.floats import isclose
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
|
||||
from geodata.encoding import safe_encode
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'boundaries', 'names')
|
||||
|
||||
BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
|
||||
|
||||
|
||||
class BoundaryNames(object):
|
||||
DEFAULT_NAME_KEY = 'name'
|
||||
|
||||
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
|
||||
config = yaml.load(open(config_file))
|
||||
|
||||
default_names = nested_get(config, ('names', 'keys'))
|
||||
name_keys, probs = alternative_probabilities(default_names)
|
||||
|
||||
self.name_keys = name_keys
|
||||
self.name_key_probs = cdf(probs)
|
||||
|
||||
self.component_name_keys = {}
|
||||
|
||||
for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
|
||||
component_names = component_config.get('keys')
|
||||
component_name_keys, component_probs = alternative_probabilities(component_names)
|
||||
self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
|
||||
|
||||
self.country_regex_replacements = defaultdict(list)
|
||||
for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
|
||||
country = props.get('country')
|
||||
re_flags = re.I | re.UNICODE
|
||||
if not props.get('case_insensitive', True):
|
||||
re_flags ^= re.I
|
||||
|
||||
pattern = re.compile(props['pattern'], re_flags)
|
||||
replace_group = props['replace_with_group']
|
||||
replace_probability = props['replace_probability']
|
||||
self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
|
||||
|
||||
self.country_regex_replacements = dict(self.country_regex_replacements)
|
||||
|
||||
self.prefixes = {}
|
||||
self.prefix_regexes = {}
|
||||
self.suffixes = {}
|
||||
self.suffix_regexes = {}
|
||||
|
||||
for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={})):
|
||||
for component, affixes in six.iteritems(components):
|
||||
affix_values, probs = alternative_probabilities(affixes)
|
||||
|
||||
for val in affix_values:
|
||||
if 'prefix' not in val:
|
||||
raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))
|
||||
|
||||
prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
|
||||
self.prefix_regexes[(language, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
|
||||
|
||||
if not isclose(sum(probs), 1.0):
|
||||
affix_values.append(None)
|
||||
probs.append(1.0 - sum(probs))
|
||||
affix_probs_cdf = cdf(probs)
|
||||
self.prefixes[(language, component)] = affix_values, affix_probs_cdf
|
||||
|
||||
for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={})):
|
||||
for component, affixes in six.iteritems(components):
|
||||
affix_values, probs = alternative_probabilities(affixes)
|
||||
|
||||
for val in affix_values:
|
||||
if 'suffix' not in val:
|
||||
raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))
|
||||
|
||||
suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
|
||||
self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
|
||||
|
||||
if not isclose(sum(probs), 1.0):
|
||||
affix_values.append(None)
|
||||
probs.append(1.0 - sum(probs))
|
||||
affix_probs_cdf = cdf(probs)
|
||||
self.suffixes[(language, component)] = affix_values, affix_probs_cdf
|
||||
|
||||
self.exceptions = {}
|
||||
|
||||
for props in nested_get(config, ('names', 'exceptions'), default=[]):
|
||||
object_type = props['type']
|
||||
object_id = safe_encode(props['id'])
|
||||
keys = [props['default']]
|
||||
probs = [props['probability']]
|
||||
for alt in props.get('alternatives', []):
|
||||
keys.append(alt['alternative'])
|
||||
probs.append(alt['probability'])
|
||||
|
||||
probs = cdf(probs)
|
||||
self.exceptions[(object_type, object_id)] = (keys, probs)
|
||||
|
||||
def _string_as_regex(self, s):
|
||||
return safe_decode(s).replace(six.u('.'), six.u('\\.'))
|
||||
|
||||
def valid_name(self, object_type, object_id, name):
|
||||
exceptions, probs = self.exceptions.get((object_type, object_id), ((), ()))
|
||||
return not exceptions or name in exceptions
|
||||
|
||||
def name_key_dist(self, props, component):
|
||||
object_type = props.get('type')
|
||||
object_id = safe_encode(props.get('id', ''))
|
||||
|
||||
if (object_type, object_id) in self.exceptions:
|
||||
values, probs = self.exceptions[(object_type, object_id)]
|
||||
return values, probs
|
||||
|
||||
name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
|
||||
return name_keys, probs
|
||||
|
||||
def name_key(self, props, component):
|
||||
name_keys, probs = self.name_key_dist(props, component)
|
||||
return weighted_choice(name_keys, probs)
|
||||
|
||||
def name(self, country, language, component, name):
|
||||
all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
|
||||
|
||||
prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
|
||||
suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
|
||||
|
||||
if not all_replacements and not prefixes and not suffixes:
|
||||
return name
|
||||
|
||||
for regex, group, prob in all_replacements:
|
||||
match = regex.match(name)
|
||||
if match and random.random() < prob:
|
||||
name = match.group(group)
|
||||
|
||||
for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0),
|
||||
(suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
|
||||
if affixes is not None:
|
||||
regex = regexes[language, component]
|
||||
if regex.match(name):
|
||||
continue
|
||||
|
||||
affix = weighted_choice(affixes, affix_probs)
|
||||
|
||||
if affix is not None:
|
||||
whitespace = affix.get('whitespace', True)
|
||||
space_val = six.u(' ') if whitespace else six.u('')
|
||||
affix = affix[key]
|
||||
if direction == 0:
|
||||
return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
|
||||
else:
|
||||
return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
|
||||
|
||||
return name
|
||||
|
||||
|
||||
boundary_names = BoundaryNames()
|
||||
0
scripts/geodata/categories/__init__.py
Normal file
0
scripts/geodata/categories/__init__.py
Normal file
72
scripts/geodata/categories/config.py
Normal file
72
scripts/geodata/categories/config.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import csv
|
||||
import os
|
||||
import six
|
||||
import random
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'categories')
|
||||
|
||||
|
||||
class CategoryConfig(object):
|
||||
def __init__(self, base_dir=CATEGORIES_DIR):
|
||||
self.language_categories_singular = {}
|
||||
self.language_categories_plural = {}
|
||||
|
||||
self.language_property_names = defaultdict(set)
|
||||
|
||||
if not os.path.exists(base_dir):
|
||||
raise RuntimeError('{} does not exist'.format(base_dir))
|
||||
|
||||
for filename in os.listdir(base_dir):
|
||||
if not filename.endswith('.tsv'):
|
||||
continue
|
||||
|
||||
lang = filename.rsplit('.tsv')[0]
|
||||
base_lang = lang.split('_')[0]
|
||||
|
||||
singular_rules = self.language_categories_singular.get(base_lang, defaultdict(list))
|
||||
plural_rules = self.language_categories_plural.get(base_lang, defaultdict(list))
|
||||
|
||||
reader = csv.reader(open(os.path.join(CATEGORIES_DIR, filename)), delimiter='\t')
|
||||
reader.next() # headers
|
||||
|
||||
for key, value, is_plural, phrase in reader:
|
||||
self.language_property_names[lang].add(key)
|
||||
is_plural = bool(int(is_plural))
|
||||
if is_plural:
|
||||
plural_rules[(key, value)].append(phrase)
|
||||
else:
|
||||
singular_rules[(key, value)].append(phrase)
|
||||
|
||||
self.language_categories_singular[base_lang] = singular_rules
|
||||
self.language_categories_plural[base_lang] = plural_rules
|
||||
|
||||
self.language_categories_singular = {key: dict(value) for key, value
|
||||
in six.iteritems(self.language_categories_singular)}
|
||||
|
||||
self.language_categories_plural = {key: dict(value) for key, value
|
||||
in six.iteritems(self.language_categories_plural)}
|
||||
|
||||
def has_keys(self, language, keys):
|
||||
prop_names = self.language_property_names.get(language, set())
|
||||
return [k for k in keys if k in prop_names]
|
||||
|
||||
def get_phrase(self, language, key, value, is_plural=False):
|
||||
config = self.language_categories_singular if not is_plural else self.language_categories_plural
|
||||
if language not in config:
|
||||
return None
|
||||
language_config = config[language]
|
||||
choices = language_config.get((key, value))
|
||||
if not choices:
|
||||
return None
|
||||
return random.choice(choices)
|
||||
|
||||
category_config = CategoryConfig()
|
||||
31
scripts/geodata/categories/preposition.py
Normal file
31
scripts/geodata/categories/preposition.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.math.sampling import weighted_choice, cdf
|
||||
|
||||
|
||||
class CategoryPreposition(object):
|
||||
NEAR = 'near'
|
||||
NEARBY = 'nearby'
|
||||
NEAR_ME = 'near_me'
|
||||
IN = 'in'
|
||||
NULL = 'null'
|
||||
|
||||
@classmethod
|
||||
def random(cls, language, country=None):
|
||||
category_props = address_config.get_property('categories', language, country=country)
|
||||
if category_props is None:
|
||||
return None
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
|
||||
k = '{}_probability'.format(prep_phrase_type)
|
||||
prob = category_props.get(k, None)
|
||||
if prob is not None:
|
||||
values.append(prep_phrase_type)
|
||||
probs.append(prob)
|
||||
|
||||
probs = cdf(probs)
|
||||
|
||||
return weighted_choice(values, probs)
|
||||
38
scripts/geodata/categories/query.py
Normal file
38
scripts/geodata/categories/query.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from collections import namedtuple
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.categories.preposition import CategoryPreposition
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
CategoryQuery = namedtuple('CategoryQuery', 'category, prep, add_place_name, add_address')
|
||||
|
||||
NULL_CATEGORY_QUERY = CategoryQuery(None, None, False, False)
|
||||
|
||||
|
||||
class Category(object):
|
||||
@classmethod
|
||||
def phrase(cls, language, key, value, is_plural=False, country=None):
|
||||
category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
|
||||
if not category_phrase:
|
||||
return NULL_CATEGORY_QUERY
|
||||
|
||||
category_phrase = safe_decode(category_phrase)
|
||||
|
||||
prep_phrase_type = CategoryPreposition.random(language, country=country)
|
||||
|
||||
if prep_phrase_type in (None, CategoryPreposition.NULL):
|
||||
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
|
||||
|
||||
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
|
||||
if not values:
|
||||
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
|
||||
|
||||
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
|
||||
prep_phrase = safe_decode(prep_phrase)
|
||||
|
||||
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
|
||||
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
|
||||
|
||||
return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
|
||||
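
Category.phrase above only selects the pieces of a query: a category phrase, an optional preposition, and flags saying whether a place name or an address may follow. The actual assembly into strings like "restaurants in Brooklyn" happens downstream and is not shown here; the assemble() helper below is a hypothetical sketch of that step, using the CategoryQuery namedtuple defined above.

def assemble(query, place_name=None, address=None):
    # query is a CategoryQuery; prep is None when no preposition was sampled
    parts = [query.category]
    if query.prep:
        parts.append(query.prep)
    if query.add_place_name and place_name:
        parts.append(place_name)
    if query.add_address and address:
        parts.append(address)
    return u' '.join(parts)

# A CategoryQuery(u'restaurants', u'in', True, True) with place_name=u'Brooklyn'
# assembles to u'restaurants in Brooklyn'; since the preposition is sampled from the
# categories.* config, another run might yield u'restaurants near Brooklyn' instead.
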
125
scripts/geodata/categories/scrape_nominatim_special_phrases.py
Normal file
125
scripts/geodata/categories/scrape_nominatim_special_phrases.py
Normal file
@@ -0,0 +1,125 @@
|
||||
'''
|
||||
scrape_nominatim_special_phrases.py
|
||||
-----------------------------------
|
||||
|
||||
Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
|
||||
for category-related phrases sometimes found in geocoder input.
|
||||
|
||||
Populates a per-language TSV with (key, value, is_plural, phrase) rows:
|
||||
|
||||
OSM keys/values are like:
|
||||
|
||||
amenity=restaurant
|
||||
tourism=museum
|
||||
shop=books
|
||||
|
||||
Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
|
||||
'''
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import six
|
||||
import sys
|
||||
import time
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
|
||||
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'categories')
|
||||
|
||||
|
||||
# Use Special:Export to get wiki markup
|
||||
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
|
||||
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
|
||||
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
|
||||
|
||||
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
|
||||
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
|
||||
|
||||
IGNORE_LANGUAGES = {
|
||||
# Interlingua
|
||||
'ia'
|
||||
}
|
||||
|
||||
|
||||
IGNORE_PLURAL_LANGUAGES = {
|
||||
# For Japanese, seems to just put an s on the end, which doesn't seem right
|
||||
# Need input from a native speaker on that one
|
||||
'ja',
|
||||
}
|
||||
|
||||
# Wait this many seconds between page fetches
|
||||
POLITENESS_DELAY = 5.0
|
||||
|
||||
|
||||
def scrape_nominatim_category_page(url, ignore_plurals=False):
|
||||
result = requests.get(url)
|
||||
|
||||
if not result or not result.content:
|
||||
return
|
||||
|
||||
for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
|
||||
if operator and operator != '-':
|
||||
continue
|
||||
|
||||
is_plural = plural == 'Y'
|
||||
if is_plural and ignore_plurals:
|
||||
continue
|
||||
|
||||
yield safe_decode(phrase).lower(), key, value, is_plural
|
||||
|
||||
|
||||
def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
|
||||
print('Fetching main page')
|
||||
result = requests.get(url)
|
||||
languages = {}
|
||||
if not result or not result.content:
|
||||
return languages
|
||||
|
||||
time.sleep(POLITENESS_DELAY)
|
||||
|
||||
for entity, anchor_text in wiki_link_re.findall(result.content):
|
||||
if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
|
||||
continue
|
||||
|
||||
lang = entity.rstrip('/').rsplit('/')[-1].lower()
|
||||
if lang in IGNORE_LANGUAGES:
|
||||
continue
|
||||
|
||||
link = WIKI_BASE_URL + entity.replace(' ', '_')
|
||||
|
||||
ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES
|
||||
|
||||
print('Doing {}'.format(lang))
|
||||
phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
|
||||
time.sleep(POLITENESS_DELAY)
|
||||
|
||||
if not phrases:
|
||||
continue
|
||||
|
||||
languages[lang] = phrases
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
|
||||
languages = scrape_all_nominatim_category_pages(url=url)
|
||||
for lang, phrases in six.iteritems(languages):
|
||||
filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
|
||||
with open(filename, 'w') as f:
|
||||
writer = csv.writer(f, delimiter='\t')
|
||||
writer.writerow(('key', 'value', 'is_plural', 'phrase'))
|
||||
|
||||
for phrase, key, value, is_plural in phrases:
|
||||
writer.writerow((safe_encode(key), safe_encode(value),
|
||||
str(int(is_plural)), safe_encode(phrase)))
|
||||
|
||||
print('Done')
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
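
The phrase_table_re pattern above is easier to read against a concrete row. The sketch below runs it over a single made-up line in the wiki-table format the regex implies; the row text is illustrative, not copied from the live wiki page.

import re

phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)

row = '| Restaurants || amenity || restaurant || - || Y'
for phrase, key, value, operator, plural in phrase_table_re.findall(row):
    # prints: restaurants amenity=restaurant plural=True
    print('{} {}={} plural={}'.format(phrase.lower(), key, value, plural == 'Y'))
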
0
scripts/geodata/chains/__init__.py
Normal file
0
scripts/geodata/chains/__init__.py
Normal file
23
scripts/geodata/chains/chains.sh
Executable file
23
scripts/geodata/chains/chains.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
if [ "$#" -ge 1 ]; then
|
||||
DATA_DIR=$1
|
||||
else
|
||||
DATA_DIR=$(pwd)
|
||||
fi
|
||||
|
||||
PWD=$(pwd)
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
|
||||
python $SCRIPT_DIR/chains_tsv.py $DATA_DIR/planet-venues.osm $DATA_DIR/chains.tsv
|
||||
|
||||
cd $DATA_DIR
|
||||
split -d -C524200 chains.tsv chains.split.
|
||||
|
||||
for filename in chains.split.*; do
|
||||
extension="${filename##*.0}"
|
||||
name="${filename%%.*}"
|
||||
echo -e "name_lower\tname\tcanonical\tknown_chain\tcount" | cat - $filename > /tmp/out
|
||||
mv /tmp/out $name.$extension.tsv
|
||||
rm $filename
|
||||
done
|
||||
|
||||
cd $PWD
|
||||
78
scripts/geodata/chains/chains_tsv.py
Normal file
78
scripts/geodata/chains/chains_tsv.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
import six
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
from collections import Counter
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import ADDRESS_EXPANSIONS_DIR
|
||||
from geodata.osm.extract import *
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
|
||||
|
||||
class VenueNames(object):
|
||||
def __init__(self, venues_filename):
|
||||
self.venues_filename = venues_filename
|
||||
self.all_chains = set()
|
||||
self.chain_canonical = {}
|
||||
|
||||
for filename in glob.glob(os.path.join(ADDRESS_EXPANSIONS_DIR, '**', 'chains.txt')):
|
||||
f = open(filename)
|
||||
for line in f:
|
||||
line = line.rstrip()
|
||||
phrases = safe_decode(line).split(six.u('|'))
|
||||
self.all_chains |= set(phrases)
|
||||
canonical = phrases[0]
|
||||
for p in phrases[1:]:
|
||||
self.chain_canonical[p] = canonical
|
||||
|
||||
self.names = Counter()
|
||||
self.names_lower = Counter()
|
||||
self.names_cap = defaultdict(Counter)
|
||||
|
||||
def count(self):
|
||||
i = 0
|
||||
for node_id, value, deps in parse_osm(self.venues_filename):
|
||||
name = value.get('name')
|
||||
if not name:
|
||||
continue
|
||||
self.names[name] += 1
|
||||
self.names_lower[name.lower()] += 1
|
||||
self.names_cap[name.lower()][name] += 1
|
||||
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print('did {}'.format(i))
|
||||
i += 1
|
||||
|
||||
def write_to_tsv(self, out_filename, min_threshold=5):
|
||||
writer = csv.writer(open(out_filename, 'w'), delimiter='\t')
|
||||
for k, v in self.names_lower.most_common():
|
||||
if v < min_threshold:
|
||||
break
|
||||
canonical = self.chain_canonical.get(k)
|
||||
if canonical:
|
||||
canonical = self.names_cap[canonical].most_common(1)[0][0]
|
||||
else:
|
||||
canonical = ''
|
||||
most_common_cap = self.names_cap[k].most_common(1)[0][0]
|
||||
writer.writerow((safe_encode(k),
|
||||
safe_encode(most_common_cap),
|
||||
safe_encode(canonical),
|
||||
safe_encode(1) if k in self.all_chains else '',
|
||||
safe_encode(v)))
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 3:
|
||||
print('Usage: python chains_tsv.py infile outfile')
|
||||
sys.exit(1)
|
||||
input_file = sys.argv[1]
|
||||
output_file = sys.argv[2]
|
||||
|
||||
names = VenueNames(input_file)
|
||||
names.count()
|
||||
names.write_to_tsv(output_file)
|
||||
100
scripts/geodata/chains/query.py
Normal file
100
scripts/geodata/chains/query.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.address_expansions.gazetteers import chains_gazetteer
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.categories.preposition import CategoryPreposition
|
||||
from geodata.math.sampling import weighted_choice, cdf
|
||||
from geodata.text.normalize import normalized_tokens
|
||||
from geodata.text.tokenize import tokenize, token_types
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')
|
||||
|
||||
NULL_CHAIN_QUERY = ChainQuery(None, None, False, False)
|
||||
|
||||
|
||||
class Chain(object):
|
||||
@classmethod
|
||||
def tokenize_name(cls, name):
|
||||
if not name:
|
||||
return []
|
||||
tokens = normalized_tokens(name)
|
||||
return tokens
|
||||
|
||||
@classmethod
|
||||
def possible_chain(cls, name):
|
||||
'''
|
||||
Determines if a venue name contains the name of a known chain store.
|
||||
|
||||
Returns a tuple of:
|
||||
|
||||
(True/False, known chain phrases, other tokens)
|
||||
|
||||
Handles cases like "Hard Rock Cafe Times Square" and allows for downstream
|
||||
decision making (i.e. if the tokens have a low IDF in the local area we might
|
||||
want to consider it a chain).
|
||||
'''
|
||||
tokens = cls.tokenize_name(name)
|
||||
if not tokens:
|
||||
return False, [], []
|
||||
matches = chains_gazetteer.filter(tokens)
|
||||
other_tokens = []
|
||||
phrases = []
|
||||
for t, c, l, d in matches:
|
||||
if c == token_types.PHRASE:
|
||||
phrases.append((t, c, l, d))
|
||||
else:
|
||||
other_tokens.append((t, c))
|
||||
|
||||
return len(phrases) > 0, phrases, other_tokens if len(phrases) > 0 else []
|
||||
|
||||
@classmethod
|
||||
def extract(cls, name):
|
||||
'''
|
||||
Determines if an entire venue name matches a known chain store.
|
||||
|
||||
Note: to avoid false positives, only return True if all of the tokens
|
||||
in the venue's name are part of a single chain store phrase. This will
|
||||
miss a few things like "Hard Rock Cafe Times Square" and the like.
|
||||
|
||||
It will however handle compound chain stores like Subway/Taco Bell
|
||||
'''
|
||||
|
||||
possible, phrases, other_tokens = cls.possible_chain(name)
|
||||
is_chain = possible and not any((c in token_types.WORD_TOKEN_TYPES for t, c in other_tokens))
|
||||
return is_chain, phrases if is_chain else []
|
||||
|
||||
@classmethod
|
||||
def alternate_form(cls, language, dictionary, canonical):
|
||||
choices = address_config.sample_phrases.get((language, dictionary), {}).get(canonical)
|
||||
if not choices:
|
||||
return canonical
|
||||
return random.choice(choices)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, chain, language, country=None):
|
||||
if not chain:
|
||||
return NULL_CHAIN_QUERY
|
||||
|
||||
chain_phrase = safe_decode(chain)
|
||||
|
||||
prep_phrase_type = CategoryPreposition.random(language, country=country)
|
||||
|
||||
if prep_phrase_type in (None, CategoryPreposition.NULL):
|
||||
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
|
||||
|
||||
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
|
||||
if not values:
|
||||
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
|
||||
|
||||
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
|
||||
prep_phrase = safe_decode(prep_phrase)
|
||||
|
||||
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
|
||||
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
|
||||
|
||||
return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
|
||||
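
The difference the two docstrings above describe (possible_chain vs. extract) comes down to one rule: a venue name is treated as a full chain only when nothing is left over except non-word tokens. Below is a small illustrative sketch of that rule with toy token data; the token types and pre-matched phrases are simplified stand-ins for what the tokenizer and gazetteer would produce.

WORD, PHRASE, PUNCT = 'word', 'phrase', 'punct'
WORD_TOKEN_TYPES = {WORD}

def is_full_chain(phrases, other_tokens):
    # At least one chain phrase matched, and none of the leftover tokens are ordinary words
    return bool(phrases) and not any(c in WORD_TOKEN_TYPES for t, c in other_tokens)

# "Hard Rock Cafe Times Square": chain phrase plus extra words -> possible_chain yes, extract no
print(is_full_chain([(u'hard rock cafe', PHRASE)], [(u'times', WORD), (u'square', WORD)]))   # False
# "Subway/Taco Bell": two chain phrases joined only by punctuation -> extract yes
print(is_full_chain([(u'subway', PHRASE), (u'taco bell', PHRASE)], [(u'/', PUNCT)]))         # True
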
0
scripts/geodata/configs/__init__.py
Normal file
0
scripts/geodata/configs/__init__.py
Normal file
61
scripts/geodata/configs/utils.py
Normal file
61
scripts/geodata/configs/utils.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import six
|
||||
from collections import Mapping
|
||||
|
||||
|
||||
def recursive_merge(a, b):
|
||||
for k, v in six.iteritems(b):
|
||||
if isinstance(v, Mapping) and v:
|
||||
existing = a.get(k, v)
|
||||
merged = recursive_merge(existing, v)
|
||||
a[k] = merged
|
||||
else:
|
||||
a[k] = b[k]
|
||||
return a
|
||||
|
||||
|
||||
class DoesNotExist:
|
||||
pass
|
||||
|
||||
|
||||
def nested_get(obj, keys, default=DoesNotExist):
|
||||
if len(keys) == 0:
|
||||
return obj
|
||||
try:
|
||||
for key in keys[:-1]:
|
||||
obj = obj.get(key, {})
|
||||
if not hasattr(obj, 'items'):
|
||||
return default
|
||||
key = keys[-1]
|
||||
return obj.get(key, default)
|
||||
except AttributeError:
|
||||
return default
|
||||
|
||||
|
||||
def alternative_probabilities(properties):
|
||||
if properties is None:
|
||||
return None, None
|
||||
|
||||
probs = []
|
||||
alternatives = []
|
||||
|
||||
if 'probability' in properties:
|
||||
prob = properties['probability']
|
||||
props = properties['default']
|
||||
probs.append(prob)
|
||||
alternatives.append(props)
|
||||
elif 'alternatives' not in properties and 'default' in properties:
|
||||
prob = 1.0
|
||||
props = properties['default']
|
||||
probs.append(prob)
|
||||
alternatives.append(props)
|
||||
elif 'alternatives' not in properties and 'default' not in properties:
|
||||
return None, None
|
||||
|
||||
alts = properties.get('alternatives', [])
|
||||
for alt in alts:
|
||||
prob = alt.get('probability', 1.0 / len(alts))
|
||||
props = alt['alternative']
|
||||
probs.append(prob)
|
||||
alternatives.append(props)
|
||||
|
||||
return alternatives, probs
|
||||
0
scripts/geodata/coordinates/__init__.py
Normal file
0
scripts/geodata/coordinates/__init__.py
Normal file
165
scripts/geodata/coordinates/conversion.py
Normal file
165
scripts/geodata/coordinates/conversion.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''
|
||||
geodata.coordinates.conversion
|
||||
------------------------------
|
||||
|
||||
Geographic coordinates typically come in two flavors: decimal and
|
||||
DMS (degree-minute-second). This module parses a coordinate string
|
||||
in just about any format. This was originally created for parsing
|
||||
lat/lons found on the web.
|
||||
|
||||
Usage:
|
||||
>>> latlon_to_decimal('40°42′46″N', '74°00′21″W') # returns (40.71277777777778, -74.00583333333333)
|
||||
>>> latlon_to_decimal('40,74 N', '74,001 W') # returns (40.74, -74.001)
|
||||
>>> to_valid_longitude(360.0) # returns 0.0
|
||||
>>> is_valid_latitude(90.0) # returns True
|
||||
'''
|
||||
|
||||
import math
|
||||
import re
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.floats import isclose
|
||||
|
||||
beginning_re = re.compile('^[^0-9\-]+', re.UNICODE)
|
||||
end_re = re.compile('[^0-9]+$', re.UNICODE)
|
||||
|
||||
latitude_dms_regex = re.compile(ur'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$', re.I | re.UNICODE)
|
||||
longitude_dms_regex = re.compile(ur'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$', re.I | re.UNICODE)
|
||||
|
||||
latitude_decimal_with_direction_regex = re.compile('^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
|
||||
longitude_decimal_with_direction_regex = re.compile('^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)
|
||||
|
||||
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}
|
||||
|
||||
|
||||
def direction_sign(d):
|
||||
if d is None:
|
||||
return 1
|
||||
d = d.lower().strip()
|
||||
if d in direction_sign_map:
|
||||
return direction_sign_map[d]
|
||||
else:
|
||||
raise ValueError('Invalid direction: {}'.format(d))
|
||||
|
||||
|
||||
def int_or_float(d):
|
||||
try:
|
||||
return int(d)
|
||||
except ValueError:
|
||||
return float(d)
|
||||
|
||||
|
||||
def degrees_to_decimal(degrees, minutes, seconds):
|
||||
degrees = int_or_float(degrees)
|
||||
minutes = int_or_float(minutes)
|
||||
seconds = int_or_float(seconds)
|
||||
|
||||
return degrees + (minutes / 60.0) + (seconds / 3600.0)
|
||||
|
||||
|
||||
def is_valid_latitude(latitude):
|
||||
'''Latitude must be real number between -90.0 and 90.0'''
|
||||
try:
|
||||
latitude = float(latitude)
|
||||
except (ValueError, TypeError):
|
||||
return False
|
||||
|
||||
if latitude > 90.0 or latitude < -90.0 or math.isinf(latitude) or math.isnan(latitude):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def is_valid_longitude(longitude):
|
||||
'''Allow any valid real number to be a longitude'''
|
||||
try:
|
||||
longitude = float(longitude)
|
||||
except (ValueError, TypeError):
|
||||
return False
|
||||
return not math.isinf(longitude) and not math.isnan(longitude)
|
||||
|
||||
|
||||
def to_valid_latitude(latitude):
|
||||
'''Nudge latitudes at exactly ±90 to just inside the poles (±89.9999)'''
|
||||
if not is_valid_latitude(latitude):
|
||||
raise ValueError('Invalid latitude {}'.format(latitude))
|
||||
|
||||
if isclose(latitude, 90.0):
|
||||
latitude = 89.9999
|
||||
elif isclose(latitude, -90.0):
|
||||
latitude = -89.9999
|
||||
|
||||
return latitude
|
||||
|
||||
|
||||
def to_valid_longitude(longitude):
|
||||
'''Convert longitude into the -180 to 180 scale'''
|
||||
if not is_valid_longitude(longitude):
|
||||
raise ValueError('Invalid longitude {}'.format(longitude))
|
||||
|
||||
while longitude <= -180.0:
|
||||
longitude += 360.0
|
||||
|
||||
while longitude > 180.0:
|
||||
longitude -= 360.0
|
||||
|
||||
return longitude
|
||||
|
||||
|
||||
def latlon_to_decimal(latitude, longitude):
|
||||
have_lat = False
|
||||
have_lon = False
|
||||
|
||||
latitude = safe_decode(latitude).strip(u' ,;|')
|
||||
longitude = safe_decode(longitude).strip(u' ,;|')
|
||||
|
||||
latitude = latitude.replace(u',', u'.')
|
||||
longitude = longitude.replace(u',', u'.')
|
||||
|
||||
lat_dms = latitude_dms_regex.match(latitude)
|
||||
lat_dir = latitude_decimal_with_direction_regex.match(latitude)
|
||||
|
||||
if lat_dms:
|
||||
d, m, s, c = lat_dms.groups()
|
||||
sign = direction_sign(c)
|
||||
latitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
|
||||
have_lat = True
|
||||
elif lat_dir:
|
||||
d, c = lat_dir.groups()
|
||||
sign = direction_sign(c)
|
||||
latitude = int_or_float(d) * sign
|
||||
have_lat = True
|
||||
else:
|
||||
latitude = re.sub(beginning_re, u'', latitude)
|
||||
latitude = re.sub(end_re, u'', latitude)
|
||||
|
||||
lon_dms = longitude_dms_regex.match(longitude)
|
||||
lon_dir = longitude_decimal_with_direction_regex.match(longitude)
|
||||
|
||||
if lon_dms:
|
||||
d, m, s, c = lon_dms.groups()
|
||||
sign = direction_sign(c)
|
||||
longitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
|
||||
have_lon = True
|
||||
elif lon_dir:
|
||||
d, c = lon_dir.groups()
|
||||
sign = direction_sign(c)
|
||||
longitude = int_or_float(d) * sign
|
||||
have_lon = True
|
||||
else:
|
||||
longitude = re.sub(beginning_re, u'', longitude)
|
||||
longitude = re.sub(end_re, u'', longitude)
|
||||
|
||||
latitude = float(latitude)
|
||||
longitude = float(longitude)
|
||||
|
||||
if not is_valid_latitude(latitude):
|
||||
raise ValueError('Invalid latitude: {}'.format(latitude))
|
||||
|
||||
if not is_valid_longitude(longitude):
|
||||
raise ValueError('Invalid longitude: {}'.format(longitude))
|
||||
|
||||
latitude = to_valid_latitude(latitude)
|
||||
longitude = to_valid_longitude(longitude)
|
||||
|
||||
return latitude, longitude
|
||||
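
The degree/minute/second conversion above is easy to verify by hand. Spelled out for the 40°42′46″ value used in the module docstring:

degrees, minutes, seconds = 40, 42, 46
decimal = degrees + minutes / 60.0 + seconds / 3600.0
print(decimal)   # 40.712777..., matching the docstring example; an S or W hemisphere flips the sign
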
0
scripts/geodata/countries/__init__.py
Normal file
0
scripts/geodata/countries/__init__.py
Normal file
262
scripts/geodata/countries/constants.py
Normal file
262
scripts/geodata/countries/constants.py
Normal file
@@ -0,0 +1,262 @@
|
||||
import pycountry
|
||||
|
||||
|
||||
class Countries(object):
|
||||
AFGHANISTAN = 'af'
|
||||
ALAND_ISLANDS = 'ax'
|
||||
ALBANIA = 'al'
|
||||
ALGERIA = 'dz'
|
||||
AMERICAN_SAMOA = 'as'
|
||||
ANDORRA = 'ad'
|
||||
ANGOLA = 'ao'
|
||||
ANGUILLA = 'ai'
|
||||
ANTARCTICA = 'aq'
|
||||
ANTIGUA_AND_BARBUDA = 'ag'
|
||||
ARGENTINA = 'ar'
|
||||
ARMENIA = 'am'
|
||||
ARUBA = 'aw'
|
||||
AUSTRALIA = 'au'
|
||||
AUSTRIA = 'at'
|
||||
AZERBAIJAN = 'az'
|
||||
BAHAMAS = 'bs'
|
||||
BAHRAIN = 'bh'
|
||||
BANGLADESH = 'bd'
|
||||
BARBADOS = 'bb'
|
||||
BELARUS = 'by'
|
||||
BELGIUM = 'be'
|
||||
BELIZE = 'bz'
|
||||
BENIN = 'bj'
|
||||
BERMUDA = 'bm'
|
||||
BHUTAN = 'bt'
|
||||
BOLIVIA = 'bo'
|
||||
BONAIRE = 'bq'
|
||||
BOSNIA_AND_HERZEGOVINA = 'ba'
|
||||
BOTSWANA = 'bw'
|
||||
BOUVET_ISLAND = 'bv'
|
||||
BRAZIL = 'br'
|
||||
BRITISH_INDIAN_OCEAN_TERRITORY = 'io'
|
||||
BRITISH_VIRGIN_ISLANDS = 'vg'
|
||||
BRUNEI_DARUSSALAM = 'bn'
|
||||
BULGARIA = 'bg'
|
||||
BURKINA_FASO = 'bf'
|
||||
BURUNDI = 'bi'
|
||||
CAMBODIA = 'kh'
|
||||
CAMEROON = 'cm'
|
||||
CANADA = 'ca'
|
||||
CAPE_VERDE = 'cv'
|
||||
CAYMAN_ISLANDS = 'ky'
|
||||
CENTRAL_AFRICAN_REPUBLIC = 'cf'
|
||||
CHAD = 'td'
|
||||
CHILE = 'cl'
|
||||
CHINA = 'cn'
|
||||
CHRISTMAS_ISLAND = 'cx'
|
||||
COCOS_KEELING_ISLANDS = 'cc'
|
||||
COLOMBIA = 'co'
|
||||
COMOROS = 'km'
|
||||
COOK_ISLANDS = 'ck'
|
||||
COSTA_RICA = 'cr'
|
||||
COTE_DIVOIRE = 'ci'
|
||||
CROATIA = 'hr'
|
||||
CUBA = 'cu'
|
||||
CURACAO = 'cw'
|
||||
CYPRUS = 'cy'
|
||||
CZECH_REPUBLIC = 'cz'
|
||||
DENMARK = 'dk'
|
||||
DEMOCRATIC_REPUBLIC_OF_THE_CONGO = 'cd'
|
||||
DJIBOUTI = 'dj'
|
||||
DOMINICA = 'dm'
|
||||
DOMINICAN_REPUBLIC = 'do'
|
||||
ECUADOR = 'ec'
|
||||
EGYPT = 'eg'
|
||||
EL_SALVADOR = 'sv'
|
||||
EQUATORIAL_GUINEA = 'gq'
|
||||
ERITREA = 'er'
|
||||
ESTONIA = 'ee'
|
||||
ETHIOPIA = 'et'
|
||||
FALKLAND_ISLANDS_MALVINAS = 'fk'
|
||||
FAROE_ISLANDS = 'fo'
|
||||
FEDERATED_STATES_OF_MICRONESIA = 'fm'
|
||||
FIJI = 'fj'
|
||||
FINLAND = 'fi'
|
||||
FRANCE = 'fr'
|
||||
FRENCH_GUIANA = 'gf'
|
||||
FRENCH_POLYNESIA = 'pf'
|
||||
FRENCH_SOUTHERN_TERRITORIES = 'tf'
|
||||
GABON = 'ga'
|
||||
GAMBIA = 'gm'
|
||||
GEORGIA = 'ge'
|
||||
GERMANY = 'de'
|
||||
GHANA = 'gh'
|
||||
GIBRALTAR = 'gi'
|
||||
GREECE = 'gr'
|
||||
GREENLAND = 'gl'
|
||||
GRENADA = 'gd'
|
||||
GUADELOUPE = 'gp'
|
||||
GUAM = 'gu'
|
||||
GUATEMALA = 'gt'
|
||||
GUERNSEY = 'gg'
|
||||
GUINEA = 'gn'
|
||||
GUINEA_BISSAU = 'gw'
|
||||
GUYANA = 'gy'
|
||||
HAITI = 'ht'
|
||||
HEARD_ISLAND_AND_MCDONALD_ISLANDS = 'hm'
|
||||
HONDURAS = 'hn'
|
||||
HONG_KONG = 'hk'
|
||||
HUNGARY = 'hu'
|
||||
ICELAND = 'is'
|
||||
INDIA = 'in'
|
||||
INDONESIA = 'id'
|
||||
IRAN = 'ir'
|
||||
IRAQ = 'iq'
|
||||
IRELAND = 'ie'
|
||||
ISLE_OF_MAN = 'im'
|
||||
ISRAEL = 'il'
|
||||
ITALY = 'it'
|
||||
JAMAICA = 'jm'
|
||||
JAPAN = 'jp'
|
||||
JERSEY = 'je'
|
||||
JORDAN = 'jo'
|
||||
KAZAKHSTAN = 'kz'
|
||||
KENYA = 'ke'
|
||||
KIRIBATI = 'ki'
|
||||
KUWAIT = 'kw'
|
||||
KYRGYZSTAN = 'kg'
|
||||
LAOS = 'la'
|
||||
LATVIA = 'lv'
|
||||
LEBANON = 'lb'
|
||||
LESOTHO = 'ls'
|
||||
LIBERIA = 'lr'
|
||||
LIBYA = 'ly'
|
||||
LIECHTENSTEIN = 'li'
|
||||
LITHUANIA = 'lt'
|
||||
LUXEMBOURG = 'lu'
|
||||
MACAO = 'mo'
|
||||
MACEDONIA = 'mk'
|
||||
MADAGASCAR = 'mg'
|
||||
MALAWI = 'mw'
|
||||
MALAYSIA = 'my'
|
||||
MALDIVES = 'mv'
|
||||
MALI = 'ml'
|
||||
MALTA = 'mt'
|
||||
MARSHALL_ISLANDS = 'mh'
|
||||
MARTINIQUE = 'mq'
|
||||
MAURITANIA = 'mr'
|
||||
MAURITIUS = 'mu'
|
||||
MAYOTTE = 'yt'
|
||||
MEXICO = 'mx'
|
||||
MOLDOVA = 'md'
|
||||
MONACO = 'mc'
|
||||
MONGOLIA = 'mn'
|
||||
MONTENEGRO = 'me'
|
||||
MONTSERRAT = 'ms'
|
||||
MOROCCO = 'ma'
|
||||
MOZAMBIQUE = 'mz'
|
||||
MYANMAR = 'mm'
|
||||
NAMIBIA = 'na'
|
||||
NAURU = 'nr'
|
||||
NEPAL = 'np'
|
||||
NETHERLANDS = 'nl'
|
||||
NEW_CALEDONIA = 'nc'
|
||||
NEW_ZEALAND = 'nz'
|
||||
NICARAGUA = 'ni'
|
||||
NIGER = 'ne'
|
||||
NIGERIA = 'ng'
|
||||
NIUE = 'nu'
|
||||
NORFOLK_ISLAND = 'nf'
|
||||
NORTH_KOREA = 'kp'
|
||||
NORTHERN_MARIANA_ISLANDS = 'mp'
|
||||
NORWAY = 'no'
|
||||
OMAN = 'om'
|
||||
PAKISTAN = 'pk'
|
||||
PALAU = 'pw'
|
||||
PALESTINE = 'ps'
|
||||
PANAMA = 'pa'
|
||||
PAPUA_NEW_GUINEA = 'pg'
|
||||
PARAGUAY = 'py'
|
||||
PERU = 'pe'
|
||||
PHILIPPINES = 'ph'
|
||||
PITCAIRN_ISLANDS = 'pn'
|
||||
POLAND = 'pl'
|
||||
PORTUGAL = 'pt'
|
||||
PUERTO_RICO = 'pr'
|
||||
QATAR = 'qa'
|
||||
REPUBLIC_OF_CONGO = 'cg'
|
||||
REUNION = 're'
|
||||
ROMANIA = 'ro'
|
||||
RUSSIA = 'ru'
|
||||
RWANDA = 'rw'
|
||||
SAINT_BARTHELEMY = 'bl'
|
||||
SAINT_HELENA_ASCENSION_AND_TRISTAN_DA_CUNHA = 'sh'
|
||||
SAINT_KITTS_AND_NEVIS = 'kn'
|
||||
SAINT_LUCIA = 'lc'
|
||||
SAINT_MARTIN = 'mf'
|
||||
SAINT_PIERRE_AND_MIQUELON = 'pm'
|
||||
SAINT_VINCENT_AND_THE_GRENADINES = 'vc'
|
||||
SAMOA = 'ws'
|
||||
SAN_MARINO = 'sm'
|
||||
SAO_TOME_AND_PRINCIPE = 'st'
|
||||
SAUDI_ARABIA = 'sa'
|
||||
SENEGAL = 'sn'
|
||||
SERBIA = 'rs'
|
||||
SEYCHELLES = 'sc'
|
||||
SIERRA_LEONE = 'sl'
|
||||
SINGAPORE = 'sg'
|
||||
SINT_MAARTEN = 'sx'
|
||||
SLOVAKIA = 'sk'
|
||||
SLOVENIA = 'si'
|
||||
SOLOMON_ISLANDS = 'sb'
|
||||
SOMALIA = 'so'
|
||||
SOUTH_AFRICA = 'za'
|
||||
SOUTH_GEORGIA_AND_THE_SOUTH_SANDWICH_ISLANDS = 'gs'
|
||||
SOUTH_KOREA = 'kr'
|
||||
SOUTH_SUDAN = 'ss'
|
||||
SPAIN = 'es'
|
||||
SRI_LANKA = 'lk'
|
||||
SUDAN = 'sd'
|
||||
SURINAME = 'sr'
|
||||
SVALBARD_AND_JAN_MAYEN = 'sj'
|
||||
SWAZILAND = 'sz'
|
||||
SWEDEN = 'se'
|
||||
SWITZERLAND = 'ch'
|
||||
SYRIA = 'sy'
|
||||
TAIWAN = 'tw'
|
||||
TAJIKISTAN = 'tj'
|
||||
TANZANIA = 'tz'
|
||||
THAILAND = 'th'
|
||||
TIMOR_LESTE = 'tl'
|
||||
TOGO = 'tg'
|
||||
TOKELAU = 'tk'
|
||||
TONGA = 'to'
|
||||
TRINIDAD_AND_TOBAGO = 'tt'
|
||||
TUNISIA = 'tn'
|
||||
TURKEY = 'tr'
|
||||
TURKMENISTAN = 'tm'
|
||||
TURKS_AND_CAICOS_ISLANDS = 'tc'
|
||||
TUVALU = 'tv'
|
||||
UGANDA = 'ug'
|
||||
UKRAINE = 'ua'
|
||||
UNITED_ARAB_EMIRATES = 'ae'
|
||||
UNITED_KINGDOM = 'gb'
|
||||
UNITED_STATES = 'us'
|
||||
UNITED_STATES_MINOR_OUTLYING_ISLANDS = 'um'
|
||||
URUGUAY = 'uy'
|
||||
US_VIRGIN_ISLANDS = 'vi'
|
||||
UZBEKISTAN = 'uz'
|
||||
VANUATU = 'vu'
|
||||
VATICAN = 'va'
|
||||
VENEZUELA = 've'
|
||||
VIETNAM = 'vn'
|
||||
WALLIS_AND_FUTUNA = 'wf'
|
||||
WESTERN_SAHARA = 'eh'
|
||||
YEMEN = 'ye'
|
||||
ZAMBIA = 'zm'
|
||||
ZIMBABWE = 'zw'
|
||||
|
||||
FORMER_SOVIET_UNION_COUNTRIES = set([RUSSIA, UKRAINE, BELARUS, KAZAKHSTAN, AZERBAIJAN, KYRGYZSTAN, GEORGIA, UZBEKISTAN, ARMENIA, TAJIKISTAN, MOLDOVA, TURKMENISTAN, LATVIA, LITHUANIA, ESTONIA])
|
||||
CJK_COUNTRIES = set([CHINA, JAPAN, SOUTH_KOREA, TAIWAN, HONG_KONG, MACAO])
|
||||
|
||||
all_country_iso_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
||||
|
||||
@classmethod
|
||||
def is_valid_country_code(cls, alpha2_code):
|
||||
return alpha2_code and alpha2_code.lower() in cls.all_country_iso_codes
|
||||
187
scripts/geodata/countries/names.py
Normal file
187
scripts/geodata/countries/names.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
|
||||
import pycountry
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
|
||||
from lxml import etree
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||
from geodata.i18n.languages import *
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
|
||||
|
||||
COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'countries', 'names.yaml')
|
||||
|
||||
IGNORE_COUNTRIES = set([six.u('ZZ')])
|
||||
|
||||
COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
|
||||
COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])
|
||||
|
||||
LANGUAGE_COUNTRY_OVERRIDES = {
|
||||
'en': {
|
||||
'CD': safe_decode('Democratic Republic of the Congo'),
|
||||
'CG': safe_decode('Republic of the Congo'),
|
||||
},
|
||||
|
||||
# Countries where the local language is absent from CLDR
|
||||
|
||||
# Tajik / Tajikistan
|
||||
'tg': {
|
||||
'TJ': safe_decode('Тоҷикистон'),
|
||||
},
|
||||
|
||||
# Maldivan / Maldives
|
||||
'dv': {
|
||||
'MV': safe_decode('ދިވެހިރާއްޖެ'),
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
class CountryNames(object):
|
||||
def __init__(self, base_dir=CLDR_MAIN_PATH):
|
||||
self.base_dir = base_dir
|
||||
|
||||
self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
|
||||
self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}
|
||||
|
||||
self.language_country_names = {}
|
||||
self.country_language_names = defaultdict(dict)
|
||||
|
||||
self.country_official_names = defaultdict(OrderedDict)
|
||||
self.country_local_names = defaultdict(OrderedDict)
|
||||
|
||||
local_languages = {}
|
||||
|
||||
country_local_language_names = defaultdict(dict)
|
||||
|
||||
for filename in os.listdir(base_dir):
|
||||
lang = filename.split('.xml')[0]
|
||||
if len(lang) > 3:
|
||||
continue
|
||||
|
||||
names = self.cldr_country_names(lang)
|
||||
lang = lang.lower()
|
||||
self.language_country_names[lang] = names
|
||||
|
||||
for country, name in names.iteritems():
|
||||
country = country.lower()
|
||||
|
||||
languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
|
||||
local_languages[country] = languages
|
||||
|
||||
self.country_language_names[country.lower()][lang.lower()] = name
|
||||
|
||||
if lang in local_languages.get(country, {}):
|
||||
country_local_language_names[country][lang] = name
|
||||
|
||||
for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
|
||||
if l not in self.language_country_names:
|
||||
self.language_country_names[l.lower()] = names
|
||||
|
||||
for c, name in six.iteritems(names):
|
||||
self.country_language_names[c.lower()][l.lower()] = name
|
||||
if c.lower() not in country_local_language_names:
|
||||
country_local_language_names[c.lower()][l.lower()] = name
|
||||
|
||||
for country, langs in six.iteritems(local_languages):
|
||||
names = country_local_language_names[country]
|
||||
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
|
||||
for i, (lang, default) in enumerate(langs.iteritems()):
|
||||
name = names.get(lang)
|
||||
if not name:
|
||||
continue
|
||||
if default or num_defaults == 0:
|
||||
self.country_official_names[country][lang] = name
|
||||
if num_defaults == 0:
|
||||
break
|
||||
self.country_local_names[country][lang] = name
|
||||
|
||||
def cldr_country_names(self, language):
|
||||
'''
|
||||
Country names are tricky as there can be several versions
|
||||
and levels of verbosity e.g. United States of America
|
||||
vs. the more commonly used United States. Most countries
|
||||
have a similarly verbose form.
|
||||
|
||||
The CLDR repo (http://cldr.unicode.org/) has the most
|
||||
comprehensive localized database of country names
|
||||
(among other things), organized by language. This function
|
||||
parses CLDR XML for a given language and returns a dictionary
|
||||
of {country_code: name} for that language.
|
||||
'''
|
||||
filename = os.path.join(self.base_dir, '{}.xml'.format(language))
|
||||
xml = etree.parse(open(filename))
|
||||
|
||||
country_names = defaultdict(dict)
|
||||
|
||||
for territory in xml.xpath('*//territories/*'):
|
||||
country_code = territory.attrib['type']
|
||||
|
||||
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
|
||||
continue
|
||||
|
||||
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
|
||||
|
||||
display_names = {}
|
||||
|
||||
for country_code, names in country_names.iteritems():
|
||||
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
|
||||
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
|
||||
continue
|
||||
|
||||
default_name = names.get(None)
|
||||
|
||||
if country_code in COUNTRY_USE_SHORT_NAME:
|
||||
display_names[country_code] = names.get('short', default_name)
|
||||
elif country_code in COUNTRY_USE_VARIANT_NAME:
|
||||
display_names[country_code] = names.get('variant', default_name)
|
||||
elif default_name is not None:
|
||||
display_names[country_code] = default_name
|
||||
|
||||
return display_names
|
||||
|
||||
def localized_name(self, country_code, language=None):
|
||||
'''
|
||||
Get the display name for a country code in the local language
|
||||
e.g. Россия for Russia, España for Spain, etc.
|
||||
|
||||
For most countries there is a single official name. For countries
|
||||
with more than one official language, this will return a concatenated
|
||||
version separated by a slash e.g. Maroc / المغرب for Morocco.
|
||||
|
||||
Note that all of the exceptions in road_sign_languages.tsv are also
|
||||
taken into account here so India for example uses the English name
|
||||
rather than concatenating all 27 toponyms.
|
||||
|
||||
This method should be roughly consistent with OSM's display names.
|
||||
|
||||
Usage:
|
||||
>>> country_names.localized_name('jp') # returns '日本'
|
||||
>>> country_names.localized_name('be') # returns 'België / Belgique / Belgien'
|
||||
'''
|
||||
|
||||
country_code = country_code.lower()
|
||||
if language is None:
|
||||
return six.u(' / ').join(OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
|
||||
for n in self.country_official_names[country_code].values()).keys())
|
||||
else:
|
||||
return self.country_language_names.get(country_code, {}).get(language)
|
||||
|
||||
def alpha3_code(self, alpha2_code):
|
||||
alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
|
||||
return alpha3.upper() if alpha3 else None
|
||||
|
||||
def iso_3166_name(self, alpha2_code):
|
||||
return self.iso_3166_names.get(alpha2_code.lower())
|
||||
|
||||
country_names = CountryNames()
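# Illustrative usage sketch, not part of the original commit. It assumes the
# CLDR common/main XML files are present under CLDR_MAIN_PATH when the module
# is imported; the expected values follow the docstrings above and pycountry,
# so treat them as approximate.
if __name__ == '__main__':
    print(country_names.localized_name('jp'))        # expected: 日本
    print(country_names.localized_name('be'))        # expected: België / Belgique / Belgien
    print(country_names.localized_name('de', 'en'))  # per-language lookup, e.g. Germany
    print(country_names.alpha3_code('us'))           # USA
    print(country_names.iso_3166_name('fr'))         # France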
|
||||
16
scripts/geodata/csv_utils.py
Normal file
@@ -0,0 +1,16 @@
import csv
import re
from encoding import safe_encode, safe_decode

newline_regex = re.compile('\r\n|\r|\n')

csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')


def tsv_string(s):
    return safe_encode(newline_regex.sub(u', ', safe_decode(s).strip()).replace(u'\t', u' '))


def unicode_csv_reader(filename, **kw):
    for line in csv.reader(filename, **kw):
        yield [unicode(c, 'utf-8') for c in line]
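# Added illustrative usage sketch, not part of the original commit; the file
# name below is a hypothetical placeholder.
#   tsv_string(u'123 Main St\nApt 4')  ->  '123 Main St, Apt 4'
#   with open('places.tsv') as f:
#       for row in unicode_csv_reader(f, dialect='tsv_no_quote'):
#           print(u'\t'.join(row))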
0
scripts/geodata/distance/__init__.py
Normal file
33
scripts/geodata/distance/haversine.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import math
|
||||
|
||||
EARTH_RADIUS_KM = 6373
|
||||
|
||||
|
||||
def haversine_distance(lat1, lon1, lat2, lon2, radius=EARTH_RADIUS_KM):
|
||||
"""Calculate the Haversine distance between two lat/lon pairs, given by:
|
||||
a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
|
||||
c = 2 ⋅ atan2( √a, √(1−a) )
|
||||
d = R ⋅ c
|
||||
|
||||
where R is the radius of the Earth (in kilometers). By default we use 6373 km,
|
||||
a radius optimized for calculating distances at approximately 39 degrees from
|
||||
the equator i.e. Washington, DC
|
||||
|
||||
:param lat1: first latitude
|
||||
:param lon1: first longitude (use negative range for longitudes West of the Prime Meridian)
|
||||
:param lat2: second latitude
|
||||
:param lon2: second longitude (use negative range for longitudes West of the Prime Meridian)
|
||||
:param radius: radius of the Earth in (miles|kilometers) depending on the desired units
|
||||
"""
|
||||
lat1 = math.radians(lat1)
|
||||
lat2 = math.radians(lat2)
|
||||
lon1 = math.radians(lon1)
|
||||
lon2 = math.radians(lon2)
|
||||
|
||||
dlon = lon2 - lon1
|
||||
dlat = lat2 - lat1
|
||||
a = (math.sin(dlat / 2.0)) ** 2 + math.cos(lat1) * math.cos(lat2) * (math.sin(dlon/2.0)) ** 2
|
||||
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
|
||||
d = radius * c
|
||||
return d
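# Illustrative worked example, not part of the original commit. The coordinates
# are approximate city centers used only as sample inputs.
if __name__ == '__main__':
    # Washington, DC to New York City
    km = haversine_distance(38.9072, -77.0369, 40.7128, -74.0060)
    print('{:.0f} km'.format(km))    # roughly 328 km with the default 6373 km radius

    # passing an Earth radius in miles (~3959) yields the distance in miles instead
    mi = haversine_distance(38.9072, -77.0369, 40.7128, -74.0060, radius=3959)
    print('{:.0f} miles'.format(mi))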
|
||||
34
scripts/geodata/encoding.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import six
|
||||
|
||||
text_type = six.text_type
|
||||
string_types = six.string_types
|
||||
binary_type = six.binary_type
|
||||
|
||||
|
||||
def safe_decode(value, encoding='utf-8', errors='strict'):
|
||||
if isinstance(value, text_type):
|
||||
return value
|
||||
|
||||
if isinstance(value, (string_types, binary_type)):
|
||||
return value.decode(encoding, errors)
|
||||
else:
|
||||
return binary_type(value).decode(encoding, errors)
|
||||
|
||||
|
||||
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
|
||||
if not isinstance(value, (string_types, binary_type)):
|
||||
return binary_type(value)
|
||||
|
||||
if isinstance(value, text_type):
|
||||
return value.encode(encoding, errors)
|
||||
else:
|
||||
if hasattr(incoming, 'lower'):
|
||||
incoming = incoming.lower()
|
||||
if hasattr(encoding, 'lower'):
|
||||
encoding = encoding.lower()
|
||||
|
||||
if value and encoding != incoming:
|
||||
value = safe_decode(value, encoding, errors)
|
||||
return value.encode(encoding, errors)
|
||||
else:
|
||||
return value
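# Illustrative usage sketch, not part of the original commit (Python 2
# semantics): both helpers are idempotent with respect to their target type,
# so callers can pass either text or bytes without checking first.
if __name__ == '__main__':
    s = u'caf\xe9'                              # u'café'
    assert safe_decode(s) is s                  # already text: returned unchanged
    assert safe_encode(s) == s.encode('utf-8')  # text -> UTF-8 bytes
    assert safe_decode(safe_encode(s)) == s     # bytes -> text round trip
    assert safe_encode(123) == b'123'           # non-strings coerced through binary_type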
|
||||
62
scripts/geodata/enum.py
Normal file
@@ -0,0 +1,62 @@
|
||||
|
||||
class EnumValue(object):
|
||||
def __init__(self, value, name=None):
|
||||
self.value = value
|
||||
self.name = name
|
||||
|
||||
def __hash__(self):
|
||||
return self.value
|
||||
|
||||
def __cmp__(self, other):
|
||||
if isinstance(other, EnumValue):
|
||||
return self.value.__cmp__(other.value)
|
||||
else:
|
||||
return self.value.__cmp__(other)
|
||||
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def __repr__(self):
|
||||
return self.name
|
||||
|
||||
|
||||
class EnumMeta(type):
|
||||
def __init__(self, name, bases, dict_):
|
||||
self.registry = self.registry.copy()
|
||||
self.name_registry = self.name_registry.copy()
|
||||
for k, v in dict_.iteritems():
|
||||
if isinstance(v, EnumValue) and v not in self.registry:
|
||||
if v.name is None:
|
||||
v.name = k
|
||||
self.registry[v.value] = v
|
||||
self.name_registry[v.name] = v
|
||||
return super(EnumMeta, self).__init__(name, bases, dict_)
|
||||
|
||||
def __iter__(self):
|
||||
return self.registry.itervalues()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.registry[key]
|
||||
|
||||
|
||||
class Enum(object):
|
||||
__metaclass__ = EnumMeta
|
||||
registry = {}
|
||||
name_registry = {}
|
||||
|
||||
@classmethod
|
||||
def from_id(cls, value):
|
||||
try:
|
||||
return cls.registry[value]
|
||||
except KeyError:
|
||||
raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))
|
||||
|
||||
@classmethod
|
||||
def from_string(cls, name):
|
||||
try:
|
||||
return cls.name_registry[name]
|
||||
except KeyError:
|
||||
raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))
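# Illustrative sketch, not part of the original commit. The metaclass keeps one
# registry keyed by numeric value and one keyed by name, so members can be
# looked up either way. Color is a hypothetical enum, not one defined in this repo.
if __name__ == '__main__':
    class Color(Enum):
        RED = EnumValue(0)
        GREEN = EnumValue(1)
        BLUE = EnumValue(2, name='blue')   # explicit name overrides the attribute name

    assert Color.from_id(1) is Color.GREEN
    assert Color.from_string('RED') is Color.RED
    assert str(Color.BLUE) == 'blue'
    assert sorted(v.value for v in Color) == [0, 1, 2]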
|
||||
38
scripts/geodata/file_utils.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import os
|
||||
import subprocess
|
||||
import six
|
||||
|
||||
|
||||
def download_file(url, dest, retries=3, retry_delay=5):
|
||||
ensure_dir(os.path.dirname(dest))
|
||||
return subprocess.check_output(['curl', url, '-L', '-w', '%{http_code}',
|
||||
'--retry', six.text_type(retries),
|
||||
'--retry-delay', six.text_type(retry_delay),
|
||||
'-o', dest, '--silent']) == '200'
|
||||
|
||||
|
||||
def unzip_file(filename, dest):
|
||||
ensure_dir(dest)
|
||||
return subprocess.check_call(['unzip', '-o', filename, '-d', dest]) == 0
|
||||
|
||||
|
||||
def remove_file(filename):
|
||||
os.unlink(filename)
|
||||
|
||||
|
||||
def ensure_dir(d):
|
||||
if not os.path.exists(d):
|
||||
os.makedirs(d)
|
||||
|
||||
|
||||
class cd:
|
||||
"""Context manager for changing the current working directory"""
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
|
||||
def __enter__(self):
|
||||
self.saved_path = os.getcwd()
|
||||
os.chdir(self.path)
|
||||
|
||||
def __exit__(self, etype, value, traceback):
|
||||
os.chdir(self.saved_path)
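# Illustrative sketch tying the helpers together, not part of the original
# commit; the URL and directories are hypothetical placeholders.
if __name__ == '__main__':
    work_dir = '/tmp/geodata_example'
    ensure_dir(work_dir)
    archive = os.path.join(work_dir, 'archive.zip')
    if download_file('http://example.com/archive.zip', archive):
        unzip_file(archive, work_dir)
    with cd(work_dir):
        print(os.getcwd())   # temporarily inside work_dir
    # the previous working directory is restored when the block exits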
|
||||
0
scripts/geodata/geonames/__init__.py
Normal file
688
scripts/geodata/geonames/create_geonames_tsv.py
Normal file
@@ -0,0 +1,688 @@
|
||||
'''
|
||||
create_geonames_tsv.py
|
||||
----------------------
|
||||
|
||||
This script formats the open GeoNames database (as well as
|
||||
its accompanying postal codes data set) into a schema'd
|
||||
tab-separated value file.
|
||||
|
||||
It generates a C header which uses an enum for the field names.
|
||||
This way if new fields are added or there's a typo, etc. the
|
||||
error will show up at compile-time.
|
||||
|
||||
The relevant C modules which operate on this data are:
|
||||
geodb_builder.c
|
||||
geonames.c
|
||||
|
||||
As well as the generated headers:
|
||||
geonames_fields.h
|
||||
postal_fields.h
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pycountry
|
||||
|
||||
import unicodedata
|
||||
|
||||
import urllib
|
||||
import urlparse
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
from lxml import etree
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.csv_utils import *
|
||||
from geodata.file_utils import *
|
||||
from geodata.countries.country_names import *
|
||||
from geodata.encoding import safe_encode, safe_decode
|
||||
from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
|
||||
from geodata.i18n.languages import *
|
||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||
from geodata.log import log_to_file
|
||||
|
||||
multispace_regex = re.compile('[\s]+')
|
||||
|
||||
|
||||
def encode_field(value):
|
||||
return multispace_regex.sub(' ', safe_encode((value if value is not None else '')))
|
||||
|
||||
log_to_file(sys.stderr)
|
||||
|
||||
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
|
||||
os.path.pardir, 'data', 'geonames')
|
||||
|
||||
COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
|
||||
CONTINENT_FEATURE_CODES = ('CONT',)
|
||||
|
||||
ADMIN_1_FEATURE_CODES = ('ADM1',)
|
||||
ADMIN_2_FEATURE_CODES = ('ADM2',)
|
||||
ADMIN_3_FEATURE_CODES = ('ADM3',)
|
||||
ADMIN_4_FEATURE_CODES = ('ADM4',)
|
||||
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
|
||||
ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
|
||||
|
||||
POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
|
||||
'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
|
||||
'PPLR', 'PPLS', 'STLMT')
|
||||
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
|
||||
|
||||
|
||||
class boundary_types:
|
||||
COUNTRY = 0
|
||||
ADMIN1 = 1
|
||||
ADMIN2 = 2
|
||||
ADMIN3 = 3
|
||||
ADMIN4 = 4
|
||||
ADMIN_OTHER = 5
|
||||
LOCALITY = 6
|
||||
NEIGHBORHOOD = 7
|
||||
|
||||
geonames_admin_dictionaries = OrderedDict([
|
||||
(boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
|
||||
(boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
|
||||
(boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
|
||||
(boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
|
||||
(boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
|
||||
(boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
|
||||
(boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
|
||||
(boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
|
||||
])
|
||||
|
||||
# Inserted post-query
|
||||
DUMMY_BOUNDARY_TYPE = '-1 as type'
|
||||
DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'
|
||||
DUMMY_LANGUAGE_PRIORITY = '0 as language_priority'
|
||||
|
||||
|
||||
class GeonamesField(object):
|
||||
def __init__(self, name, c_constant, default=None, is_dummy=False):
|
||||
self.name = name
|
||||
self.c_constant = c_constant
|
||||
self.default = default
|
||||
self.is_dummy = is_dummy
|
||||
|
||||
geonames_fields = [
|
||||
# Field if alternate_names present, default field name if not, C header constant
|
||||
GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
|
||||
GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
|
||||
GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
|
||||
GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
|
||||
GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
|
||||
GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
|
||||
GeonamesField(DUMMY_LANGUAGE_PRIORITY, 'GEONAMES_LANGUAGE_PRIORITY', is_dummy=True),
|
||||
GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
|
||||
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
|
||||
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
|
||||
GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
|
||||
GeonamesField('gn.population', 'GEONAMES_POPULATION'),
|
||||
GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'),
|
||||
GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'),
|
||||
GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'),
|
||||
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
|
||||
GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'),
|
||||
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
|
||||
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
|
||||
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
|
||||
GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
|
||||
GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
|
||||
GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
|
||||
GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
|
||||
GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
|
||||
]
|
||||
|
||||
def geonames_field_index(s):
|
||||
for i, f in enumerate(geonames_fields):
|
||||
if f.c_constant == s:
|
||||
return i
|
||||
return None
|
||||
|
||||
|
||||
DUMMY_BOUNDARY_TYPE_INDEX = geonames_field_index('GEONAMES_BOUNDARY_TYPE')
|
||||
DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = geonames_field_index('GEONAMES_HAS_WIKIPEDIA_ENTRY')
|
||||
|
||||
GEONAMES_ID_INDEX = geonames_field_index('GEONAMES_ID')
|
||||
LANGUAGE_INDEX = geonames_field_index('GEONAMES_ISO_LANGUAGE')
|
||||
|
||||
DUMMY_LANGUAGE_PRIORITY_INDEX = geonames_field_index('GEONAMES_LANGUAGE_PRIORITY')
|
||||
|
||||
CANONICAL_NAME_INDEX = geonames_field_index('GEONAMES_CANONICAL')
|
||||
|
||||
NAME_INDEX = geonames_field_index('GEONAMES_NAME')
|
||||
|
||||
COUNTRY_CODE_INDEX = geonames_field_index('GEONAMES_COUNTRY_CODE')
|
||||
|
||||
POPULATION_INDEX = geonames_field_index('GEONAMES_POPULATION')
|
||||
|
||||
PREFERRED_INDEX = geonames_field_index('GEONAMES_IS_PREFERRED_NAME')
|
||||
|
||||
HISTORICAL_INDEX = geonames_field_index('GEONAMES_IS_HISTORICAL')
|
||||
|
||||
|
||||
geonames_admin_joins = '''
|
||||
left join admin1_codes a1
|
||||
on a1.code = gn.admin1_code
|
||||
and a1.country_code = gn.country_code
|
||||
left join admin2_codes a2
|
||||
on a2.code = gn.admin2_code
|
||||
and a2.admin1_code = gn.admin1_code
|
||||
and a2.country_code = gn.country_code
|
||||
left join admin3_codes a3
|
||||
on a3.code = gn.admin3_code
|
||||
and a3.admin1_code = gn.admin1_code
|
||||
and a3.admin2_code = gn.admin2_code
|
||||
and a3.country_code = gn.country_code
|
||||
left join admin4_codes a4
|
||||
on a4.code = gn.admin4_code
|
||||
and a4.admin1_code = gn.admin1_code
|
||||
and a4.admin2_code = gn.admin2_code
|
||||
and a4.admin3_code = gn.admin3_code
|
||||
and a4.country_code = gn.country_code
|
||||
'''
|
||||
|
||||
# Canonical names are stored in the geonames table with alternates
|
||||
# stored in a separate table. UNION ALL query will capture them all.
|
||||
|
||||
base_geonames_query = '''
|
||||
select {geonames_fields}
|
||||
from geonames gn
|
||||
join countries c
|
||||
on gn.country_code = c.country_code
|
||||
{admin_joins}
|
||||
{{predicate}}
|
||||
union all
|
||||
select {alt_name_fields}
|
||||
from geonames gn
|
||||
join countries c
|
||||
on gn.country_code = c.country_code
|
||||
join alternate_names an
|
||||
on an.geonames_id = gn.geonames_id
|
||||
and iso_language not in ('doi','faac','iata',
|
||||
'icao','link','post','tcid')
|
||||
and an.alternate_name != gn.name
|
||||
{admin_joins}
|
||||
{{predicate}}
|
||||
'''.format(
|
||||
geonames_fields=', '.join((f.name if f.default is None else
|
||||
'{} as {}'.format(f.default, f.name)
|
||||
for f in geonames_fields)),
|
||||
alt_name_fields=', '.join((f.name for f in geonames_fields)),
|
||||
admin_joins=geonames_admin_joins
|
||||
)
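# Note (added for clarity, not in the original file): the .format() call above
# fills in the column lists and admin joins but deliberately leaves {predicate}
# unexpanded (it is written as {{predicate}}), so each boundary type can supply
# its own WHERE clause later, e.g.
#   base_geonames_query.format(predicate='where gn.feature_code in ("ADM1")')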
|
||||
|
||||
IGNORE_COUNTRY_POSTAL_CODES = set([
|
||||
'AR', # GeoNames has pre-1999 postal codes
|
||||
])
|
||||
|
||||
postal_code_fields = [
|
||||
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
|
||||
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
|
||||
GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'),
|
||||
GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'),
|
||||
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
|
||||
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
|
||||
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
|
||||
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
|
||||
]
|
||||
|
||||
def postal_code_field_index(s):
|
||||
for i, f in enumerate(postal_code_fields):
|
||||
if f.c_constant == s:
|
||||
return i
|
||||
return None
|
||||
|
||||
POSTAL_CODE_INDEX = postal_code_field_index('GN_POSTAL_CODE')
|
||||
POSTAL_CODE_POP_INDEX = postal_code_field_index('GN_POSTAL_COUNTRY_POPULATION')
|
||||
|
||||
postal_codes_query = '''
|
||||
select
|
||||
{fields}
|
||||
from postal_codes p
|
||||
join countries c
|
||||
on p.country_code = c.country_code
|
||||
left join (
|
||||
select
|
||||
gn.geonames_id,
|
||||
alternate_name,
|
||||
country_code,
|
||||
gn.name
|
||||
from alternate_names an
|
||||
join geonames gn
|
||||
on an.geonames_id = gn.geonames_id
|
||||
where iso_language = 'post'
|
||||
) as n
|
||||
on p.postal_code = n.alternate_name
|
||||
and p.country_code = n.country_code
|
||||
left join admin1_codes a1
|
||||
on a1.code = p.admin1_code
|
||||
and p.country_code = a1.country_code
|
||||
left join admin2_codes a2
|
||||
on a2.code = p.admin2_code
|
||||
and a2.admin1_code = p.admin1_code
|
||||
and a2.country_code = p.country_code
|
||||
left join admin3_codes a3
|
||||
on a3.code = p.admin3_code
|
||||
and a3.admin1_code = p.admin1_code
|
||||
and a3.admin2_code = p.admin2_code
|
||||
and a3.country_code = p.country_code
|
||||
where p.country_code not in ({exclude_country_codes})
|
||||
group by postal_code, p.country_code
|
||||
'''.format(
|
||||
fields=','.join([f.name for f in postal_code_fields]),
|
||||
exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
|
||||
|
||||
|
||||
wikipedia_query = '''
|
||||
select alternate_name, geonames_id, is_preferred_name
|
||||
from alternate_names
|
||||
where iso_language = 'link'
|
||||
and alternate_name like '%%en.wikipedia%%'
|
||||
order by alternate_name, is_preferred_name
|
||||
'''
|
||||
|
||||
BATCH_SIZE = 2000
|
||||
|
||||
|
||||
wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
|
||||
|
||||
|
||||
def normalize_wikipedia_title(title):
|
||||
return safe_decode(title).replace(u'_', u' ')
|
||||
|
||||
|
||||
def normalize_wikipedia_url(url):
|
||||
url = urllib.unquote_plus(url)
|
||||
|
||||
parsed = urlparse.urlsplit(url)
|
||||
if parsed.query:
|
||||
params = urlparse.parse_qs(parsed.query)
|
||||
if 'title' in params:
|
||||
return normalize_wikipedia_title(params['title'][0])
|
||||
|
||||
title = parsed.path.rsplit('/', 1)[-1]
|
||||
if title not in ('index.php', 'index.html'):
|
||||
return normalize_wikipedia_title(title)
|
||||
|
||||
return None
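# Added illustrative examples, not in the original file; the URLs are hypothetical:
#   normalize_wikipedia_url('http://en.wikipedia.org/wiki/New_York_City')
#       -> u'New York City'   (path-style link, underscores become spaces)
#   normalize_wikipedia_url('http://en.wikipedia.org/w/index.php?title=New_York_City')
#       -> u'New York City'   (index.php-style link, title read from the query string)
#   normalize_wikipedia_url('http://en.wikipedia.org/w/index.php')
#       -> None               (no recoverable title)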
|
||||
|
||||
|
||||
def normalize_name(name):
|
||||
name = name.replace('&', 'and')
|
||||
name = name.replace('-', ' ')
|
||||
name = name.replace(', ', ' ')
|
||||
name = name.replace(',', ' ')
|
||||
return name
|
||||
|
||||
|
||||
saint_replacements = [
|
||||
('st.', 'saint'),
|
||||
('st.', 'st'),
|
||||
('st', 'saint')
|
||||
]
|
||||
|
||||
|
||||
abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')
|
||||
|
||||
|
||||
def normalize_display_name(name):
|
||||
return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')
|
||||
|
||||
|
||||
def utf8_normalize(s, form='NFD'):
|
||||
return unicodedata.normalize(form, s)
|
||||
|
||||
|
||||
def get_wikipedia_titles(db):
|
||||
d = defaultdict(dict)
|
||||
|
||||
cursor = db.execute(wikipedia_query)
|
||||
|
||||
while True:
|
||||
batch = cursor.fetchmany(BATCH_SIZE)
|
||||
if not batch:
|
||||
break
|
||||
|
||||
for (url, geonames_id, is_preferred) in batch:
|
||||
title = normalize_wikipedia_url(safe_encode(url))
|
||||
if title is not None and title.strip():
|
||||
title = utf8_normalize(normalize_name(title))
|
||||
d[title.lower()][geonames_id] = int(is_preferred or 0)
|
||||
|
||||
return d
|
||||
|
||||
|
||||
def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
||||
'''
|
||||
Writes geonames.tsv using the specified db to the specified data directory
|
||||
'''
|
||||
filename = os.path.join(out_dir, 'geonames.tsv')
|
||||
temp_filename = filename + '.tmp'
|
||||
|
||||
f = open(temp_filename, 'w')
|
||||
|
||||
writer = csv.writer(f, 'tsv_no_quote')
|
||||
|
||||
init_languages()
|
||||
|
||||
init_country_names()
|
||||
|
||||
wiki_titles = get_wikipedia_titles(db)
|
||||
logging.info('Fetched Wikipedia titles')
|
||||
|
||||
# Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
|
||||
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
|
||||
if boundary_type != boundary_types.COUNTRY:
|
||||
predicate = 'where gn.feature_code in ({codes})'.format(
|
||||
codes=','.join(['"{}"'.format(c) for c in codes])
|
||||
)
|
||||
else:
|
||||
# The query for countries in GeoNames is somewhat non-trivial
|
||||
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
|
||||
|
||||
query = base_geonames_query.format(
|
||||
predicate=predicate
|
||||
)
|
||||
|
||||
cursor = db.execute(query)
|
||||
i = 1
|
||||
while True:
|
||||
# Fetch rows in batches to save memory
|
||||
batch = cursor.fetchmany(BATCH_SIZE)
|
||||
if not batch:
|
||||
break
|
||||
rows = []
|
||||
for row in batch:
|
||||
row = list(row)
|
||||
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
||||
|
||||
language = row[LANGUAGE_INDEX]
|
||||
|
||||
country_code = row[COUNTRY_CODE_INDEX]
|
||||
|
||||
is_preferred = int(row[PREFERRED_INDEX] or 0)
|
||||
is_historical = int(row[HISTORICAL_INDEX] or 0)
|
||||
|
||||
lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
|
||||
lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
|
||||
null_language = not language.strip()
|
||||
|
||||
is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
|
||||
|
||||
alpha2_code = None
|
||||
is_orig_name = False
|
||||
|
||||
if boundary_type == boundary_types.COUNTRY:
|
||||
alpha2_code = row[COUNTRY_CODE_INDEX]
|
||||
|
||||
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
|
||||
# Set the canonical for countries to the local name, see country_official_name in country_names.py
|
||||
country_canonical = country_localized_display_name(alpha2_code.lower())
|
||||
if not country_canonical or not country_canonical.strip():
|
||||
raise ValueError('Could not get local canonical name for country code={}'.format(alpha2_code))
|
||||
row[CANONICAL_NAME_INDEX] = country_canonical
|
||||
|
||||
geonames_id = row[GEONAMES_ID_INDEX]
|
||||
|
||||
name = utf8_normalize(safe_decode(row[NAME_INDEX]))
|
||||
|
||||
# For non-postal codes, don't count
|
||||
if name.isdigit():
|
||||
continue
|
||||
|
||||
wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
|
||||
|
||||
row[NAME_INDEX] = name
|
||||
|
||||
if boundary_type == boundary_types.COUNTRY:
|
||||
norm_name = normalize_name(name.lower())
|
||||
for s, repl in saint_replacements:
|
||||
if not wikipedia_entries:
|
||||
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
|
||||
|
||||
wiki_row = []
|
||||
|
||||
have_wikipedia = geonames_id in wikipedia_entries
|
||||
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
|
||||
|
||||
'''
|
||||
The following set of heuristics assigns a numerical value to a given name
|
||||
alternative, such that in the case of ambiguous names, this value can be
|
||||
used as part of the ranking function (as indeed it will be during sort).
|
||||
The higher the value, the more likely the given entity resolution.
|
||||
'''
|
||||
if is_historical:
|
||||
# Historical names, unlikely to be used
|
||||
language_priority = 0
|
||||
elif not null_language and language != 'abbr' and lang_spoken is None:
|
||||
# Name of a place in language not widely spoken e.g. Japanese name for a US toponym
|
||||
language_priority = 1
|
||||
elif null_language and not is_preferred and not is_canonical:
|
||||
# Null-language alternate names not marked as preferred, dubious
|
||||
language_priority = 2
|
||||
elif language == 'abbr' and not is_preferred:
|
||||
# Abbreviation, not preferred
|
||||
language_priority = 3
|
||||
elif language == 'abbr' and is_preferred:
|
||||
# Abbreviation, preferred e.g. NYC, UAE
|
||||
language_priority = 4
|
||||
elif lang_spoken and not lang_official and not is_preferred:
|
||||
# Non-preferred name but in a spoken (non-official) language
|
||||
language_priority = 5
|
||||
elif lang_official == 1 and not is_preferred:
|
||||
# Name in an official language, not preferred
|
||||
language_priority = 6
|
||||
elif null_language and not is_preferred and is_canonical:
|
||||
# Canonical name, may be overly official e.g. Islamic Republic of Pakistan
|
||||
language_priority = 7
|
||||
elif is_preferred and not lang_official:
|
||||
# Preferred names, not an official language
|
||||
language_priority = 8
|
||||
elif is_preferred and lang_official:
|
||||
# Official language preferred
|
||||
language_priority = 9
|
||||
|
||||
row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
|
||||
|
||||
if have_wikipedia:
|
||||
wiki_row = row[:]
|
||||
wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
|
||||
rows.append(map(encode_field, wiki_row))
|
||||
|
||||
canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
|
||||
row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
|
||||
|
||||
have_normalized = False
|
||||
|
||||
if is_orig_name:
|
||||
canonical_row = wiki_row[:] if have_wikipedia else row[:]
|
||||
|
||||
canonical_row_name = normalize_display_name(name)
|
||||
if canonical_row_name != name:
|
||||
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
|
||||
have_normalized = True
|
||||
rows.append(map(encode_field, canonical_row))
|
||||
|
||||
if not have_wikipedia:
|
||||
rows.append(map(encode_field, row))
|
||||
|
||||
# Country names have more specialized logic
|
||||
if boundary_type == boundary_types.COUNTRY:
|
||||
wikipedia_entries = wiki_titles.get(canonical.lower(), {})
|
||||
|
||||
canonical_row_name = normalize_display_name(canonical)
|
||||
|
||||
canonical_row = row[:]
|
||||
|
||||
if is_orig_name:
|
||||
canonical = safe_decode(canonical)
|
||||
canonical_row[NAME_INDEX] = safe_encode(canonical)
|
||||
|
||||
norm_name = normalize_name(canonical.lower())
|
||||
for s, repl in saint_replacements:
|
||||
if not wikipedia_entries:
|
||||
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
|
||||
|
||||
if not wikipedia_entries:
|
||||
norm_name = normalize_name(canonical_row_name.lower())
|
||||
for s, repl in saint_replacements:
|
||||
if not wikipedia_entries:
|
||||
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
|
||||
|
||||
have_wikipedia = geonames_id in wikipedia_entries
|
||||
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
|
||||
|
||||
if have_wikipedia:
|
||||
canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
|
||||
|
||||
if (name != canonical):
|
||||
rows.append(map(encode_field, canonical_row))
|
||||
|
||||
if canonical_row_name != canonical and canonical_row_name != name:
|
||||
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
|
||||
rows.append(map(encode_field, canonical_row))
|
||||
|
||||
if alpha2_code and is_orig_name:
|
||||
alpha2_row = row[:]
|
||||
alpha2_row[NAME_INDEX] = alpha2_code
|
||||
alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
|
||||
rows.append(map(encode_field, alpha2_row))
|
||||
|
||||
if alpha2_code.lower() in country_alpha3_map and is_orig_name:
|
||||
alpha3_row = row[:]
|
||||
alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
|
||||
alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
|
||||
rows.append(map(encode_field, alpha3_row))
|
||||
|
||||
writer.writerows(rows)
|
||||
logging.info('Did {} batches'.format(i))
|
||||
i += 1
|
||||
|
||||
cursor.close()
|
||||
f.flush()
|
||||
|
||||
f.close()
|
||||
|
||||
logging.info('Sorting...')
|
||||
|
||||
env = os.environ.copy()
|
||||
env['LC_ALL'] = 'C'
|
||||
|
||||
command = ['sort', '-t\t', '-u', '--ignore-case',
|
||||
'-k{0},{0}'.format(NAME_INDEX + 1),
|
||||
# If there's a Wikipedia link to this name for the given id, sort first
|
||||
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
|
||||
# Language priority rules as above
|
||||
'-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1),
|
||||
# Sort descending by population (basic proxy for relevance)
|
||||
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
|
||||
# group rows for the same geonames ID together
|
||||
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
|
||||
# preferred names come first within that grouping
|
||||
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
|
||||
# since uniquing is done on the sort key, add language
|
||||
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
|
||||
'-o', filename, temp_filename]
|
||||
|
||||
p = subprocess.Popen(command, env=env)
|
||||
|
||||
return_code = p.wait()
|
||||
if return_code != 0:
|
||||
raise subprocess.CalledProcessError(return_code, command)
|
||||
|
||||
os.unlink(temp_filename)
|
||||
|
||||
|
||||
def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
||||
filename = os.path.join(out_dir, 'postal_codes.tsv')
|
||||
temp_filename = filename + '.tmp'
|
||||
f = open(temp_filename, 'w')
|
||||
|
||||
writer = csv.writer(f, 'tsv_no_quote')
|
||||
|
||||
cursor = db.execute(postal_codes_query)
|
||||
|
||||
i = 1
|
||||
while True:
|
||||
batch = cursor.fetchmany(BATCH_SIZE)
|
||||
if not batch:
|
||||
break
|
||||
rows = [
|
||||
map(encode_field, row)
|
||||
for row in batch
|
||||
]
|
||||
writer.writerows(rows)
|
||||
logging.info('Did {} batches'.format(i))
|
||||
i += 1
|
||||
|
||||
cursor.close()
|
||||
f.close()
|
||||
|
||||
logging.info('Sorting...')
|
||||
|
||||
subprocess.check_call([
|
||||
'sort', '-t\t', '--ignore-case',
|
||||
'-k{0},{0}'.format(POSTAL_CODE_INDEX + 1),
|
||||
'-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1),
|
||||
'-o', filename,
|
||||
temp_filename
|
||||
])
|
||||
os.unlink(temp_filename)
|
||||
|
||||
# Generates a C header telling us the order of the fields as written
|
||||
GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'src', 'geonames_fields.h')
|
||||
|
||||
GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
|
||||
{fields},
|
||||
NUM_GEONAMES_FIELDS
|
||||
}};
|
||||
'''.format(fields=''',
|
||||
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
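# For clarity (added comment, not in the original file): the header generated
# from this template is a single C enum whose constants follow the column order
# written to geonames.tsv, beginning roughly like:
#
#   enum geonames_fields {
#       GEONAMES_NAME=0,
#       GEONAMES_ID=1,
#       GEONAMES_CANONICAL=2,
#       ...
#       NUM_GEONAMES_FIELDS
#   };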
|
||||
|
||||
|
||||
def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
|
||||
with open(filename, 'w') as f:
|
||||
f.write(GEONAMES_FIELDS_HEADER_FILE)
|
||||
|
||||
POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'src', 'postal_fields.h')
|
||||
|
||||
POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
|
||||
{fields},
|
||||
NUM_POSTAL_FIELDS
|
||||
}};
|
||||
'''.format(fields=''',
|
||||
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))
|
||||
|
||||
|
||||
def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
|
||||
with open(filename, 'w') as f:
|
||||
f.write(POSTAL_FIELDS_HEADER_FILE)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Handle argument parsing here
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-d', '--db',
|
||||
default=DEFAULT_GEONAMES_DB_PATH,
|
||||
help='SQLite db file')
|
||||
parser.add_argument('-o', '--out',
|
||||
default=DEFAULT_DATA_DIR, help='output directory')
|
||||
args = parser.parse_args()
|
||||
db = sqlite3.connect(args.db)
|
||||
|
||||
create_geonames_tsv(db, args.out)
|
||||
create_postal_codes_tsv(db, args.out)
|
||||
write_geonames_fields_header()
|
||||
write_postal_fields_header()
|
||||
db.close()
|
||||
30
scripts/geodata/geonames/db.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class GeoNamesDB(object):
|
||||
names_query = '''
|
||||
select iso_language, alternate_name,
|
||||
is_preferred_name, is_short_name
|
||||
from alternate_names
|
||||
where geonames_id = ?
|
||||
and is_historic != '1'
|
||||
and is_colloquial != '1'
|
||||
and iso_language != 'post'
|
||||
order by iso_language, cast(is_preferred_name as integer) desc, cast(is_short_name as integer)
|
||||
'''
|
||||
|
||||
def __init__(self, filename):
|
||||
self.db = sqlite3.connect(filename)
|
||||
|
||||
def query(self, query, *params):
|
||||
return self.db.execute(query, params)
|
||||
|
||||
def get_alternate_names(self, geonames_id):
|
||||
cursor = self.query(self.names_query, geonames_id)
|
||||
language_names = defaultdict(list)
|
||||
for language, name, is_preferred, is_short in cursor:
|
||||
language_names[language].append((name,
|
||||
int(is_preferred or 0),
|
||||
int(is_short or 0)))
|
||||
return dict(language_names)
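# Illustrative usage sketch, not part of the original commit. Assumes the
# GeoNames SQLite database has already been built; 5128581 (New York City's
# GeoNames id) is used only as an example input.
if __name__ == '__main__':
    from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH

    db = GeoNamesDB(DEFAULT_GEONAMES_DB_PATH)
    names = db.get_alternate_names(5128581)
    for language, entries in sorted(names.items()):
        # entries are (name, is_preferred, is_short) tuples, preferred names first
        print('{} {}'.format(language, entries[:3]))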
|
||||
333
scripts/geodata/geonames/geonames_sqlite.py
Normal file
@@ -0,0 +1,333 @@
|
||||
import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
|
||||
import tempfile
|
||||
import urlparse
|
||||
import urllib2
|
||||
import subprocess
|
||||
|
||||
import logging
|
||||
|
||||
import argparse
|
||||
|
||||
import csv
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.geonames.paths import *
|
||||
|
||||
from geodata.file_utils import *
|
||||
from geodata.log import *
|
||||
|
||||
from itertools import islice, chain
|
||||
|
||||
log_to_file(sys.stderr)
|
||||
logger = logging.getLogger('geonames.sqlite')
|
||||
|
||||
BASE_URL = 'http://download.geonames.org/export/'
|
||||
|
||||
DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
|
||||
ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
|
||||
HIERARCHY_ZIP_FILE = 'hierarchy.zip'
|
||||
ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
|
||||
|
||||
ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
|
||||
|
||||
GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
|
||||
HIERARCHY_ZIP_FILE,
|
||||
ALTERNATE_NAMES_ZIP_FILE)
|
||||
|
||||
# base_url, local_dir, is_gzipped, local_filename
|
||||
|
||||
|
||||
GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
|
||||
(DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
|
||||
(DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
|
||||
(ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
|
||||
]
|
||||
|
||||
|
||||
def download_file(url, dest):
|
||||
logger.info('Downloading file from {}'.format(url))
|
||||
subprocess.check_call(['wget', url, '-O', dest])
|
||||
|
||||
|
||||
def admin_ddl(admin_level):
|
||||
columns = ['country_code TEXT'] + \
|
||||
['admin{}_code TEXT'.format(i)
|
||||
for i in xrange(1, admin_level)]
|
||||
|
||||
create = '''
|
||||
CREATE TABLE admin{level}_codes (
|
||||
geonames_id INT,
|
||||
code TEXT,
|
||||
name TEXT,
|
||||
{fields}
|
||||
)'''.format(level=admin_level,
|
||||
fields=''',
|
||||
'''.join(columns))
|
||||
|
||||
indices = (
|
||||
'''CREATE INDEX admin{}_code_index ON
|
||||
admin{}_codes (code)'''.format(admin_level, admin_level),
|
||||
'''CREATE INDEX admin{}_gn_id_index ON
|
||||
admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
|
||||
)
|
||||
|
||||
return (create, ) + indices
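# For clarity (added comment, not in the original file): admin_ddl(2), for
# example, yields roughly the following statements:
#
#   CREATE TABLE admin2_codes (
#       geonames_id INT,
#       code TEXT,
#       name TEXT,
#       country_code TEXT,
#       admin1_code TEXT
#   )
#   CREATE INDEX admin2_code_index ON admin2_codes (code)
#   CREATE INDEX admin2_gn_id_index ON admin2_codes (geonames_id)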
|
||||
|
||||
geonames_ddl = {
|
||||
'geonames': (
|
||||
'''CREATE TABLE geonames (
|
||||
geonames_id INT PRIMARY KEY,
|
||||
name TEXT,
|
||||
ascii_name TEXT,
|
||||
alternate_names TEXT,
|
||||
latitude DOUBLE,
|
||||
longitude DOUBLE,
|
||||
feature_class TEXT,
|
||||
feature_code TEXT,
|
||||
country_code TEXT,
|
||||
cc2 TEXT,
|
||||
admin1_code TEXT,
|
||||
admin2_code TEXT,
|
||||
admin3_code TEXT,
|
||||
admin4_code TEXT,
|
||||
population LONG DEFAULT 0,
|
||||
elevation INT,
|
||||
dem INT,
|
||||
timezone TEXT,
|
||||
modification_date TEXT)''',
|
||||
'''CREATE INDEX feature_code ON
|
||||
geonames (feature_code)''',
|
||||
'''CREATE INDEX country_code ON
|
||||
geonames (country_code)''',
|
||||
'''CREATE INDEX admin_codes ON
|
||||
geonames (country_code, admin1_code, admin2_code, admin3_code, admin4_code)''',
|
||||
),
|
||||
|
||||
'alternate_names': (
|
||||
'''CREATE TABLE alternate_names (
|
||||
alternate_name_id INT PRIMARY KEY,
|
||||
geonames_id INT,
|
||||
iso_language TEXT,
|
||||
alternate_name TEXT,
|
||||
is_preferred_name BOOLEAN DEFAULT 0,
|
||||
is_short_name BOOLEAN DEFAULT 0,
|
||||
is_colloquial BOOLEAN DEFAULT 0,
|
||||
is_historic BOOLEAN DEFAULT 0)''',
|
||||
'''CREATE INDEX geonames_id_index ON
|
||||
alternate_names (geonames_id)''',
|
||||
'''CREATE INDEX geonames_id_alt_name_index ON
|
||||
alternate_names(geonames_id, alternate_name)''',
|
||||
),
|
||||
|
||||
'hierarchy': (
|
||||
'''CREATE TABLE hierarchy (
|
||||
parent_id INT,
|
||||
child_id INT,
|
||||
type TEXT
|
||||
);''',
|
||||
'''CREATE INDEX parent_child_index ON
|
||||
hierarchy (parent_id, child_id)''',
|
||||
'''CREATE INDEX child_parent_index ON
|
||||
hierarchy (child_id, parent_id)''',
|
||||
),
|
||||
|
||||
'postal_codes': (
|
||||
'''CREATE TABLE postal_codes (
|
||||
country_code TEXT,
|
||||
postal_code TEXT,
|
||||
place_name TEXT,
|
||||
admin1 TEXT,
|
||||
admin1_code TEXT,
|
||||
admin2 TEXT,
|
||||
admin2_code TEXT,
|
||||
admin3 TEXT,
|
||||
admin3_code TEXT,
|
||||
latitude DOUBLE,
|
||||
longitude DOUBLE,
|
||||
accuracy INT
|
||||
)''',
|
||||
'''CREATE INDEX post_code_index ON
|
||||
postal_codes (country_code, postal_code)''',
|
||||
'''CREATE INDEX postal_code_admins ON
|
||||
postal_codes (country_code, admin1_code, admin2_code, admin3_code)''',
|
||||
),
|
||||
'admin1_codes': admin_ddl(1),
|
||||
'admin2_codes': admin_ddl(2),
|
||||
'admin3_codes': admin_ddl(3),
|
||||
'admin4_codes': admin_ddl(4),
|
||||
|
||||
}
|
||||
|
||||
geonames_file_table_map = {
|
||||
('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
|
||||
('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
|
||||
('', HIERARCHY_ZIP_FILE): 'hierarchy',
|
||||
('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
|
||||
}
|
||||
|
||||
|
||||
country_codes_create_table = (
|
||||
'drop table if exists country_codes',
|
||||
'''
|
||||
create table country_codes as
|
||||
select distinct country_code from geonames
|
||||
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
|
||||
''',
|
||||
)
|
||||
|
||||
proper_countries_create_table = (
|
||||
'drop table if exists proper_countries',
|
||||
'''
|
||||
create table proper_countries as
|
||||
select * from geonames
|
||||
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
|
||||
and country_code in (select country_code from country_codes)
|
||||
''',
|
||||
)
|
||||
|
||||
territories_create_table = (
|
||||
'drop table if exists territories',
|
||||
'''
|
||||
create table territories as
|
||||
select * from geonames where feature_code = 'TERR'
|
||||
and country_code not in (select country_code from proper_countries);
|
||||
''',
|
||||
)
|
||||
|
||||
countries_create_table = (
|
||||
'drop table if exists countries',
|
||||
'''
|
||||
create table countries as
|
||||
select * from proper_countries
|
||||
union
|
||||
select * from territories;
|
||||
''',
|
||||
'create index country_geonames_id on countries (geonames_id)',
|
||||
'create index country_country_code on countries (country_code)',
|
||||
)
|
||||
|
||||
country_aliases_create_table = (
|
||||
'drop table if exists country_aliases',
|
||||
'''
|
||||
create table country_aliases as
|
||||
select name, country_code
|
||||
from countries
|
||||
union
|
||||
select alternate_name, country_code
|
||||
from alternate_names an
|
||||
join countries c
|
||||
on c.geonames_id = an.geonames_id
|
||||
where alternate_name != ''
|
||||
and iso_language not in ('doi','faac','iata',
|
||||
'icao','link','post','tcid')
|
||||
'''
|
||||
)
|
||||
|
||||
country_table_create_statements = list(chain(country_codes_create_table,
|
||||
proper_countries_create_table,
|
||||
territories_create_table,
|
||||
countries_create_table,
|
||||
country_aliases_create_table))
|
||||
|
||||
|
||||
def create_table(conn, table):
|
||||
cursor = conn.cursor()
|
||||
create_statements = geonames_ddl[table]
|
||||
cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
|
||||
for statement in create_statements:
|
||||
cursor.execute(statement)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def batch_iter(iterable, batch_size):
|
||||
source_iter = iter(iterable)
|
||||
while True:
|
||||
batch = list(islice(source_iter, batch_size))
|
||||
if len(batch) > 0:
|
||||
yield batch
|
||||
else:
|
||||
return
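# Added descriptive example (not in the original file): batch_iter chunks any
# iterable into lists of at most batch_size items, which is what bounds memory
# while bulk-loading the dump files below, e.g.
#
#   list(batch_iter(xrange(7), 3))  ->  [[0, 1, 2], [3, 4, 5], [6]]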
|
||||
|
||||
|
||||
def populate_admin_table(conn, admin_level):
|
||||
logging.info('Doing admin level {}'.format(admin_level))
|
||||
|
||||
columns = ['geonames_id',
|
||||
'admin{}_code'.format(admin_level),
|
||||
'name',
|
||||
'country_code']
|
||||
columns.extend(['admin{}_code'.format(i)
|
||||
for i in xrange(1, admin_level)])
|
||||
|
||||
admin_insert_statement = '''
|
||||
insert into "admin{}_codes"
|
||||
select {}
|
||||
from geonames
|
||||
where feature_code = "ADM{}"
|
||||
'''.format(admin_level, ','.join(columns), admin_level)
|
||||
|
||||
conn.execute(admin_insert_statement)
|
||||
conn.commit()
|
||||
|
||||
logging.info('Done with admin level {}'.format(admin_level))
|
||||
|
||||
|
||||
def import_geonames_table(conn, table, f, batch_size=2000):
|
||||
# escape the brackets around the values format string so we can use later
|
||||
statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
|
||||
cursor = conn.cursor()
|
||||
for i, batch in enumerate(batch_iter(f, batch_size)):
|
||||
num_cols = len(batch[0])
|
||||
cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
|
||||
conn.commit()
|
||||
cursor = conn.cursor()
|
||||
logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
|
||||
cursor.close()
|
||||
|
||||
|
||||
def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
|
||||
conn = sqlite3.connect(db_file)
|
||||
logging.info('Created database at {}'.format(db_file))
|
||||
for url, directory, is_gzipped, filename in GEONAMES_FILES:
|
||||
table = geonames_file_table_map[(directory, filename)]
|
||||
create_table(conn, table)
|
||||
full_url = urlparse.urljoin(url, filename)
|
||||
dest_dir = os.path.join(temp_dir, directory)
|
||||
ensure_dir(dest_dir)
|
||||
dest_file = os.path.join(dest_dir, filename)
|
||||
download_file(full_url, dest_file)
|
||||
if is_gzipped:
|
||||
unzip_file(dest_file, dest_dir)
|
||||
filename = dest_file.replace('.zip', '.txt')
|
||||
reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
|
||||
lines = (map(safe_decode, line) for line in reader)
|
||||
import_geonames_table(conn, table, lines)
|
||||
logging.info('Creating countries tables')
|
||||
for statement in country_table_create_statements:
|
||||
conn.execute(statement)
|
||||
conn.commit()
|
||||
logging.info('Creating admin tables')
|
||||
for admin_level in xrange(1, 5):
|
||||
create_table(conn, 'admin{}_codes'.format(admin_level))
|
||||
populate_admin_table(conn, admin_level)
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Handle argument parsing here
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-t', '--temp-dir',
|
||||
default=tempfile.gettempdir(),
|
||||
help='Temporary work directory')
|
||||
parser.add_argument('-o', '--out',
|
||||
default=DEFAULT_GEONAMES_DB_PATH,
|
||||
help='SQLite3 db filename')
|
||||
args = parser.parse_args()
|
||||
create_geonames_sqlite_db(args.temp_dir, args.out)
|
||||
9
scripts/geodata/geonames/paths.py
Normal file
@@ -0,0 +1,9 @@
import os

this_dir = os.path.realpath(os.path.dirname(__file__))

GEONAMES_DB_NAME = 'geonames.db'

DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
                                        os.path.pardir, os.path.pardir,
                                        'data', 'geonames', GEONAMES_DB_NAME)
0
scripts/geodata/geoplanet/__init__.py
Normal file
1826
scripts/geodata/geoplanet/cleanup_geoplanet_db.sql
Normal file
File diff suppressed because it is too large
154
scripts/geodata/geoplanet/create_geoplanet_db.sh
Executable file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
: '
|
||||
create_geoplanet_db.sh
|
||||
-------------------------
|
||||
|
||||
Shell script to download Geo Planet and derive inputs
|
||||
for address parser training set construction.
|
||||
|
||||
Usage: ./create_geoplanet_db.sh out_dir
|
||||
'
|
||||
|
||||
if [ "$#" -ge 1 ]; then
|
||||
OUT_DIR=$1
|
||||
mkdir -p $OUT_DIR
|
||||
else
|
||||
OUT_DIR=$(pwd)
|
||||
fi
|
||||
|
||||
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
|
||||
# Internet Archive URL
|
||||
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
|
||||
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
|
||||
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
|
||||
GEOPLANET_ORIGINAL_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
|
||||
|
||||
GEOPLANET_ALL_PLACES_FILE="geoplanet_all_places.tsv"
|
||||
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
|
||||
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
|
||||
GEOPLANET_ALIASES_FILE="geoplanet_aliases.tsv"
|
||||
|
||||
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
|
||||
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
|
||||
|
||||
GEOPLANET_DB_FILE="geoplanet.db"
|
||||
|
||||
function download_file() {
|
||||
echo "Downloading $1"
|
||||
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
|
||||
if [ $response -ne "200" ]; then
|
||||
echo "Could not download $GEOPLANET_URL"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
|
||||
echo "Downloading GeoPlanet"
|
||||
download_file $GEOPLANET_URL $GEOPLANET_ZIP_FILE
|
||||
fi
|
||||
|
||||
cd $OUT_DIR
|
||||
echo "Unzipping GeoPlanet file"
|
||||
unzip -o $GEOPLANET_ZIP_FILE
|
||||
|
||||
echo "Creating GeoPlanet postal codes file"
|
||||
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
|
||||
|
||||
echo "Creating GeoPlanet all places file"
|
||||
tail -n+2 $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_ALL_PLACES_FILE
|
||||
|
||||
echo "Creating GeoPlanet places file"
|
||||
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Island" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
|
||||
|
||||
echo "Creating GeoPlanet aliases file"
|
||||
tail -n+2 $GEOPLANET_ORIGINAL_ALIASES_FILE > $GEOPLANET_ALIASES_FILE
|
||||
|
||||
echo "Fetching GeoNames concordance"
|
||||
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
|
||||
|
||||
echo "Creating SQLite db"
|
||||
|
||||
echo "
|
||||
DROP TABLE IF EXISTS places;
|
||||
CREATE TABLE places (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
name text,
|
||||
language text,
|
||||
place_type text,
|
||||
parent_id integer
|
||||
);
|
||||
|
||||
.separator \t
|
||||
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
|
||||
|
||||
CREATE INDEX places_parent_id_index on places(parent_id);
|
||||
CREATE INDEX places_country_code on places(country_code);
|
||||
|
||||
DROP TABLE IF EXISTS all_places;
|
||||
CREATE TABLE all_places AS SELECT * FROM places WHERE 0;
|
||||
.import $OUT_DIR/$GEOPLANET_ALL_PLACES_FILE all_places
|
||||
|
||||
DROP TABLE IF EXISTS postal_codes;
|
||||
CREATE TABLE postal_codes (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
name text,
|
||||
language text,
|
||||
place_type text,
|
||||
parent_id integer
|
||||
);
|
||||
|
||||
.import $OUT_DIR/$GEOPLANET_POSTAL_CODES_FILE postal_codes
|
||||
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
|
||||
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
|
||||
|
||||
DROP TABLE IF EXISTS admins;
|
||||
CREATE TABLE admins (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
state_id integer,
|
||||
county_id integer,
|
||||
local_admin_id integer,
|
||||
country_id integer,
|
||||
continent_id integer
|
||||
);
|
||||
|
||||
.import $OUT_DIR/$GEOPLANET_ADMINS_FILE admins
|
||||
|
||||
CREATE INDEX admin_country_code on admins(country_code);
|
||||
CREATE INDEX admin_state_id on admins(state_id);
|
||||
CREATE INDEX admin_county_id on admins(county_id);
|
||||
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
|
||||
CREATE INDEX admin_country_id on admins(country_id);
|
||||
CREATE INDEX admin_continent_id on admins(continent_id);
|
||||
|
||||
DROP TABLE IF EXISTS aliases;
|
||||
CREATE TABLE aliases (
|
||||
id integer,
|
||||
name text,
|
||||
name_type text,
|
||||
language text
|
||||
);
|
||||
|
||||
.import $OUT_DIR/$GEOPLANET_ALIASES_FILE aliases
|
||||
|
||||
CREATE INDEX alias_id on aliases(id);
|
||||
|
||||
DROP TABLE IF EXISTS geonames_concordance;
|
||||
CREATE TABLE geonames_concordance (
|
||||
id integer primary key,
|
||||
geonames_id integer,
|
||||
name text,
|
||||
lat number,
|
||||
lon number
|
||||
);
|
||||
|
||||
.mode csv
|
||||
.import $OUT_DIR/$GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
|
||||
|
||||
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
|
||||
|
||||
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE
|
||||
353
scripts/geodata/geoplanet/geoplanet_training_data.py
Normal file
@@ -0,0 +1,353 @@
|
||||
import argparse
|
||||
import csv
|
||||
import itertools
|
||||
import os
|
||||
import six
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.address_expansions.abbreviations import abbreviate
|
||||
from geodata.address_expansions.equivalence import equivalent
|
||||
from geodata.address_expansions.gazetteers import *
|
||||
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
|
||||
from geodata.countries.names import country_names
|
||||
from geodata.postal_codes.validation import postcode_regexes
|
||||
from geodata.names.normalization import name_affixes
|
||||
from geodata.places.config import place_config
|
||||
|
||||
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
||||
|
||||
GEOPLANET_DB_FILE = 'geoplanet.db'
|
||||
GEOPLANET_FORMAT_DATA_TAGGED_FILENAME = 'geoplanet_formatted_addresses_tagged.tsv'
|
||||
GEOPLANET_FORMAT_DATA_FILENAME = 'geoplanet_formatted_addresses.tsv'
|
||||
|
||||
|
||||
class GeoPlanetFormatter(object):
|
||||
# Map of GeoPlanet language codes to ISO-639 alpha2 language codes
|
||||
language_codes = {
|
||||
'ENG': 'en',
|
||||
'JPN': 'ja',
|
||||
'GER': 'de',
|
||||
'SPA': 'es',
|
||||
'FRE': 'fr',
|
||||
'UNK': 'unk',
|
||||
'ITA': 'it',
|
||||
'POR': 'pt',
|
||||
'POL': 'pl',
|
||||
'ARA': 'ar',
|
||||
'CZE': 'cs',
|
||||
'SWE': 'sv',
|
||||
'CHI': 'zh',
|
||||
'RUM': 'ro',
|
||||
'FIN': 'fi',
|
||||
'DUT': 'nl',
|
||||
'NOR': 'nb',
|
||||
'DAN': 'da',
|
||||
'HUN': 'hu',
|
||||
'KOR': 'kr',
|
||||
}
|
||||
|
||||
non_latin_script_languages = {
|
||||
'JPN', # Japanese
|
||||
'ARA', # Arabic
|
||||
'CHI', # Chinese
|
||||
'KOR', # Korean
|
||||
}
|
||||
|
||||
ALIAS_PREFERRED = 'P'
|
||||
ALIAS_PREFERRED_FOREIGN = 'Q'
|
||||
ALIAS_VARIANT = 'V'
|
||||
ALIAS_ABBREVIATED = 'A'
|
||||
ALIAS_COLLOQUIAL = 'S'
|
||||
|
||||
# Map of GeoPlanet place types to address formatter types
|
||||
place_types = {
|
||||
'Continent': AddressFormatter.WORLD_REGION,
|
||||
'Country': AddressFormatter.COUNTRY,
|
||||
'CountryRegion': AddressFormatter.COUNTRY_REGION,
|
||||
'State': AddressFormatter.STATE,
|
||||
'County': AddressFormatter.STATE_DISTRICT,
|
||||
'Island': AddressFormatter.ISLAND,
|
||||
'Town': AddressFormatter.CITY,
|
||||
# Note: if we do general place queries from GeoPlanet, this
|
||||
# may have to be mapped more carefully
|
||||
'LocalAdmin': AddressFormatter.CITY_DISTRICT,
|
||||
'Suburb': AddressFormatter.SUBURB,
|
||||
}
|
||||
|
||||
def __init__(self, geoplanet_db):
|
||||
self.db = sqlite3.connect(geoplanet_db)
|
||||
|
||||
# These aren't too large and it's easier to have them in memory
|
||||
self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
|
||||
self.aliases = defaultdict(list)
|
||||
|
||||
self.coterminous_admins = {}
|
||||
self.admins_with_ambiguous_city = set()
|
||||
|
||||
print('Doing admin ambiguities')
|
||||
for row in self.db.execute('''select p.id,
|
||||
(select count(*) from places where parent_id = p.id) as num_places,
|
||||
(select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
|
||||
p2.id
|
||||
from places p
|
||||
join places p2
|
||||
on p2.parent_id = p.id
|
||||
and p.name = p2.name
|
||||
and p.place_type != "Town"
|
||||
and p2.place_type = "Town"
|
||||
group by p.id'''):
|
||||
place_id, num_places, num_towns, coterminous_town_id = row
|
||||
num_places = int(num_places)
|
||||
num_towns = int(num_towns)
|
||||
|
||||
if num_places == 1 and num_towns == 1:
|
||||
self.coterminous_admins[place_id] = coterminous_town_id
|
||||
self.admins_with_ambiguous_city.add(place_id)
|
||||
|
||||
print('num coterminous: {}'.format(len(self.coterminous_admins)))
|
||||
print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))
|
||||
|
||||
print('Doing aliases')
|
||||
for row in self.db.execute('''select a.* from aliases a
|
||||
left join places p
|
||||
on a.id = p.id
|
||||
and p.place_type in ("State", "County")
|
||||
and a.language != p.language
|
||||
where name_type != "S" -- no colloquial aliases like "The Big Apple"
|
||||
and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
|
||||
and p.id is NULL -- exclude foreign-language states/county names
|
||||
order by id, language,
|
||||
case name_type
|
||||
when "P" then 1
|
||||
when "Q" then 2
|
||||
when "V" then 3
|
||||
when "A" then 4
|
||||
when "S" then 5
|
||||
else 6
|
||||
end'''):
|
||||
place = self.places.get(row[0])
|
||||
if not place:
|
||||
continue
|
||||
|
||||
self.aliases[row[0]].append(row[1:])
|
||||
|
||||
print('Doing variant aliases')
|
||||
variant_aliases = 0
|
||||
for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
|
||||
join places p using(id)
|
||||
where a.name_type = "V"
|
||||
and a.language = p.language''')):
|
||||
place_name, country_code = row[-2:]
|
||||
country = country_code.lower()
|
||||
|
||||
row = row[:-2]
|
||||
place_id, alias, name_type, language = row
|
||||
|
||||
language = self.language_codes[language]
|
||||
if language != 'unk':
|
||||
alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
|
||||
if alias_sans_affixes:
|
||||
alias = alias_sans_affixes
|
||||
|
||||
place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
|
||||
if place_name_sans_affixes:
|
||||
place_name = place_name_sans_affixes
|
||||
else:
|
||||
language = None
|
||||
|
||||
if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
|
||||
self.aliases[row[0]].append(row[1:])
|
||||
variant_aliases += 1
|
||||
|
||||
if i % 10000 == 0 and i > 0:
|
||||
print('tested {} variant aliases with {} positives'.format(i, variant_aliases))
|
||||
|
||||
self.aliases = dict(self.aliases)
|
||||
|
||||
self.formatter = AddressFormatter()
|
||||
|
||||
def get_place_hierarchy(self, place_id):
|
||||
all_places = []
|
||||
original_place_id = place_id
|
||||
place = self.places[place_id]
|
||||
all_places.append((place_id, ) + place)
|
||||
place_id = place[-1]
|
||||
while place_id != 1 and place_id != original_place_id:
|
||||
place = self.places[place_id]
|
||||
all_places.append((place_id,) + place)
|
||||
place_id = place[-1]
|
||||
return all_places
|
||||
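# Illustrative sketch of the return shape (hypothetical ids, not real data): rows from
# the places table, child first, following parent_id links up to the root (id 1), e.g.
#
#   [(town_id,   country, name, lang, 'Town',   county_id),
#    (county_id, country, name, lang, 'County', state_id),
#    ...]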
|
||||
def get_aliases(self, place_id):
|
||||
return self.aliases.get(place_id, [])
|
||||
|
||||
def cleanup_name(self, name):
|
||||
return name.strip(' ,-')
|
||||
|
||||
def format_postal_codes(self, tag_components=True):
|
||||
all_postal_codes = self.db.execute('select * from postal_codes')
|
||||
for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
|
||||
country = country.lower()
|
||||
postcode_language = language
|
||||
|
||||
language = self.language_codes[language]
|
||||
|
||||
if len(postal_code) <= 3:
|
||||
postcode_regex = postcode_regexes.get(country)
|
||||
|
||||
valid_postcode = False
|
||||
if postcode_regex:
|
||||
match = postcode_regex.match(postal_code)
|
||||
if match and match.end() == len(postal_code):
|
||||
valid_postcode = True
|
||||
|
||||
if not valid_postcode:
|
||||
continue
|
||||
|
||||
# If the county/state is coterminous with a city and contains only one place,
|
||||
# set the parent_id to the city instead
|
||||
if parent_id in self.coterminous_admins:
|
||||
parent_id = self.coterminous_admins[parent_id]
|
||||
|
||||
place_hierarchy = self.get_place_hierarchy(parent_id)
|
||||
|
||||
containing_places = defaultdict(set)
|
||||
|
||||
language_places = {None: containing_places}
|
||||
|
||||
original_language = language
|
||||
|
||||
have_default_language = False
|
||||
|
||||
if place_hierarchy:
|
||||
base_place_id, _, _, _, base_place_type, _ = place_hierarchy[0]
|
||||
base_place_type = self.place_types[base_place_type]
|
||||
else:
|
||||
base_place_id = None
|
||||
base_place_type = None
|
||||
|
||||
place_types_seen = set()
|
||||
|
||||
for place_id, country, name, lang, place_type, parent in place_hierarchy:
|
||||
country = country.lower()
|
||||
|
||||
# First language
|
||||
if not have_default_language and lang != postcode_language:
|
||||
language = self.language_codes[lang]
|
||||
have_default_language = True
|
||||
|
||||
place_type = self.place_types[place_type]
|
||||
if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
|
||||
continue
|
||||
|
||||
name = self.cleanup_name(name)
|
||||
containing_places[place_type].add(name)
|
||||
|
||||
aliases = self.get_aliases(place_id)
|
||||
for name, name_type, alias_lang in aliases:
|
||||
if not alias_lang:
|
||||
alias_lang = 'UNK'
|
||||
if alias_lang == lang and lang != 'UNK':
|
||||
alias_language = None
|
||||
else:
|
||||
alias_language = self.language_codes[alias_lang]
|
||||
|
||||
language_places.setdefault(alias_language, defaultdict(set))
|
||||
lang_places = language_places[alias_language]
|
||||
|
||||
name = self.cleanup_name(name)
|
||||
|
||||
lang_places[place_type].add(name)
|
||||
|
||||
place_types_seen.add(place_type)
|
||||
|
||||
default_city_names = set([name.lower() for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])])
|
||||
|
||||
for language, containing_places in six.iteritems(language_places):
|
||||
if language is None:
|
||||
language = original_language
|
||||
|
||||
country_localized_name = country_names.localized_name(country, language)
|
||||
if country_localized_name:
|
||||
containing_places[AddressFormatter.COUNTRY].add(country_localized_name)
|
||||
country_alpha3_code = country_names.alpha3_code(country)
|
||||
if country_alpha3_code and language in (None, 'ENG'):
|
||||
containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)
|
||||
|
||||
keys = containing_places.keys()
|
||||
all_values = containing_places.values()
|
||||
|
||||
keys_set = set(keys)
|
||||
|
||||
for i, values in enumerate(itertools.product(*all_values)):
|
||||
components = {
|
||||
AddressFormatter.POSTCODE: postal_code
|
||||
}
|
||||
|
||||
if not default_city_names:
|
||||
components.update(zip(keys, values))
|
||||
else:
|
||||
for k, v in zip(keys, values):
|
||||
if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
|
||||
components[k] = v
|
||||
|
||||
format_language = language if self.formatter.template_language_matters(country, language) else None
|
||||
formatted = self.formatter.format_address(components, country, language=format_language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
|
||||
yield (language, country, formatted)
|
||||
|
||||
component_keys = set(components)
|
||||
components = place_config.dropout_components(components, (), country=country, population=0)
|
||||
|
||||
if len(components) > 1 and set(components) ^ component_keys:
|
||||
formatted = self.formatter.format_address(components, country, language=format_language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
yield (language, country, formatted)
|
||||
|
||||
def build_training_data(self, out_dir, tag_components=True):
|
||||
if tag_components:
|
||||
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
||||
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||
else:
|
||||
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_FILENAME), 'w')
|
||||
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||
|
||||
i = 0
|
||||
|
||||
for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
formatted_address = tsv_string(formatted_address)
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
if tag_components:
|
||||
row = (language, country, formatted_address)
|
||||
else:
|
||||
row = (formatted_address,)
|
||||
|
||||
writer.writerow(row)
|
||||
i += 1
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print('did {} formatted addresses'.format(i))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 3:
|
||||
sys.exit('Usage: python geoplanet_training_data.py geoplanet_db_path out_dir')
|
||||
|
||||
geoplanet_db_path = sys.argv[1]
|
||||
out_dir = sys.argv[2]
|
||||
|
||||
geoplanet = GeoPlanetFormatter(geoplanet_db_path)
|
||||
geoplanet.build_training_data(out_dir)
|
||||
0
scripts/geodata/graph/__init__.py
Normal file
0
scripts/geodata/graph/__init__.py
Normal file
41
scripts/geodata/graph/scc.py
Normal file
41
scripts/geodata/graph/scc.py
Normal file
@@ -0,0 +1,41 @@
|
||||
VISIT, VISIT_EDGE, POST_VISIT = range(3)
|
||||
|
||||
|
||||
def strongly_connected_components(graph):
|
||||
'''
|
||||
Find strongly connected components in a graph using iterative
|
||||
depth-first search.
|
||||
|
||||
Based on:
|
||||
http://code.activestate.com/recipes/578507-strongly-connected-components-of-a-directed-graph/
|
||||
'''
|
||||
identified = set()
|
||||
stack = []
|
||||
index = {}
|
||||
boundaries = []
|
||||
|
||||
for v in graph:
|
||||
if v not in index:
|
||||
todo = [(VISIT, v)]
|
||||
while todo:
|
||||
op, v = todo.pop()
|
||||
if op == VISIT:
|
||||
index[v] = len(stack)
|
||||
stack.append(v)
|
||||
boundaries.append(index[v])
|
||||
todo.append((POST_VISIT, v))
|
||||
todo.extend([(VISIT_EDGE, w) for w in graph[v]])
|
||||
elif op == VISIT_EDGE:
|
||||
if v not in index:
|
||||
todo.append((VISIT, v))
|
||||
elif v not in identified:
|
||||
while index[v] < boundaries[-1]:
|
||||
boundaries.pop()
|
||||
else:
|
||||
# op == POST_VISIT
|
||||
if boundaries[-1] == index[v]:
|
||||
boundaries.pop()
|
||||
scc = stack[index[v]:]
|
||||
del stack[index[v]:]
|
||||
identified.update(scc)
|
||||
yield scc
|
||||
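# A minimal usage sketch for strongly_connected_components (hypothetical graph, not
# part of this module); the generator yields each component as a list of nodes:
#
#   graph = {1: [2], 2: [3], 3: [1], 4: [3]}
#   print list(strongly_connected_components(graph))
#   # -> [[1, 2, 3], [4]]  (the 1->2->3 cycle, then node 4, which is its own component)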
32
scripts/geodata/graph/topsort.py
Normal file
32
scripts/geodata/graph/topsort.py
Normal file
@@ -0,0 +1,32 @@
|
||||
|
||||
def topsort(graph):
|
||||
'''
|
||||
Topological sort for a dependency graph.
|
||||
|
||||
Usage:
|
||||
|
||||
>>> graph = {
|
||||
'a': ['b'],
|
||||
'b': ['d'],
|
||||
'c': ['d', 'a'],
|
||||
'd': [],
|
||||
}
|
||||
>>> topsort(graph)
|
||||
|
||||
Returns: ['d', 'b', 'a', 'c']
|
||||
|
||||
'''
|
||||
todos = set(graph.keys())
|
||||
seen = set()
|
||||
result = []
|
||||
while todos:
|
||||
for key in todos:
|
||||
deps = graph[key]
|
||||
if len([d for d in deps if d in seen]) == len(deps):
|
||||
break
|
||||
else:
|
||||
raise Exception('Cycle: {}'.format(todos))
|
||||
todos.remove(key)
|
||||
result.append(key)
|
||||
seen.add(key)
|
||||
return result
|
||||
0
scripts/geodata/i18n/__init__.py
Normal file
0
scripts/geodata/i18n/__init__.py
Normal file
139
scripts/geodata/i18n/cldr_languages.py
Normal file
139
scripts/geodata/i18n/cldr_languages.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import requests
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from cStringIO import StringIO
|
||||
from lxml import etree
|
||||
|
||||
from unicode_paths import CLDR_DIR
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'language', 'countries')
|
||||
|
||||
CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
|
||||
'supplementalData.xml')
|
||||
|
||||
ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
|
||||
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
|
||||
|
||||
ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
|
||||
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
|
||||
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
|
||||
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'
|
||||
|
||||
REGIONAL = 'official_regional'
|
||||
UNKNOWN_COUNTRY = 'zz'
|
||||
UNKNOWN_LANGUAGES = ('und', 'zxx')
|
||||
|
||||
|
||||
def write_country_official_languages_file(xml, out_dir):
|
||||
lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
|
||||
lang_writer = csv.writer(lang_file, delimiter='\t')
|
||||
|
||||
def get_population_pct(lang):
|
||||
return int(lang.attrib.get('populationPercent', 0))
|
||||
|
||||
lang_scripts = {}
|
||||
for lang in xml.xpath('//languageData/language'):
|
||||
language_code = lang.attrib['type'].lower()
|
||||
scripts = lang.get('scripts')
|
||||
if not scripts:
|
||||
continue
|
||||
territories = lang.get('territories')
|
||||
if (language_code, None) not in lang_scripts:
|
||||
lang_scripts[(language_code, None)] = scripts
|
||||
|
||||
if not territories:
|
||||
continue
|
||||
for territory in territories.strip().split():
|
||||
lang_scripts[(language_code, territory.lower())] = scripts
|
||||
|
||||
for territory in xml.xpath('//territoryInfo/territory'):
|
||||
country_code = territory.attrib['type'].lower()
|
||||
if country_code == UNKNOWN_COUNTRY:
|
||||
continue
|
||||
langs = territory.xpath('languagePopulation')
|
||||
languages = Counter()
|
||||
official = set()
|
||||
regional = set()
|
||||
for lang in langs:
|
||||
language = lang.attrib['type'].lower().split('_')[0]
|
||||
official_status = lang.attrib.get('officialStatus')
|
||||
languages[language] += float(lang.attrib['populationPercent'])
|
||||
if official_status and official_status != REGIONAL:
|
||||
official.add(language)
|
||||
elif official_status == REGIONAL:
|
||||
regional.add(language)
|
||||
|
||||
if official:
|
||||
languages = Counter({l: c for l, c in languages.iteritems()
|
||||
if l in official or l in regional})
|
||||
else:
|
||||
languages = Counter({l: c for l, c in languages.most_common(1)})
|
||||
|
||||
for lang, pct in languages.most_common():
|
||||
if lang in UNKNOWN_LANGUAGES:
|
||||
continue
|
||||
|
||||
script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))
|
||||
|
||||
lang_writer.writerow((country_code, lang, script.replace(' ', ','),
|
||||
str(min(pct, 100.0)), str(int(lang in official))))
|
||||
|
||||
RETIRED = 'R'
|
||||
INDIVIDUAL = 'I'
|
||||
MACRO = 'M'
|
||||
LIVING = 'L'
|
||||
|
||||
|
||||
def write_languages_file(langs, macro, out_dir):
|
||||
lang_file = open(os.path.join(out_dir, 'iso_languages.tsv'), 'w')
|
||||
writer = csv.writer(lang_file, delimiter='\t')
|
||||
writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
|
||||
'ISO 639-1', 'type', 'macro'))
|
||||
|
||||
macro_reader = csv.reader(StringIO(macro), delimiter='\t')
|
||||
headers = macro_reader.next()
|
||||
assert len(headers) == 3
|
||||
macros = {minor_code: macro_code for (macro_code, minor_code, status)
|
||||
in macro_reader if status != RETIRED}
|
||||
|
||||
lang_reader = csv.reader(StringIO(langs), delimiter='\t')
|
||||
headers = lang_reader.next()
|
||||
assert headers[:6] == ['Id', 'Part2B', 'Part2T',
|
||||
'Part1', 'Scope', 'Language_Type']
|
||||
|
||||
for line in lang_reader:
|
||||
iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
|
||||
macro = macros.get(iso639_3, '')
|
||||
# Only living languages that are either individual or macro
|
||||
if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
|
||||
writer.writerow((iso639_3, iso639_2b, iso639_2t,
|
||||
iso639_1, scope, macro))
|
||||
|
||||
|
||||
def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
|
||||
response = requests.get(ISO_639_3)
|
||||
langs = response.content
|
||||
|
||||
response = requests.get(ISO_MACROLANGUAGES)
|
||||
macro = response.content
|
||||
write_languages_file(langs, macro, out_dir)
|
||||
|
||||
supplemental = open(CLDR_SUPPLEMENTAL_DATA)
|
||||
xml = etree.parse(supplemental)
|
||||
write_country_official_languages_file(xml, out_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-o', '--out',
|
||||
default=DEFAULT_LANGUAGES_DIR,
|
||||
help='Out directory')
|
||||
args = parser.parse_args()
|
||||
|
||||
fetch_cldr_languages(args.out)
|
||||
30
scripts/geodata/i18n/download_cldr.py
Normal file
30
scripts/geodata/i18n/download_cldr.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from unicode_paths import CLDR_DIR
|
||||
from geodata.file_utils import ensure_dir
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'
|
||||
|
||||
|
||||
def download_cldr(temp_dir=None):
|
||||
if os.path.exists(CLDR_DIR):
|
||||
shutil.rmtree(CLDR_DIR)
|
||||
ensure_dir(CLDR_DIR)
|
||||
|
||||
if not temp_dir:
|
||||
temp_dir = tempfile.gettempdir()
|
||||
|
||||
cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])
|
||||
|
||||
subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
|
||||
subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
|
||||
|
||||
if __name__ == '__main__':
|
||||
download_cldr(*sys.argv[1:])
|
||||
37
scripts/geodata/i18n/google.py
Normal file
37
scripts/geodata/i18n/google.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
import requests
|
||||
import six.moves.urllib_parse as urlparse
|
||||
import ujson
|
||||
|
||||
requests.models.json = ujson
|
||||
|
||||
|
||||
GOOGLE_I18N_API = 'http://i18napis.appspot.com'
|
||||
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')
|
||||
|
||||
|
||||
class GoogleI18N(object):
|
||||
'''
|
||||
Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
|
||||
and caches it in a dictionary for each country. These requests are
|
||||
lightweight, so for a given run of a program at most ~250 requests
|
||||
will be made.
|
||||
'''
|
||||
def __init__(self):
|
||||
self.responses = {}
|
||||
|
||||
def get(self, country_code):
|
||||
ret = self.responses.get(country_code.lower())
|
||||
|
||||
if ret is None:
|
||||
url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
|
||||
response = requests.get(url)
|
||||
if response.ok:
|
||||
ret = response.json()
|
||||
self.responses[country_code.lower()] = ret
|
||||
else:
|
||||
self.responses[country_code.lower()] = {}
|
||||
return ret
|
||||
|
||||
|
||||
google_i18n = GoogleI18N()
|
||||
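# A minimal usage sketch (makes a network call; the 'fmt' key below is illustrative of
# what the address data API may return, not guaranteed):
#
#   data = google_i18n.get('gb')
#   if data:
#       print data.get('fmt')  # country-level address format template, if present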
86
scripts/geodata/i18n/languages.py
Normal file
86
scripts/geodata/i18n/languages.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import os
|
||||
import csv
|
||||
import sys
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.csv_utils import unicode_csv_reader
|
||||
|
||||
LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'language')
|
||||
|
||||
country_languages = defaultdict(OrderedDict)
|
||||
# Only official and de facto official, no official_regional
|
||||
official_languages = defaultdict(OrderedDict)
|
||||
|
||||
regional_languages = defaultdict(OrderedDict)
|
||||
road_language_overrides = defaultdict(OrderedDict)
|
||||
|
||||
languages = set()
|
||||
all_languages = languages
|
||||
|
||||
osm_admin1_ids = set()
|
||||
|
||||
languages_initialized = False
|
||||
|
||||
|
||||
def init_languages(languages_dir=LANGUAGES_DIR):
|
||||
global languages_initialized
|
||||
if languages_initialized:
|
||||
return
|
||||
path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
|
||||
if not os.path.exists(path):
|
||||
raise ValueError('File does not exist: {}'.format(path))
|
||||
|
||||
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
|
||||
country_languages[country][lang] = int(is_official)
|
||||
languages.add(lang)
|
||||
|
||||
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
|
||||
if int(is_official) or len(country_languages[country]) == 1:
|
||||
official_languages[country][lang] = 1
|
||||
|
||||
path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
|
||||
for country, lang, default in csv.reader(open(path), delimiter='\t'):
|
||||
road_language_overrides[country][lang] = int(default)
|
||||
if lang not in languages:
|
||||
languages.add(lang)
|
||||
|
||||
path = os.path.join(languages_dir, 'regional', 'adm1.tsv')
|
||||
|
||||
for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
|
||||
if key == 'osm':
|
||||
osm_admin1_ids.add(tuple(value.split(':')))
|
||||
for lang in langs.split(','):
|
||||
regional_languages[(country, key, value)][lang] = int(default)
|
||||
if lang not in country_languages[country]:
|
||||
country_languages[country][lang] = 0
|
||||
if lang not in languages:
|
||||
languages.add(lang)
|
||||
|
||||
languages_initialized = True
|
||||
|
||||
|
||||
init_languages()
|
||||
|
||||
|
||||
def get_country_languages(country, official=True, overrides=True):
|
||||
if official:
|
||||
languages = official_languages[country]
|
||||
else:
|
||||
languages = country_languages[country]
|
||||
|
||||
if overrides:
|
||||
road_overrides = road_language_overrides.get(country)
|
||||
if road_overrides and road_overrides.values()[0]:
|
||||
languages = road_overrides
|
||||
elif road_overrides:
|
||||
languages.update(road_overrides)
|
||||
return languages
|
||||
|
||||
|
||||
def get_regional_languages(country, key, value):
|
||||
return regional_languages.get((country, key, value), OrderedDict())
|
||||
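# A minimal usage sketch (illustrative; the actual contents come from the TSV resources
# loaded by init_languages above):
#
#   for lang, is_default in get_country_languages('fr').iteritems():
#       print lang, is_default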
5
scripts/geodata/i18n/normalize.py
Normal file
5
scripts/geodata/i18n/normalize.py
Normal file
@@ -0,0 +1,5 @@
|
||||
import unicodedata
|
||||
|
||||
|
||||
def strip_accents(s):
|
||||
return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
|
||||
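# Example: decompose to NFD and drop combining marks (category Mn), e.g.
#
#   strip_accents(u'Münster')  # -> u'Munster'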
37
scripts/geodata/i18n/scanner.py
Normal file
37
scripts/geodata/i18n/scanner.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
class Scanner(object):
|
||||
'''
|
||||
Simple scanner implementation in Python using regular expression groups.
|
||||
Used to create dynamic lexicons for parsing various CLDR files
|
||||
without compiling a C scanner. Only C scanners are used at runtime.
|
||||
'''
|
||||
|
||||
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
|
||||
self.lexicon = lexicon
|
||||
|
||||
regexes, responses = zip(*lexicon)
|
||||
|
||||
self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
|
||||
self.responses = responses
|
||||
|
||||
def scan(self, s):
|
||||
|
||||
for match in self.regex.finditer(safe_decode(s)):
|
||||
i = match.lastindex
|
||||
response = self.responses[i - 1]
|
||||
token = match.group(i)
|
||||
if not callable(response):
|
||||
yield (token, response)
|
||||
else:
|
||||
responses = response(match, token)
|
||||
if responses is not None:
|
||||
for response, token in responses:
|
||||
yield (token, response)
|
||||
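# A minimal usage sketch with a hypothetical two-pattern lexicon: each lexicon entry is
# a (regex, response) pair, and scan() yields (token, response) pairs in input order.
#
#   scanner = Scanner([('[0-9]+', 'number'), ('[a-z]+', 'word')])
#   print list(scanner.scan('abc 123'))
#   # -> [(u'abc', 'word'), (u'123', 'number')]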
1680
scripts/geodata/i18n/transliteration_rules.py
Normal file
1680
scripts/geodata/i18n/transliteration_rules.py
Normal file
File diff suppressed because one or more lines are too long
273
scripts/geodata/i18n/unicode_data.py
Normal file
273
scripts/geodata/i18n/unicode_data.py
Normal file
@@ -0,0 +1,273 @@
|
||||
'''
|
||||
unicode_data.py
|
||||
---------------
|
||||
|
||||
Python's unicodedata module uses an outdated spec (Unicode 5.2) and since
|
||||
e.g. unicode categories are used in tokenization, we'd like to keep this
|
||||
as up-to-date as possible with the latest standard.
|
||||
'''
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict, namedtuple
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.file_utils import download_file
|
||||
from geodata.string_utils import wide_unichr, wide_ord
|
||||
|
||||
from unicode_properties import *
|
||||
|
||||
from unicode_paths import UNICODE_DATA_DIR
|
||||
|
||||
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
|
||||
|
||||
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
|
||||
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
|
||||
|
||||
unicode_categories = defaultdict(list)
|
||||
unicode_blocks = defaultdict(list)
|
||||
unicode_combining_classes = defaultdict(list)
|
||||
unicode_general_categories = defaultdict(list)
|
||||
unicode_scripts = defaultdict(list)
|
||||
unicode_properties = {}
|
||||
|
||||
unicode_script_ids = {}
|
||||
|
||||
unicode_blocks = {}
|
||||
unicode_category_aliases = {}
|
||||
unicode_property_aliases = {}
|
||||
unicode_property_value_aliases = {}
|
||||
unicode_word_breaks = {}
|
||||
|
||||
|
||||
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
|
||||
UNIDATA_FIELDS = [
|
||||
'code',
|
||||
'name',
|
||||
'category',
|
||||
'combining',
|
||||
'bidi_category',
|
||||
'decomp_mapping',
|
||||
'decimal_value',
|
||||
'digit_value',
|
||||
'numeric_value',
|
||||
'mirrored',
|
||||
'unicode_1_name',
|
||||
'comment',
|
||||
'upper_mapping',
|
||||
'lower_mapping',
|
||||
'title_mapping',
|
||||
]
|
||||
|
||||
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
|
||||
|
||||
|
||||
def parse_unicode_data():
|
||||
'''
|
||||
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
|
||||
'''
|
||||
if not os.path.exists(LOCAL_UNIDATA_FILE):
|
||||
download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
|
||||
unidata_file = open(LOCAL_UNIDATA_FILE)
|
||||
|
||||
for line in csv.reader(unidata_file, delimiter=';'):
|
||||
yield UnicodeDataRow(*line)
|
||||
|
||||
|
||||
def iter_unicode_combining_classes():
|
||||
return unicode_combining_classes.iteritems()
|
||||
|
||||
|
||||
def iter_unicode_categories():
|
||||
return unicode_categories.iteritems()
|
||||
|
||||
|
||||
def get_unicode_category(cat):
|
||||
return unicode_categories[cat]
|
||||
|
||||
|
||||
def get_unicode_combining_class(c):
|
||||
return unicode_combining_classes[c]
|
||||
|
||||
|
||||
def get_unicode_categories():
|
||||
'''
|
||||
Build dict of unicode categories e.g.
|
||||
|
||||
{
|
||||
'Lu': ['A', 'B', 'C', ...],
|
||||
'Ll': ['a', 'b', 'c', ...]
|
||||
}
|
||||
'''
|
||||
categories = defaultdict(list)
|
||||
for row in parse_unicode_data():
|
||||
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
|
||||
return dict(categories)
|
||||
|
||||
|
||||
def get_unicode_combining_classes():
|
||||
'''
|
||||
Build dict of unicode combining classes e.g.
|
||||
|
||||
{
|
||||
'0': ['\x00', '\x01', '\x02', ...]
|
||||
}
|
||||
'''
|
||||
combining_classes = defaultdict(list)
|
||||
for row in parse_unicode_data():
|
||||
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
|
||||
return dict(combining_classes)
|
||||
|
||||
unicode_category_aliases = {
|
||||
'letter': 'L',
|
||||
'lower': 'Ll',
|
||||
'lowercase': 'Ll',
|
||||
'lowercaseletter': 'Ll',
|
||||
'upper': 'Lu',
|
||||
'uppercase': 'Lu',
|
||||
'uppercaseletter': 'Lu',
|
||||
'title': 'Lt',
|
||||
'nonspacing mark': 'Mn',
|
||||
'mark': 'M',
|
||||
}
|
||||
|
||||
COMBINING_CLASS_PROP = 'canonical_combining_class'
|
||||
BLOCK_PROP = 'block'
|
||||
GENERAL_CATEGORY_PROP = 'general_category'
|
||||
SCRIPT_PROP = 'script'
|
||||
WORD_BREAK_PROP = 'word_break'
|
||||
|
||||
|
||||
def init_unicode_categories():
|
||||
'''
|
||||
Initialize module-level dictionaries
|
||||
'''
|
||||
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
|
||||
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
|
||||
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
|
||||
|
||||
unicode_categories.update(get_unicode_categories())
|
||||
unicode_combining_classes.update(get_unicode_combining_classes())
|
||||
|
||||
for key in unicode_categories.keys():
|
||||
unicode_general_categories[key[0]].extend(unicode_categories[key])
|
||||
|
||||
script_chars = get_chars_by_script()
|
||||
for i, script in enumerate(script_chars):
|
||||
if script:
|
||||
unicode_scripts[script.lower()].append(wide_unichr(i))
|
||||
|
||||
unicode_scripts = dict(unicode_scripts)
|
||||
|
||||
unicode_script_ids.update(build_master_scripts_list(script_chars))
|
||||
|
||||
unicode_blocks.update(get_unicode_blocks())
|
||||
unicode_properties.update(get_unicode_properties())
|
||||
unicode_property_aliases.update(get_property_aliases())
|
||||
|
||||
unicode_word_breaks.update(get_word_break_properties())
|
||||
|
||||
for key, value in get_property_value_aliases().iteritems():
|
||||
key = unicode_property_aliases.get(key, key)
|
||||
if key == GENERAL_CATEGORY_PROP:
|
||||
for k, v in value.iteritems():
|
||||
k = k.lower()
|
||||
unicode_category_aliases[k] = v
|
||||
if '_' in k:
|
||||
unicode_category_aliases[k.replace('_', '')] = v
|
||||
|
||||
unicode_property_value_aliases[key] = value
|
||||
|
||||
|
||||
regex_chars = re.compile('([\[\]\{\}\-\^])')
|
||||
|
||||
|
||||
def replace_regex_chars(s):
|
||||
return regex_chars.sub(r'\\\1', s)
|
||||
|
||||
|
||||
def format_regex_char(i):
|
||||
c = wide_unichr(i)
|
||||
return replace_regex_chars(c.encode('unicode-escape'))
|
||||
|
||||
|
||||
def make_char_set_regex(chars):
|
||||
'''
|
||||
Build a regex character set from a list of characters
|
||||
'''
|
||||
group_start = None
|
||||
group_end = None
|
||||
last_ord = -2
|
||||
|
||||
ords = map(wide_ord, chars)
|
||||
ords.sort()
|
||||
|
||||
ords.append(None)
|
||||
|
||||
groups = []
|
||||
|
||||
for i, o in enumerate(ords):
|
||||
if o is not None and o == last_ord + 1:
|
||||
group_end = o
|
||||
elif group_start is not None and group_end is not None:
|
||||
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
|
||||
group_end = None
|
||||
group_start = o
|
||||
elif group_start is not None and group_end is None:
|
||||
groups.append(format_regex_char(group_start))
|
||||
group_start = o
|
||||
else:
|
||||
group_start = o
|
||||
|
||||
last_ord = o
|
||||
|
||||
return u'[{}]'.format(u''.join(groups))
|
||||
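# Example (assuming plain ASCII input for brevity): consecutive codepoints are collapsed
# into ranges, e.g.
#
#   make_char_set_regex([u'a', u'b', u'c', u'x'])  # -> u'[a-cx]'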
|
||||
|
||||
name_category = [
|
||||
('control_chars', 'Cc'),
|
||||
('other_format_chars', 'Cf'),
|
||||
('other_not_assigned_chars', 'Cn'),
|
||||
('other_private_use_chars', 'Co'),
|
||||
('other_surrogate_chars', 'Cs'),
|
||||
('letter_lower_chars', 'Ll'),
|
||||
('letter_modifier_chars', 'Lm'),
|
||||
('letter_other_chars', 'Lo'),
|
||||
('letter_title_chars', 'Lt'),
|
||||
('letter_upper_chars', 'Lu'),
|
||||
('mark_spacing_combining_chars', 'Mc'),
|
||||
('mark_enclosing_chars', 'Me'),
|
||||
('mark_nonspacing_chars', 'Mn'),
|
||||
('number_or_digit_chars', 'Nd'),
|
||||
('number_letter_chars', 'Nl'),
|
||||
('number_other_chars', 'No'),
|
||||
('punct_connector_chars', 'Pc'),
|
||||
('punct_dash_chars', 'Pd'),
|
||||
('punct_close_chars', 'Pe'),
|
||||
('punct_final_quote_chars', 'Pf'),
|
||||
('punct_initial_quote_chars', 'Pi'),
|
||||
('punct_other_chars', 'Po'),
|
||||
('punct_open_chars', 'Ps'),
|
||||
('currency_symbol_chars', 'Sc'),
|
||||
('symbol_modifier_chars', 'Sk'),
|
||||
('symbol_math_chars', 'Sm'),
|
||||
('symbol_other_chars', 'So'),
|
||||
('separator_line_chars', 'Zl'),
|
||||
('separator_paragraph_chars', 'Zp'),
|
||||
('space', 'Zs'),
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
init_unicode_categories()
|
||||
for name, cat in name_category:
|
||||
if cat not in unicode_categories:
|
||||
continue
|
||||
chars = unicode_categories[cat]
|
||||
print u'{} = {};'.format(name, make_char_set_regex(chars))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
11
scripts/geodata/i18n/unicode_paths.py
Normal file
11
scripts/geodata/i18n/unicode_paths.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')
|
||||
|
||||
UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')
|
||||
|
||||
CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')
|
||||
463
scripts/geodata/i18n/unicode_properties.py
Normal file
463
scripts/geodata/i18n/unicode_properties.py
Normal file
@@ -0,0 +1,463 @@
|
||||
'''
|
||||
unicode_properties.py
|
||||
|
||||
This code uses the latest copy of Scripts.txt from unicode.org
|
||||
to generate a C file (and header) defining which script every character
|
||||
belongs to.
|
||||
'''
|
||||
|
||||
import csv
|
||||
import os
|
||||
import requests
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
import requests
|
||||
import subprocess
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from collections import OrderedDict, defaultdict
|
||||
from itertools import islice
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from operator import itemgetter
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_encode, safe_decode
|
||||
from geodata.file_utils import ensure_dir, download_file
|
||||
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
|
||||
|
||||
from cldr_languages import *
|
||||
from download_cldr import download_cldr
|
||||
from languages import get_country_languages
|
||||
from unicode_paths import UNICODE_DATA_DIR
|
||||
from word_breaks import script_regex, regex_char_range
|
||||
|
||||
SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')
|
||||
|
||||
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
|
||||
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
|
||||
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')
|
||||
|
||||
BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
|
||||
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')
|
||||
|
||||
PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
|
||||
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
|
||||
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
|
||||
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
|
||||
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')
|
||||
|
||||
WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
|
||||
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
|
||||
|
||||
SCRIPTS_HEADER = 'unicode_script_types.h'
|
||||
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
|
||||
|
||||
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
|
||||
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
|
||||
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
|
||||
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
|
||||
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
|
||||
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
|
||||
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'
|
||||
|
||||
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
|
||||
|
||||
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
|
||||
#define UNICODE_SCRIPT_TYPES_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define NUM_CODEPOINTS {num_codepoints}
|
||||
#define MAX_LANGS {max_langs}
|
||||
|
||||
typedef enum {{
|
||||
{script_enum}
|
||||
NUM_SCRIPTS
|
||||
}} script_t;
|
||||
|
||||
#endif
|
||||
'''
|
||||
|
||||
scripts_c_data_template = u'''
|
||||
script_t char_scripts[] = {{
|
||||
{char_scripts}
|
||||
}};
|
||||
|
||||
script_code_t script_codes[] = {{
|
||||
{script_codes}
|
||||
}};
|
||||
|
||||
script_languages_t script_languages[] = {{
|
||||
{script_languages}
|
||||
}};
|
||||
'''
|
||||
|
||||
script_code_template = '{{SCRIPT_{name}, "{code}"}}'
|
||||
|
||||
script_language_template = '{{{num_langs}, {languages}}}'
|
||||
|
||||
|
||||
def unicode_to_integer(u):
|
||||
return int('0x{}'.format(u), 16)
|
||||
|
||||
|
||||
def script_name_constant(i, u):
|
||||
return u'SCRIPT_{} = {}'.format(u.upper(), i)
|
||||
|
||||
|
||||
UNKNOWN_SCRIPT = 'Unknown'
|
||||
COMMON_SCRIPT = 'Common'
|
||||
|
||||
|
||||
def parse_char_range(r):
|
||||
return [unicode_to_integer(u) for u in r.split('..')]
|
||||
|
||||
|
||||
def get_chars_by_script():
|
||||
scripts_file = open(LOCAL_SCRIPTS_FILE)
|
||||
scripts = [None] * NUM_CODEPOINTS
|
||||
|
||||
# Lines look like:
|
||||
# 0041..005A ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
|
||||
for char_range, script, char_class in script_regex.findall(scripts_file.read()):
|
||||
script_range = parse_char_range(char_range)
|
||||
if len(script_range) == 2:
|
||||
for i in xrange(script_range[0], script_range[1] + 1):
|
||||
scripts[i] = script
|
||||
elif script_range:
|
||||
scripts[script_range[0]] = script
|
||||
|
||||
return scripts
|
||||
|
||||
|
||||
COMMENT_CHAR = '#'
|
||||
DELIMITER_CHAR = ';'
|
||||
|
||||
|
||||
def parse_file(f):
|
||||
for line in f:
|
||||
line = line.split(COMMENT_CHAR)[0].strip()
|
||||
if not line:
|
||||
continue
|
||||
tokens = line.split(DELIMITER_CHAR)
|
||||
if tokens:
|
||||
yield [t.strip() for t in tokens]
|
||||
|
||||
|
||||
def get_property_aliases():
|
||||
prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)
|
||||
|
||||
aliases = {}
|
||||
|
||||
for line in parse_file(prop_aliases_file):
|
||||
prop = line[1]
|
||||
prop_aliases = [line[0]] + line[2:]
|
||||
|
||||
for alias in prop_aliases:
|
||||
aliases[alias.lower()] = prop.lower()
|
||||
|
||||
return aliases
|
||||
|
||||
|
||||
def get_property_value_aliases():
|
||||
prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)
|
||||
|
||||
value_aliases = defaultdict(dict)
|
||||
|
||||
for line in parse_file(prop_value_aliases_file):
|
||||
prop = line[0]
|
||||
if prop not in ('ccc', 'gc'):
|
||||
value = line[2]
|
||||
aliases = [line[1]] + line[3:]
|
||||
else:
|
||||
value = line[1]
|
||||
aliases = line[2:]
|
||||
|
||||
for alias in aliases:
|
||||
value_aliases[prop.lower()][alias] = value
|
||||
|
||||
return dict(value_aliases)
|
||||
|
||||
|
||||
def get_unicode_blocks():
|
||||
blocks_file = open(LOCAL_BLOCKS_FILE)
|
||||
|
||||
blocks = defaultdict(list)
|
||||
|
||||
for line in parse_file(blocks_file):
|
||||
char_range, block = line
|
||||
char_range = parse_char_range(char_range)
|
||||
|
||||
if len(char_range) == 2:
|
||||
for i in xrange(char_range[0], char_range[1] + 1):
|
||||
blocks[block.lower()].append(wide_unichr(i))
|
||||
elif char_range:
|
||||
blocks[block.lower()].append(wide_unichr(char_range[0]))
|
||||
|
||||
return dict(blocks)
|
||||
|
||||
|
||||
def get_unicode_properties():
|
||||
props_file = open(LOCAL_PROPS_FILE)
|
||||
|
||||
props = defaultdict(list)
|
||||
|
||||
for line in parse_file(props_file):
|
||||
char_range, prop = line
|
||||
|
||||
char_range = parse_char_range(char_range)
|
||||
|
||||
if len(char_range) == 2:
|
||||
for i in xrange(char_range[0], char_range[1] + 1):
|
||||
props[prop.lower()].append(wide_unichr(i))
|
||||
elif char_range:
|
||||
props[prop.lower()].append(wide_unichr(char_range[0]))
|
||||
|
||||
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
|
||||
for line in parse_file(derived_props_file):
|
||||
char_range, prop = line
|
||||
char_range = parse_char_range(char_range)
|
||||
|
||||
if len(char_range) == 2:
|
||||
for i in xrange(char_range[0], char_range[1] + 1):
|
||||
props[prop.lower()].append(wide_unichr(i))
|
||||
elif char_range:
|
||||
props[prop.lower()].append(wide_unichr(char_range[0]))
|
||||
|
||||
return dict(props)
|
||||
|
||||
|
||||
def get_word_break_properties():
|
||||
props_file = open(LOCAL_WORD_BREAKS_FILE)
|
||||
|
||||
props = defaultdict(list)
|
||||
|
||||
for line in parse_file(props_file):
|
||||
char_range, prop = line
|
||||
|
||||
char_range = parse_char_range(char_range)
|
||||
|
||||
if len(char_range) == 2:
|
||||
for i in xrange(char_range[0], char_range[1] + 1):
|
||||
props[prop].append(wide_unichr(i))
|
||||
elif char_range:
|
||||
props[prop].append(wide_unichr(char_range[0]))
|
||||
|
||||
return dict(props)
|
||||
|
||||
|
||||
def build_master_scripts_list(chars):
|
||||
all_scripts = OrderedDict.fromkeys(filter(bool, chars))
|
||||
|
||||
for i, script in enumerate(all_scripts.keys()):
|
||||
all_scripts[script] = i + 1
|
||||
|
||||
# Unknown script for all characters not covered
|
||||
all_scripts[UNKNOWN_SCRIPT] = 0
|
||||
|
||||
return all_scripts
|
||||
|
||||
|
||||
SCRIPT_ALIASES_SUPPLEMENTAL = {
|
||||
'Hant': 'Han',
|
||||
'Hans': 'Han'
|
||||
}
|
||||
|
||||
|
||||
def get_script_codes(all_scripts):
|
||||
|
||||
if not os.path.exists(LOCAL_ISO_15924_FILE):
|
||||
temp_dir = tempfile.gettempdir()
|
||||
|
||||
script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])
|
||||
|
||||
# This comes as a .zip
|
||||
script_codes_response = requests.get(ISO_15924_URL)
|
||||
zf = ZipFile(StringIO(script_codes_response.content))
|
||||
iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
|
||||
|
||||
# Strip out the comments, etc.
|
||||
temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
|
||||
if line.strip() and not line.strip().startswith('#')])
|
||||
|
||||
f = open(LOCAL_ISO_15924_FILE, 'w')
|
||||
f.write(safe_encode(temp_iso15924_file))
|
||||
f.close()
|
||||
|
||||
script_codes_file = open(LOCAL_ISO_15924_FILE)
|
||||
|
||||
script_codes = {}
|
||||
seen_scripts = set()
|
||||
|
||||
# Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
|
||||
for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
|
||||
if name in all_scripts:
|
||||
script_codes[code] = name
|
||||
seen_scripts.add(name)
|
||||
else:
|
||||
normalized_name = name.split('(')[0].strip()
|
||||
if normalized_name in all_scripts and normalized_name not in seen_scripts:
|
||||
script_codes[code] = normalized_name
|
||||
seen_scripts.add(normalized_name)
|
||||
|
||||
value_aliases = get_property_value_aliases()
|
||||
script_aliases = value_aliases['sc']
|
||||
|
||||
for code, script in script_aliases.iteritems():
|
||||
if code not in script_codes and script in all_scripts:
|
||||
script_codes[code] = script
|
||||
|
||||
script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
|
||||
|
||||
return script_codes
|
||||
|
||||
|
||||
SCRIPT_CODE_ALIASES = {
|
||||
'Jpan': ['Hani', 'Hira', 'Kana'],
|
||||
'Kore': ['Hang', 'Han']
|
||||
}
|
||||
|
||||
|
||||
def extract_language_scripts(xml):
|
||||
language_scripts = defaultdict(list)
|
||||
|
||||
for lang in xml.xpath('//languageData/language'):
|
||||
language_code = lang.attrib['type'].lower()
|
||||
scripts = lang.get('scripts')
|
||||
if not scripts:
|
||||
continue
|
||||
for script in scripts.split():
|
||||
script_aliases = SCRIPT_CODE_ALIASES.get(script)
|
||||
if not script_aliases:
|
||||
language_scripts[language_code].append(script)
|
||||
else:
|
||||
language_scripts[language_code].extend(script_aliases)
|
||||
|
||||
return language_scripts
|
||||
|
||||
|
||||
def batch_iter(iterable, batch_size):
|
||||
source_iter = iter(iterable)
|
||||
while True:
|
||||
batch = list(islice(source_iter, batch_size))
|
||||
if len(batch) > 0:
|
||||
yield batch
|
||||
else:
|
||||
return
|
||||
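# Example: yields successive fixed-size batches, with a shorter final batch, e.g.
#
#   list(batch_iter(xrange(5), 2))  # -> [[0, 1], [2, 3], [4]]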
|
||||
|
||||
def get_script_languages():
|
||||
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
|
||||
# to identify the language. We keep track of those single language scripts to inform
|
||||
# the language classifier
|
||||
|
||||
chars = get_chars_by_script()
|
||||
all_scripts = build_master_scripts_list(chars)
|
||||
script_codes = get_script_codes(all_scripts)
|
||||
|
||||
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
|
||||
cldr_xml = etree.parse(cldr_supplemental_data)
|
||||
language_scripts = extract_language_scripts(cldr_xml)
|
||||
|
||||
country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
|
||||
if not os.path.exists(country_languages_path):
|
||||
fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)
|
||||
|
||||
country_language_file = open(country_languages_path)
|
||||
country_language_reader = csv.reader(country_language_file, delimiter='\t')
|
||||
|
||||
countries = set([country for country, lang, script, pct, is_official
|
||||
in country_language_reader])
|
||||
|
||||
spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))
|
||||
|
||||
script_code_languages = defaultdict(list)
|
||||
for language, scripts in language_scripts.iteritems():
|
||||
if language not in spoken_languages:
|
||||
continue
|
||||
for script in scripts:
|
||||
script_code_languages[script].append(language)
|
||||
|
||||
script_languages = defaultdict(list)
|
||||
|
||||
for script_code, script_name in script_codes.iteritems():
|
||||
langs = script_code_languages.get(script_code, [])
|
||||
script_languages[script_name].extend(langs)
|
||||
|
||||
for name in all_scripts.iterkeys():
|
||||
script_languages.setdefault(name, [])
|
||||
|
||||
return script_languages
|
||||
|
||||
|
||||
def main(out_dir=SRC_DIR):
|
||||
# Output is a C header and data file, see templates
|
||||
out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
|
||||
out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
|
||||
|
||||
download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
|
||||
download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
|
||||
download_file(PROPS_URL, LOCAL_PROPS_FILE)
|
||||
download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
|
||||
download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
|
||||
download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
|
||||
download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)
|
||||
|
||||
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
|
||||
download_cldr()
|
||||
|
||||
chars = get_chars_by_script()
|
||||
all_scripts = build_master_scripts_list(chars)
|
||||
script_codes = get_script_codes(all_scripts)
|
||||
|
||||
script_languages = get_script_languages()
|
||||
|
||||
max_langs = 0
|
||||
|
||||
for script, langs in script_languages.iteritems():
|
||||
num_langs = len(langs)
|
||||
if num_langs > max_langs:
|
||||
max_langs = num_langs
|
||||
|
||||
# Generate C header and constants
|
||||
|
||||
script_enum = u'''
|
||||
'''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])
|
||||
|
||||
out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
|
||||
max_langs=max_langs,
|
||||
script_enum=script_enum))
|
||||
out_header.close()
|
||||
|
||||
# Generate C data file
|
||||
|
||||
char_scripts_data = u''',
|
||||
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
|
||||
|
||||
script_codes_data = u''',
|
||||
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])
|
||||
|
||||
sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]
|
||||
|
||||
script_language_data = u''',
|
||||
'''.join([script_language_template.format(num_langs=len(langs),
|
||||
languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
|
||||
for langs in sorted_lang_scripts])
|
||||
|
||||
out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
|
||||
char_scripts=char_scripts_data,
|
||||
script_codes=script_codes_data,
|
||||
script_languages=script_language_data))
|
||||
out_file.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(*sys.argv[1:])
|
||||
140
scripts/geodata/i18n/word_breaks.py
Normal file
140
scripts/geodata/i18n/word_breaks.py
Normal file
@@ -0,0 +1,140 @@
|
||||
'''
|
||||
word_breaks.py
|
||||
|
||||
This script is used to automatically build ranges of unicode characters
|
||||
from the unicode spec's word break properties. These ranges help us
|
||||
build a tokenizer that does the right thing in every language with regard
|
||||
to word segmentation. The lines output by this script can be pasted
|
||||
into scanner.re before compilation.
|
||||
'''
|
||||
|
||||
import requests
|
||||
from collections import defaultdict
|
||||
import re
|
||||
|
||||
# Operate on WordBreakProperty.txt file
|
||||
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
|
||||
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
|
||||
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
|
||||
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
|
||||
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
|
||||
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
|
||||
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
|
||||
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
|
||||
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
|
||||
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
|
||||
|
||||
# Operate on Scripts.txt file
|
||||
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
|
||||
|
||||
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
|
||||
|
||||
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
|
||||
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
|
||||
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
|
||||
|
||||
ideographic_scripts = set([
|
||||
'han',
|
||||
'hiragana',
|
||||
'hangul',
|
||||
'tibetan',
|
||||
'thai',
|
||||
'lao',
|
||||
'javanese',
|
||||
'balinese',
|
||||
'yi',
|
||||
])
|
||||
|
||||
|
||||
def regex_char_range(match):
|
||||
r = match.split('..')
|
||||
# Wide version
|
||||
return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])
|
||||
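# Example: the return value is literal '\u'/'\U' escape text meant to be pasted into
# scanner.re, not evaluated Python escapes, e.g.
#
#   regex_char_range('0041..005A')  # -> the text \u0041-\u005a
#   regex_char_range('1F600')       # -> the text \U0001f600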
|
||||
|
||||
def get_letter_range(text, *regexes):
|
||||
char_ranges = []
|
||||
for line in text.split('\n'):
|
||||
for regex in regexes:
|
||||
m = regex.match(line)
|
||||
if m:
|
||||
char_ranges.append(regex_char_range(m.group(1)))
|
||||
return char_ranges
|
||||
|
||||
|
||||
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
|
||||
char_ranges = []
|
||||
for char_range, script, char_class in script_regex.findall(text):
|
||||
if script.lower() in scripts and char_class_regex.match(char_class):
|
||||
char_ranges.append(regex_char_range(char_range))
|
||||
return char_ranges
|
||||
|
||||
|
||||
def get_char_class(text, char_class_regex):
|
||||
char_ranges = []
|
||||
for char_range, script, char_class in script_regex.findall(text):
|
||||
if char_class_regex.match(char_class):
|
||||
char_ranges.append(regex_char_range(char_range))
|
||||
return char_ranges
|
||||
|
||||
|
||||
hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')
|
||||
|
||||
|
||||
def get_hangul_syllable_ranges(text):
|
||||
char_ranges = defaultdict(list)
|
||||
for line in text.split('\n'):
|
||||
m = hangul_syllable_type_regex.match(line)
|
||||
if m:
|
||||
char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
|
||||
return dict(char_ranges)
|
||||
|
||||
|
||||
name_funcs = [
|
||||
('hebrew_letter_chars', hebrew_letter_regex),
|
||||
('format_chars', format_regex),
|
||||
('extend_chars', extend_regex),
|
||||
('katakana_chars', katakana_regex),
|
||||
('letter_other_alpha_chars', other_alpha_letter_regex),
|
||||
('mid_letter_chars', mid_letter_regex),
|
||||
('mid_number_chars', mid_number_regex),
|
||||
('mid_num_letter_chars', mid_num_letter_regex),
|
||||
('numeric_chars', numeric_regex),
|
||||
('extend_num_letter_chars', extend_num_letter_regex),
|
||||
]
|
||||
|
||||
IDEOGRAPHIC_CHARS = 'ideographic_chars'
|
||||
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
|
||||
|
||||
numbers_regex = re.compile('N[ol]', re.I)
|
||||
letters_regex = re.compile('L*', re.I)
|
||||
|
||||
|
||||
def main():
|
||||
''' Insert these lines into scanner.re '''
|
||||
response = requests.get(WORD_BREAK_PROPERTIES_URL)
|
||||
|
||||
if response.ok:
|
||||
for name, reg in name_funcs:
|
||||
s = get_letter_range(response.content, reg)
|
||||
print '{} = [{}];'.format(name, ''.join(s))
|
||||
|
||||
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
|
||||
|
||||
if response.ok:
|
||||
syllable_ranges = get_hangul_syllable_ranges(response.content)
|
||||
for name, ranges in syllable_ranges.iteritems():
|
||||
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
|
||||
|
||||
response = requests.get(SCRIPTS_URL)
|
||||
if response.ok:
|
||||
s = ''.join(get_char_class(response.content, numbers_regex))
|
||||
|
||||
print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))
|
||||
|
||||
s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
|
||||
print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
0
scripts/geodata/intersections/__init__.py
Normal file
0
scripts/geodata/intersections/__init__.py
Normal file
18
scripts/geodata/intersections/query.py
Normal file
18
scripts/geodata/intersections/query.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from collections import namedtuple
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
IntersectionQuery = namedtuple('IntersectionQuery', 'road1, intersection_phrase, road2')
|
||||
|
||||
NULL_INTERSECTION_QUERY = IntersectionQuery(None, None, None)
|
||||
|
||||
|
||||
class Intersection(object):
|
||||
@classmethod
|
||||
def phrase(cls, language, country=None):
|
||||
values, probs = address_config.alternative_probabilities('cross_streets.intersection', language, country=country)
|
||||
if not values:
|
||||
return None
|
||||
phrase, props = weighted_choice(values, probs)
|
||||
return phrase
|
||||
0
scripts/geodata/language_id/__init__.py
Normal file
0
scripts/geodata/language_id/__init__.py
Normal file
100
scripts/geodata/language_id/create_language_training_data.py
Normal file
100
scripts/geodata/language_id/create_language_training_data.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME, TOPONYM_LANGUAGE_DATA_FILENAME
|
||||
|
||||
LANGUAGES_ALL_FILE = 'languages.all'
|
||||
LANGUAGES_RANDOM_FILE = 'languages.random'
|
||||
LANGUAGES_TRAIN_FILE = 'languages.train'
|
||||
LANGUAGES_CV_FILE = 'languages.cv'
|
||||
LANGUAGES_TEST_FILE = 'languages.test'
|
||||
|
||||
|
||||
def create_language_training_data(osm_dir, split_data=True, train_split=0.8, cv_split=0.1):
|
||||
language_all_path = os.path.join(osm_dir, LANGUAGES_ALL_FILE)
|
||||
|
||||
ways_path = os.path.join(osm_dir, WAYS_LANGUAGE_DATA_FILENAME)
|
||||
|
||||
if os.system(' '.join(['cat', ways_path, '>', language_all_path])) != 0:
|
||||
raise SystemError('Could not find {}'.format(ways_path))
|
||||
|
||||
addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)
|
||||
|
||||
if os.system(' '.join(['cat', addresses_path, '>>', language_all_path])) != 0:
|
||||
raise SystemError('Could not find {}'.format(addresses_path))
|
||||
|
||||
formatted_path = os.path.join(osm_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)
|
||||
|
||||
if os.system(' '.join(['cat', formatted_path, '>>', language_all_path])) != 0:
|
||||
raise SystemError('Could not find {}'.format(formatted_path))
|
||||
|
||||
toponyms_path = os.path.join(osm_dir, TOPONYM_LANGUAGE_DATA_FILENAME)
|
||||
|
||||
if os.system(' '.join(['cat', toponyms_path, '>>', language_all_path])) != 0:
|
||||
raise SystemError('Could not find {}'.format(toponyms_path))
|
||||
|
||||
languages_random_path = os.path.join(osm_dir, LANGUAGES_RANDOM_FILE)
|
||||
|
||||
if os.system(u' '.join(['shuf', '--random-source=/dev/urandom', language_all_path, '>', languages_random_path])) != 0:
|
||||
raise SystemError('shuffle failed')
|
||||
|
||||
languages_train_path = os.path.join(osm_dir, LANGUAGES_TRAIN_FILE)
|
||||
|
||||
if split_data:
|
||||
languages_test_path = os.path.join(osm_dir, LANGUAGES_TEST_FILE)
|
||||
|
||||
num_lines = sum((1 for line in open(languages_random_path)))
|
||||
train_lines = int(train_split * num_lines)
|
||||
|
||||
test_lines = num_lines - train_lines
|
||||
cv_lines = int(test_lines * (cv_split / (1.0 - train_split))) + 1
|
||||
|
||||
subprocess.check_call(['split', '-l', str(train_lines), languages_random_path, os.path.join(osm_dir, 'language-split-')])
|
||||
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_train_path])
|
||||
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
|
||||
|
||||
languages_cv_path = os.path.join(osm_dir, LANGUAGES_CV_FILE)
|
||||
|
||||
subprocess.check_call(['split', '-l', str(cv_lines), languages_test_path, os.path.join(osm_dir, 'language-split-')])
|
||||
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_cv_path])
|
||||
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
|
||||
else:
|
||||
subprocess.check_call(['mv', languages_random_path, languages_train_path])
|
||||
|
||||
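# Worked example (illustrative, not part of the original file): with 10,000
# input lines, train_split=0.8 and cv_split=0.1 the splits above come out to
#   train_lines = int(0.8 * 10000)            = 8000
#   test_lines  = 10000 - 8000                = 2000
#   cv_lines    = int(2000 * (0.1 / 0.2)) + 1 = 1001
# so languages.cv gets 1001 lines and languages.test keeps the remaining 999.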
if __name__ == '__main__':
|
||||
# Handle argument parsing here
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-n', '--no-split',
|
||||
action='store_false',
|
||||
default=True,
|
||||
help='Do not split data into train/cv/test')
|
||||
|
||||
parser.add_argument('-t', '--train-split',
|
||||
type=float,
|
||||
default=0.8,
|
||||
help='Train split percentage as a float (default 0.8)')
|
||||
|
||||
parser.add_argument('-c', '--cv-split',
|
||||
type=float,
|
||||
default=0.1,
|
||||
help='Cross-validation split percentage as a float (default 0.1)')
|
||||
|
||||
parser.add_argument('-o', '--osm-dir',
|
||||
default=os.getcwd(),
|
||||
help='OSM directory')
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.train_split + args.cv_split >= 1.0:
|
||||
raise ValueError('Train split + cross-validation split must be less than 1.0')
|
||||
|
||||
if not os.path.exists(args.osm_dir):
|
||||
raise ValueError('OSM directory does not exist')
|
||||
|
||||
create_language_training_data(args.osm_dir, split_data=args.no_split, train_split=args.train_split, cv_split=args.cv_split)
|
||||
176
scripts/geodata/language_id/disambiguation.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
|
||||
|
||||
from geodata.address_expansions.gazetteers import *
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
from geodata.string_utils import wide_iter, wide_ord
|
||||
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
|
||||
from geodata.text.normalize import normalized_tokens, normalize_string
|
||||
from geodata.text.tokenize import tokenize
|
||||
from geodata.text.token_types import token_types
|
||||
|
||||
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
|
||||
|
||||
# For toponyms, we want to limit the countries we consider to those where
|
||||
# the place names can themselves be considered training examples of the language
|
||||
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
|
||||
'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
|
||||
'fr': set(['fr']),
|
||||
'it': set(['it']),
|
||||
'de': set(['de', 'at']),
|
||||
'nl': set(['nl']),
|
||||
'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
|
||||
've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
|
||||
'ni', 'hn']),
|
||||
'pt': set(['pt', 'br']),
|
||||
}
|
||||
|
||||
char_scripts = get_chars_by_script()
|
||||
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
|
||||
lang_scripts = defaultdict(set)
|
||||
|
||||
for script, langs in six.iteritems(script_languages):
|
||||
for lang in langs:
|
||||
lang_scripts[lang].add(script)
|
||||
|
||||
lang_scripts = dict(lang_scripts)
|
||||
|
||||
UNKNOWN_SCRIPT = 'Unknown'
|
||||
COMMON_SCRIPT = 'Common'
|
||||
MAX_ASCII = 127
|
||||
|
||||
|
||||
def get_string_script(s):
|
||||
s = safe_decode(s)
|
||||
str_len = len(s)
|
||||
script = last_script = UNKNOWN_SCRIPT
|
||||
is_ascii = True
|
||||
script_len = 0
|
||||
for c in wide_iter(s):
|
||||
script = char_scripts[wide_ord(c)]
|
||||
|
||||
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
|
||||
script = last_script
|
||||
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
|
||||
if (script_len < str_len):
|
||||
for c in reversed(list(wide_iter(s[:script_len]))):
|
||||
if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
|
||||
script_len -= 1
|
||||
break
|
||||
is_ascii = is_ascii and ord(c) <= MAX_ASCII
|
||||
script_len += 1
|
||||
if script != UNKNOWN_SCRIPT:
|
||||
last_script = script
|
||||
return (last_script, script_len, is_ascii)
|
||||
|
||||
LATIN_SCRIPT = 'Latin'
|
||||
UNKNOWN_LANGUAGE = 'unk'
|
||||
AMBIGUOUS_LANGUAGE = 'xxx'
|
||||
|
||||
|
||||
def disambiguate_language_script(text, languages):
|
||||
script_langs = {}
|
||||
read_len = 0
|
||||
while read_len < len(text):
|
||||
script, script_len, is_ascii = get_string_script(text[read_len:])
|
||||
if script != LATIN_SCRIPT:
|
||||
script_valid = [l for l, d in languages if l in script_languages.get(script, [])]
|
||||
script_langs[script] = set(script_valid)
|
||||
|
||||
if script_len == len(text) and len(script_valid) == 1:
|
||||
return script_valid[0], script_langs
|
||||
|
||||
read_len += script_len
|
||||
|
||||
return UNKNOWN_LANGUAGE, script_langs
|
||||
|
||||
LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}
|
||||
|
||||
|
||||
def has_non_latin_script(languages):
|
||||
for lang, is_default in languages:
|
||||
scripts = lang_scripts.get(lang, set())
|
||||
if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def disambiguate_language(text, languages, scripts_only=False):
|
||||
text = safe_decode(text)
|
||||
valid_languages = OrderedDict(languages)
|
||||
|
||||
language_script, script_langs = disambiguate_language_script(text, languages)
|
||||
if language_script is not UNKNOWN_LANGUAGE:
|
||||
return language_script
|
||||
|
||||
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
||||
|
||||
tokens = normalized_tokens(text)
|
||||
|
||||
current_lang = None
|
||||
possible_lang = None
|
||||
|
||||
seen_languages = set()
|
||||
|
||||
for t, c, l, data in street_types_gazetteer.filter(tokens):
|
||||
if c == token_types.PHRASE:
|
||||
valid = OrderedDict()
|
||||
data = [safe_decode(d).split(u'|') for d in data]
|
||||
potentials = set([l for l, d, i, c in data if l in valid_languages])
|
||||
potential_defaults = set([l for l in potentials if valid_languages[l]])
|
||||
|
||||
phrase_len = sum((len(t_i[0]) for t_i in t))
|
||||
for lang, dictionary, is_canonical, canonical in data:
|
||||
is_canonical = int(is_canonical)
|
||||
is_stopword = dictionary == 'stopword'
|
||||
if lang not in valid_languages or (is_stopword and len(potentials) > 1):
|
||||
continue
|
||||
is_default = valid_languages[lang]
|
||||
|
||||
lang_valid = is_default or not seen_languages or lang in seen_languages
|
||||
|
||||
if lang_valid and phrase_len > 1 and ((is_canonical and not is_stopword) or (is_default and (len(potentials) == 1 or len(potential_defaults) == 1))):
|
||||
valid[lang] = 1
|
||||
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
elif is_stopword and is_canonical and not is_default and lang in seen_languages:
|
||||
valid[lang] = 1
|
||||
elif not seen_languages and len(potentials) == 1 and phrase_len > 1:
|
||||
possible_lang = lang if possible_lang is None or possible_lang == lang else None
|
||||
|
||||
if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
|
||||
(not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))):
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
|
||||
valid = valid.keys()
|
||||
|
||||
if len(valid) == 1:
|
||||
current_lang = valid[0]
|
||||
else:
|
||||
valid_default = [l for l in valid if valid_languages.get(l)]
|
||||
if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang:
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
elif len(valid_default) == 1:
|
||||
current_lang = valid_default[0]
|
||||
|
||||
if any((current_lang not in langs for script, langs in script_langs.iteritems())):
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
|
||||
seen_languages.update(valid)
|
||||
|
||||
if current_lang is not None:
|
||||
return current_lang
|
||||
elif possible_lang is not None:
|
||||
if not any((possible_lang not in langs for script, langs in script_langs.iteritems())):
|
||||
return possible_lang
|
||||
else:
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
return UNKNOWN_LANGUAGE
|
||||
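# Illustrative sketch (not part of the original file): candidate languages are
# passed as (language, is_default) pairs. The example input is an assumption;
# the actual result depends on the loaded gazetteers:
#
#   disambiguate_language(u'Rua do Sol', [('pt', True), ('es', False)])
#   # -> 'pt' if a phrase like u'rua' is only valid for Portuguese here,
#   #    UNKNOWN_LANGUAGE if nothing disambiguates, AMBIGUOUS_LANGUAGE otherwise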
53
scripts/geodata/language_id/sample.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import random
|
||||
import bisect
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
'''
|
||||
Top languages on the Interwebs. Not a probability distribution
|
||||
as it doesn't sum to 1 and websites can be in more than one
|
||||
language. Reference:
|
||||
|
||||
https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
|
||||
'''
|
||||
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([
|
||||
('en', 0.555),
|
||||
('ru', 0.059),
|
||||
('de', 0.058),
|
||||
('ja', 0.05),
|
||||
('es', 0.046),
|
||||
('fr', 0.04),
|
||||
('zh', 0.028),
|
||||
('pt', 0.025),
|
||||
('it', 0.019),
|
||||
('pl', 0.017),
|
||||
('tr', 0.015),
|
||||
('nl', 0.013),
|
||||
('fa', 0.009),
|
||||
('ar', 0.008),
|
||||
('ko', 0.007),
|
||||
])
|
||||
|
||||
|
||||
def cdf(probs):
|
||||
total = float(sum(probs))
|
||||
|
||||
result = []
|
||||
cumulative = 0.0
|
||||
for w in probs:
|
||||
cumulative += w
|
||||
result.append(cumulative / total)
|
||||
return result
|
||||
|
||||
|
||||
MOST_COMMON_INTERNET_LANGUAGES = INTERNET_LANGUAGE_DISTRIBUTION.keys()
|
||||
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
|
||||
|
||||
|
||||
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
|
||||
cdf=INTERNET_LANGUAGES_CDF):
|
||||
assert len(keys) == len(cdf)
|
||||
|
||||
sample = random.random()
|
||||
idx = bisect.bisect(cdf, sample)
|
||||
return keys[idx]
|
||||
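# Illustrative sketch (not part of the original file): drawing repeatedly from
# the CDF defined above. The sample size is an arbitrary choice.
if __name__ == '__main__':
    counts = {}
    for _ in xrange(10000):
        lang = sample_random_language()
        counts[lang] = counts.get(lang, 0) + 1
    # 'en' should dominate since it has by far the largest weight above
    print counts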
10
scripts/geodata/log.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import logging
|
||||
import sys
|
||||
|
||||
|
||||
def log_to_file(f, level=logging.INFO):
|
||||
handler = logging.StreamHandler(f)
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
|
||||
handler.setFormatter(formatter)
|
||||
logging.root.addHandler(handler)
|
||||
logging.root.setLevel(level)
|
||||
0
scripts/geodata/math/__init__.py
Normal file
5
scripts/geodata/math/floats.py
Normal file
@@ -0,0 +1,5 @@
|
||||
FLOAT_EPSILON = 1e-09
|
||||
|
||||
|
||||
def isclose(a, b, rel_tol=FLOAT_EPSILON, abs_tol=0.0):
|
||||
return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
|
||||
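# Illustrative sketch (not part of the original file): how the relative and
# absolute tolerances combine. The example values are arbitrary.
if __name__ == '__main__':
    assert isclose(1.0, 1.0 + 1e-10)
    assert not isclose(1.0, 1.001)
    # comparisons against zero need an absolute tolerance
    assert isclose(0.0, 1e-12, abs_tol=1e-9)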
42
scripts/geodata/math/sampling.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import bisect
|
||||
import random
|
||||
import sys
|
||||
|
||||
from geodata.math.floats import isclose, FLOAT_EPSILON
|
||||
|
||||
|
||||
def weighted_choice(values, cdf):
|
||||
"""Pick one of n values given a discrete cumulative distribution"""
|
||||
assert values and cdf, 'values and probabilities cannot be empty/None'
|
||||
assert len(values) == len(cdf), 'len(values) != len(probs)'
|
||||
assert all(p >= 0.0 and p <= (1.0 + FLOAT_EPSILON) for p in cdf), 'Probabilities not valid: {}'.format(cdf)
|
||||
|
||||
x = random.random()
|
||||
i = bisect.bisect(cdf, x)
|
||||
return values[i]
|
||||
|
||||
|
||||
def check_probability_distribution(probs):
|
||||
cumulative = 0.0
|
||||
for p in probs:
|
||||
assert p >= 0.0, 'Probabilities cannot be negative'
|
||||
assert p <= 1.0, 'Probabilities cannot be > 1.0'
|
||||
cumulative += p
|
||||
assert isclose(cumulative, 1.0), 'Probabilities must sum to 1: probs={}, cumulative={}'.format(probs, cumulative)
|
||||
|
||||
|
||||
def cdf(probs):
|
||||
total = 0.0
|
||||
cumulative = [0.0] * len(probs)
|
||||
for i, p in enumerate(probs):
|
||||
total += p
|
||||
cumulative[i] = total
|
||||
|
||||
return cumulative
|
||||
|
||||
|
||||
def zipfian_distribution(n, b=1.0):
|
||||
"""Distribution where the ith item's frequency is proportional to its rank"""
|
||||
frequencies = [1. / (i ** b) for i in xrange(1, n + 1)]
|
||||
total = sum(frequencies)
|
||||
return [f / total for f in frequencies]
|
||||
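# Illustrative sketch (not part of the original file): sampling from a Zipfian
# distribution over five ranks. The sample size is arbitrary.
if __name__ == '__main__':
    probs = zipfian_distribution(5)
    check_probability_distribution(probs)
    values = ['a', 'b', 'c', 'd', 'e']
    cumulative = cdf(probs)
    samples = [weighted_choice(values, cumulative) for _ in xrange(10000)]
    # rank 1 ('a') should be drawn most often, rank 5 ('e') least often
    print samples.count('a'), samples.count('e')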
0
scripts/geodata/metro_stations/__init__.py
Normal file
52
scripts/geodata/metro_stations/reverse_geocode.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import six
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.address_expansions.abbreviations import abbreviate
|
||||
from geodata.coordinates.conversion import latlon_to_decimal
|
||||
from geodata.math.floats import isclose
|
||||
from geodata.osm.extract import parse_osm
|
||||
from geodata.places.reverse_geocode import PlaceReverseGeocoder
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class MetroStationReverseGeocoder(PlaceReverseGeocoder):
|
||||
GEOHASH_PRECISION = 7
|
||||
|
||||
include_property_patterns = PlaceReverseGeocoder.include_property_patterns | set([
|
||||
'operator',
|
||||
'network',
|
||||
'station',
|
||||
])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Handle argument parsing here
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-m', '--osm-metro-stations-file',
|
||||
help='Path to OSM metro stations file')
|
||||
|
||||
parser.add_argument('-p', '--precision',
|
||||
type=int,
|
||||
default=MetroStationReverseGeocoder.GEOHASH_PRECISION,
|
||||
help='Geohash precision')
|
||||
|
||||
parser.add_argument('-o', '--out-dir',
|
||||
default=os.getcwd(),
|
||||
help='Output directory')
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.osm_metro_stations_file:
|
||||
index = MetroStationReverseGeocoder.create_from_osm_file(args.osm_metro_stations_file, args.out_dir, precision=args.precision)
|
||||
else:
|
||||
parser.error('Must specify metro stations file')
|
||||
|
||||
index.save()
|
||||
0
scripts/geodata/names/__init__.py
Normal file
102
scripts/geodata/names/deduping.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from geodata.text.normalize import *
|
||||
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity
|
||||
|
||||
from collections import Counter
|
||||
|
||||
|
||||
class NameDeduper(object):
|
||||
'''
|
||||
Base class for deduping geographic entity names e.g. for matching names
|
||||
from different databases (concordances).
|
||||
|
||||
By default uses Soft TFIDF similarity (see geodata.names.similarity)
|
||||
for non-ideographic names and Jaccard similarity with word frequencies
|
||||
for ideographic names.
|
||||
|
||||
See class attributes for options.
|
||||
'''
|
||||
|
||||
stopwords = set()
|
||||
'''Set of words which should not be considered in similarity'''
|
||||
|
||||
discriminative_words = set()
|
||||
'''Set of words which break similarity e.g. North, Heights'''
|
||||
|
||||
discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
|
||||
'''Set of categories which, if not contained in both sets, break similarity'''
|
||||
|
||||
content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
|
||||
'''Set of categories representing content tokens (default setting ignores punctuation)'''
|
||||
|
||||
replacements = {}
|
||||
'''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''
|
||||
|
||||
dupe_threshold = 0.9
|
||||
'''Similarity threshold above which entities are considered dupes'''
|
||||
|
||||
ignore_parentheticals = True
|
||||
'''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
|
||||
|
||||
@classmethod
|
||||
def tokenize(cls, s):
|
||||
return normalized_tokens(s)
|
||||
|
||||
@classmethod
|
||||
def content_tokens(cls, s):
|
||||
tokens = cls.tokenize(s)
|
||||
if cls.ignore_parentheticals:
|
||||
tokens = remove_parens(tokens)
|
||||
return [(cls.replacements.get(t, t), c)
|
||||
for t, c in tokens
|
||||
if c in cls.content_categories and
|
||||
t not in cls.stopwords]
|
||||
|
||||
@classmethod
|
||||
def possible_match(cls, tokens1, tokens2):
|
||||
if not cls.discriminative_categories and not cls.discriminative_words:
|
||||
return True
|
||||
|
||||
intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
|
||||
invalid = any((True for t, c in tokens1 + tokens2
|
||||
if t not in intersection and
|
||||
(c in cls.discriminative_categories or t in cls.discriminative_words)
|
||||
))
|
||||
return not invalid
|
||||
|
||||
@classmethod
|
||||
def compare_ideographs(cls, s1, s2):
|
||||
tokens1 = cls.content_tokens(s1)
|
||||
tokens2 = cls.content_tokens(s2)
|
||||
|
||||
if not cls.possible_match(tokens1, tokens2):
|
||||
return 0.0
|
||||
|
||||
tokens1_only = [t for t, c in tokens1]
|
||||
tokens2_only = [t for t, c in tokens2]
|
||||
|
||||
if u''.join(tokens1_only) == u''.join(tokens2_only):
|
||||
return 1.0
|
||||
else:
|
||||
# Many Han/Hangul characters are common, shouldn't use IDF
|
||||
return jaccard_similarity(tokens1_only, tokens2_only)
|
||||
|
||||
@classmethod
|
||||
def compare(cls, s1, s2, idf):
|
||||
tokens1 = cls.content_tokens(s1)
|
||||
tokens2 = cls.content_tokens(s2)
|
||||
|
||||
if not cls.possible_match(tokens1, tokens2):
|
||||
return 0.0
|
||||
|
||||
tokens1_only = [t for t, c in tokens1]
|
||||
tokens2_only = [t for t, c in tokens2]
|
||||
|
||||
# Test exact equality, also handles things like Cabbage Town == Cabbagetown
|
||||
if u''.join(tokens1_only) == u''.join(tokens2_only):
|
||||
return 1.0
|
||||
else:
|
||||
return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)
|
||||
|
||||
@classmethod
|
||||
def is_dupe(cls, sim):
|
||||
return sim >= cls.dupe_threshold
|
||||
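# Illustrative usage sketch (not part of the original file). The idf object is
# an assumption: any index exposing tfidf_vector(word_counts), e.g. the
# IDFIndex from geodata.statistics.tf_idf referenced in the docstring above:
#
#   sim = NameDeduper.compare(u'Cabbage Town', u'Cabbagetown', idf)
#   if NameDeduper.is_dupe(sim):
#       pass  # treat both names as the same entity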
119
scripts/geodata/names/normalization.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
import re
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'boundaries', 'names', 'languages')
|
||||
|
||||
|
||||
class NameAffixes(object):
|
||||
def __init__(self, config_dir=AFFIX_CONFIG_DIR):
|
||||
self.config_dir = config_dir
|
||||
|
||||
self.language_prefixes = {}
|
||||
self.language_suffixes = {}
|
||||
|
||||
self.language_prefix_regexes = {}
|
||||
self.language_suffix_regexes = {}
|
||||
|
||||
self.language_prefix_sim_only_regexes = {}
|
||||
self.language_suffix_sim_only_regexes = {}
|
||||
|
||||
for filename in os.listdir(config_dir):
|
||||
if not filename.endswith('.yaml'):
|
||||
continue
|
||||
lang = filename.rsplit('.yaml')[0]
|
||||
|
||||
conf = yaml.load(open(os.path.join(config_dir, filename)))
|
||||
self.add_affixes(lang, conf)
|
||||
|
||||
for country, country_conf in six.iteritems(conf.get('countries', {})):
|
||||
country_lang = (country, lang)
|
||||
self.add_affixes(country_lang, country_conf)
|
||||
|
||||
def add_affixes(self, lang, *confs):
|
||||
prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
|
||||
prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
|
||||
|
||||
self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
|
||||
|
||||
suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
|
||||
suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
|
||||
|
||||
self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
|
||||
|
||||
whitespace_phrase = six.u('[ \-]')
|
||||
|
||||
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
|
||||
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
|
||||
|
||||
if all_prefixes:
|
||||
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
|
||||
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
|
||||
|
||||
if all_suffixes:
|
||||
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
|
||||
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
|
||||
|
||||
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
|
||||
if sim_only_prefixes:
|
||||
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
|
||||
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
|
||||
|
||||
sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
|
||||
if sim_only_suffixes:
|
||||
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
|
||||
|
||||
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
|
||||
|
||||
def replace_prefixes(self, name, lang, country=None, sim_only=False):
|
||||
name = safe_decode(name).strip()
|
||||
|
||||
if not sim_only or lang not in self.language_prefix_sim_only_regexes:
|
||||
d = self.language_prefix_regexes
|
||||
else:
|
||||
d = self.language_prefix_sim_only_regexes
|
||||
|
||||
re = None
|
||||
if country is not None:
|
||||
re = d.get((country, lang))
|
||||
if re:
|
||||
name = re.sub(six.u(''), name)
|
||||
|
||||
re = d.get(lang)
|
||||
|
||||
if not re:
|
||||
return name
|
||||
|
||||
return re.sub(six.u(''), name)
|
||||
|
||||
def replace_suffixes(self, name, lang, country=None, sim_only=False):
|
||||
name = safe_decode(name).strip()
|
||||
|
||||
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
|
||||
d = self.language_suffix_regexes
|
||||
else:
|
||||
d = self.language_suffix_sim_only_regexes
|
||||
|
||||
re = None
|
||||
if country is not None:
|
||||
re = d.get((country, lang))
|
||||
if re:
|
||||
name = re.sub(six.u(''), name)
|
||||
|
||||
re = d.get(lang)
|
||||
|
||||
if not re:
|
||||
return name
|
||||
|
||||
return re.sub(six.u(''), name)
|
||||
|
||||
def replace_affixes(self, name, lang, country=None, sim_only=False):
|
||||
return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)
|
||||
|
||||
name_affixes = NameAffixes()
|
||||
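# Illustrative sketch (not part of the original file): affix stripping depends
# entirely on the per-language YAML configs, so the example phrase and output
# below are assumptions:
#
#   name_affixes.replace_affixes(u'Gemeente Amsterdam', 'nl')
#   # -> u'Amsterdam', assuming u'gemeente' is listed as a Dutch prefix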
85
scripts/geodata/names/similarity.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import Levenshtein
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
def ordered_word_count(tokens):
|
||||
counts = OrderedDict()
|
||||
for k in tokens:
|
||||
counts[k] = counts.get(k, 0) + 1
|
||||
return counts
|
||||
|
||||
|
||||
def soft_tfidf_similarity(tokens1, tokens2, idf,
|
||||
sim_func=Levenshtein.jaro_winkler, theta=0.95,
|
||||
common_word_threshold=100):
|
||||
'''
|
||||
Soft TFIDF is a hybrid distance function using both global statistics
|
||||
(inverse document frequency) and local similarity (Jaro-Winkler).
|
||||
|
||||
For each token t1 in the first string, find the token t2 which is most
|
||||
similar to t1 in terms of the local distance function.
|
||||
|
||||
The SoftTFIDF similarity is the dot product of the max token similarities
|
||||
and the cosine similarity of the TF-IDF vectors for all tokens where
|
||||
the max similarity is >= a given threshold theta.
|
||||
|
||||
sim_func should return a number in the range [0, 1] and theta
|
||||
should be in the same range i.e. this would _not_ work for a metric like
|
||||
basic Levenshtein or Damerau-Levenshtein distance where we'd want the
|
||||
value to be below the threshold. Those metrics can be transformed into
|
||||
a [0, 1] measure.
|
||||
|
||||
@param tokens1: normalized tokens of string 1 (list of strings only)
|
||||
@param tokens2: normalized tokens of string 2 (list of strings only)
|
||||
|
||||
@param idf: IDFIndex from geodata.statistics.tf_idf
|
||||
@param sim_func: similarity function which takes 2 strings and returns
|
||||
a number between 0 and 1
|
||||
@param theta: token-level threshold on sim_func's return value at
|
||||
which point two tokens are considered "close"
|
||||
|
||||
Reference:
|
||||
https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
|
||||
'''
|
||||
|
||||
token1_counts = ordered_word_count(tokens1)
|
||||
token2_counts = ordered_word_count(tokens2)
|
||||
|
||||
tfidf1 = idf.tfidf_vector(token1_counts)
|
||||
tfidf2 = idf.tfidf_vector(token2_counts)
|
||||
|
||||
total_sim = 0.0
|
||||
|
||||
t1_len = len(token1_counts)
|
||||
t2_len = len(token2_counts)
|
||||
|
||||
if t2_len < t1_len:
|
||||
token1_counts, token2_counts = token2_counts, token1_counts
|
||||
tfidf1, tfidf2 = tfidf2, tfidf1
|
||||
|
||||
for i, t1 in enumerate(token1_counts):
|
||||
sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
|
||||
if sim >= theta:
|
||||
total_sim += sim * tfidf1[i] * tfidf2[j]
|
||||
|
||||
return total_sim
|
||||
|
||||
|
||||
def jaccard_similarity(tokens1, tokens2):
|
||||
'''
|
||||
Traditionally Jaccard similarity is defined for two sets:
|
||||
|
||||
Jaccard(A, B) = |A ∩ B| / |A ∪ B|
|
||||
|
||||
Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
|
||||
would be 1.0, which is not ideal for entity name matching.
|
||||
|
||||
In this implementation the cardinality of the set intersections/unions
|
||||
are weighted by term frequencies so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
|
||||
'''
|
||||
token1_counts = ordered_word_count(tokens1)
|
||||
token2_counts = ordered_word_count(tokens2)
|
||||
|
||||
intersection = sum((min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts))
|
||||
return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
|
||||
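# Illustrative sketch (not part of the original file): the frequency-weighted
# Jaccard similarity from the docstring above, computed on toy token lists.
if __name__ == '__main__':
    # intersection = min(2, 1) + min(1, 1) = 2, union = 3 + 2 - 2 = 3
    print jaccard_similarity([u'a', u'a', u'b'], [u'a', u'b'])  # ~0.667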
0
scripts/geodata/neighborhoods/__init__.py
Normal file
622
scripts/geodata/neighborhoods/reverse_geocode.py
Normal file
@@ -0,0 +1,622 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import argparse
|
||||
import fnmatch
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
import re
|
||||
import six
|
||||
import subprocess
|
||||
import sys
|
||||
import yaml
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
from geodata.coordinates.conversion import latlon_to_decimal
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.file_utils import ensure_dir, download_file
|
||||
from geodata.i18n.unicode_properties import get_chars_by_script
|
||||
from geodata.i18n.word_breaks import ideographic_scripts
|
||||
from geodata.names.deduping import NameDeduper
|
||||
from geodata.osm.admin_boundaries import OSMNeighborhoodPolygonReader
|
||||
from geodata.osm.components import osm_address_components
|
||||
from geodata.osm.definitions import osm_definitions
|
||||
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
|
||||
from geodata.polygons.index import *
|
||||
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
|
||||
from geodata.statistics.tf_idf import IDFIndex
|
||||
|
||||
|
||||
class NeighborhoodDeduper(NameDeduper):
|
||||
# Lossless conversions only
|
||||
replacements = {
|
||||
u'saint': u'st',
|
||||
u'and': u'&',
|
||||
u'〇': u'0',
|
||||
u'一': u'1',
|
||||
u'二': u'2',
|
||||
u'三': u'3',
|
||||
u'四': u'4',
|
||||
u'五': u'5',
|
||||
u'六': u'6',
|
||||
u'七': u'7',
|
||||
u'八': u'8',
|
||||
u'九': u'9',
|
||||
u'十': u'10',
|
||||
}
|
||||
|
||||
discriminative_words = set([
|
||||
# Han numbers
|
||||
u'〇', u'一',
|
||||
u'二', u'三',
|
||||
u'四', u'五',
|
||||
u'六', u'七',
|
||||
u'八', u'九',
|
||||
u'十', u'百',
|
||||
u'千', u'万',
|
||||
u'億', u'兆',
|
||||
u'京', u'第',
|
||||
|
||||
# Roman numerals
|
||||
u'i', u'ii',
|
||||
u'iii', u'iv',
|
||||
u'v', u'vi',
|
||||
u'vii', u'viii',
|
||||
u'ix', u'x',
|
||||
u'xi', u'xii',
|
||||
u'xiii', u'xiv',
|
||||
u'xv', u'xvi',
|
||||
u'xvii', u'xviii',
|
||||
u'xix', u'xx',
|
||||
|
||||
# English directionals
|
||||
u'north', u'south',
|
||||
u'east', u'west',
|
||||
u'northeast', u'northwest',
|
||||
u'southeast', u'southwest',
|
||||
|
||||
# Spanish, Portuguese and Italian directionals
|
||||
u'norte', u'nord', u'sur', u'sul', u'sud',
|
||||
u'est', u'este', u'leste', u'oeste', u'ovest',
|
||||
|
||||
# New in various languages
|
||||
u'new',
|
||||
u'nova',
|
||||
u'novo',
|
||||
u'nuevo',
|
||||
u'nueva',
|
||||
u'nuovo',
|
||||
u'nuova',
|
||||
|
||||
# Qualifiers
|
||||
u'heights',
|
||||
u'hills',
|
||||
|
||||
u'upper', u'lower',
|
||||
u'little', u'great',
|
||||
|
||||
u'park',
|
||||
u'parque',
|
||||
|
||||
u'village',
|
||||
|
||||
])
|
||||
|
||||
stopwords = set([
|
||||
u'cp',
|
||||
u'de',
|
||||
u'la',
|
||||
u'urbanizacion',
|
||||
u'do',
|
||||
u'da',
|
||||
u'dos',
|
||||
u'del',
|
||||
u'community',
|
||||
u'bairro',
|
||||
u'barrio',
|
||||
u'le',
|
||||
u'el',
|
||||
u'mah',
|
||||
u'раион',
|
||||
u'vila',
|
||||
u'villa',
|
||||
u'kampung',
|
||||
u'ahupua`a',
|
||||
|
||||
])
|
||||
|
||||
|
||||
class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
|
||||
persistent_polygons = False
|
||||
cache_size = 0
|
||||
|
||||
SCRATCH_DIR = '/tmp'
|
||||
|
||||
# Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South"
|
||||
NEIGHBORHOODS_REPO = 'https://github.com/codeforamerica/click_that_hood'
|
||||
|
||||
config_path = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'neighborhoods', 'click_that_hood.yaml')
|
||||
|
||||
config = yaml.load(open(config_path))
|
||||
|
||||
@classmethod
|
||||
def clone_repo(cls, path):
|
||||
subprocess.check_call(['rm', '-rf', path])
|
||||
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
|
||||
|
||||
@classmethod
|
||||
def create_neighborhoods_index(cls):
|
||||
scratch_dir = cls.SCRATCH_DIR
|
||||
repo_path = os.path.join(scratch_dir, 'click_that_hood')
|
||||
cls.clone_repo(repo_path)
|
||||
|
||||
data_path = os.path.join(repo_path, 'public', 'data')
|
||||
|
||||
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
|
||||
ensure_dir(neighborhoods_dir)
|
||||
|
||||
index = cls(save_dir=neighborhoods_dir)
|
||||
|
||||
for c in cls.config['files']:
|
||||
filename = c['filename']
|
||||
component = c['component']
|
||||
|
||||
path = os.path.join(data_path, filename)
|
||||
features = json.load(open(path))['features']
|
||||
for f in features:
|
||||
f['properties']['component'] = component
|
||||
|
||||
try:
|
||||
index.add_geojson_like_file(features)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return index
|
||||
|
||||
|
||||
class OSMNeighborhoodReverseGeocoder(OSMReverseGeocoder):
|
||||
persistent_polygons = False
|
||||
cache_size = 10000
|
||||
simplify_polygons = False
|
||||
polygon_reader = OSMNeighborhoodPolygonReader
|
||||
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['postal_code'])
|
||||
|
||||
cache_size = 0
|
||||
|
||||
SCRATCH_DIR = '/tmp'
|
||||
|
||||
@classmethod
|
||||
def create_neighborhoods_index(cls, osm_neighborhoods_file):
|
||||
scratch_dir = cls.SCRATCH_DIR
|
||||
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
|
||||
ensure_dir(neighborhoods_dir)
|
||||
|
||||
return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir)
|
||||
|
||||
|
||||
class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
||||
'''
|
||||
Neighborhoods are very important in cities like NYC, SF, Chicago, London
|
||||
and many others. We want the address parser to be trained with addresses
|
||||
that sufficiently capture variations in address patterns, including
|
||||
neighborhoods. Quattroshapes neighborhood data (in the US at least)
|
||||
is not great in terms of names, mostly because GeoPlanet has so many
|
||||
incorrect names. The neighborhoods project, also known as ClickThatHood
|
||||
has very accurate polygons with correct names, but only for a handful
|
||||
of cities. OSM usually lists neighborhoods and some other local admin
|
||||
areas like boroughs as points rather than polygons.
|
||||
|
||||
This index merges all of the above data sets in prioritized order
|
||||
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
|
||||
tests for neighborhoods. The properties vary by source but each has
|
||||
source has least a "name" key which in practice is what we care about.
|
||||
|
||||
Quattroshapes data is no longer accessible and has been replaced by
|
||||
WhosOnFirst.
|
||||
'''
|
||||
|
||||
PRIORITIES_FILENAME = 'priorities.json'
|
||||
|
||||
DUPE_THRESHOLD = 0.9
|
||||
|
||||
persistent_polygons = True
|
||||
cache_size = 100000
|
||||
|
||||
source_priorities = {
|
||||
'osm': 0, # Best names/polygons, same coordinate system
|
||||
'osm_cth': 1, # Prefer the OSM names if possible
|
||||
'clickthathood': 2, # Better names/polygons than WhosOnFirst
|
||||
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
|
||||
'wof': 4, # Replacement of Quattroshapes
|
||||
}
|
||||
|
||||
level_priorities = {
|
||||
'neighborhood': 0,
|
||||
'local_admin': 1,
|
||||
}
|
||||
|
||||
regex_replacements = [
|
||||
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
|
||||
(re.compile('^paris-(?=[\d])', re.I), ''),
|
||||
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
|
||||
]
|
||||
|
||||
quattroshapes_city_district_patterns = [
|
||||
six.u('Praha [\d]+'),
|
||||
]
|
||||
|
||||
quattroshapes_city_district_regex = re.compile('|'.join([six.u('^\s*{}\s*$').format(p) for p in quattroshapes_city_district_patterns]), re.I | re.U)
|
||||
|
||||
@classmethod
|
||||
def count_words(cls, s):
|
||||
doc = defaultdict(int)
|
||||
for t, c in NeighborhoodDeduper.content_tokens(s):
|
||||
doc[t] += 1
|
||||
return doc
|
||||
|
||||
@classmethod
|
||||
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
|
||||
'''
|
||||
Given an OSM file (planet or some other bounds) containing neighborhoods
|
||||
as points (some suburbs have boundaries)
|
||||
|
||||
and their dependencies, create an R-tree index for coarse-grained
|
||||
reverse geocoding.
|
||||
|
||||
Note: the input file is expected to have been created using
|
||||
osmfilter. Use fetch_osm_address_data.sh for planet or copy the
|
||||
admin borders commands if using other geometries.
|
||||
'''
|
||||
index = cls(save_dir=output_dir)
|
||||
|
||||
logger = logging.getLogger('neighborhoods')
|
||||
|
||||
logger.info('Creating ClickThatHood neighborhoods')
|
||||
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
|
||||
|
||||
logger.info('Creating OSM neighborhoods')
|
||||
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
|
||||
|
||||
logger.info('Creating WhosOnFirst neighborhoods')
|
||||
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
|
||||
|
||||
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
|
||||
|
||||
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
|
||||
osm_admin_rtree.cache_size = 1000
|
||||
|
||||
logger.info('Creating IDF index')
|
||||
idf = IDFIndex()
|
||||
|
||||
char_scripts = get_chars_by_script()
|
||||
|
||||
for idx in (cth, wof, osmn):
|
||||
for i in xrange(idx.i):
|
||||
props = idx.get_properties(i)
|
||||
name = props.get('name')
|
||||
if name is not None:
|
||||
doc = cls.count_words(name)
|
||||
idf.update(doc)
|
||||
|
||||
for key, attrs, deps in parse_osm(filename):
|
||||
for k, v in six.iteritems(attrs):
|
||||
if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
|
||||
doc = cls.count_words(v)
|
||||
idf.update(doc)
|
||||
|
||||
for i in six.moves.xrange(osmn.i):
|
||||
props = osmn.get_properties(i)
|
||||
poly = osmn.get_polygon(i)
|
||||
|
||||
props['source'] = 'osm'
|
||||
props['component'] = AddressFormatter.SUBURB
|
||||
props['polygon_type'] = 'neighborhood'
|
||||
|
||||
index.index_polygon(poly.context)
|
||||
index.add_polygon(poly.context, props)
|
||||
|
||||
wof.matched = [False] * wof.i
|
||||
cth.matched = [False] * cth.i
|
||||
|
||||
logger.info('Matching OSM points to neighborhood polygons')
|
||||
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
|
||||
num_polys = 0
|
||||
for element_id, attrs, deps in parse_osm(filename):
|
||||
try:
|
||||
lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
osm_name = attrs.get('name')
|
||||
if not osm_name:
|
||||
continue
|
||||
|
||||
id_type, element_id = element_id.split(':')
|
||||
element_id = long(element_id)
|
||||
|
||||
props['type'] = id_type
|
||||
props['id'] = element_id
|
||||
|
||||
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
|
||||
is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
|
||||
|
||||
country, candidate_languages = country_rtree.country_and_languages(lat, lon)
|
||||
|
||||
component_name = None
|
||||
|
||||
component_name = osm_address_components.component_from_properties(country, attrs)
|
||||
|
||||
ranks = []
|
||||
osm_names = []
|
||||
|
||||
for key in OSM_NAME_TAGS:
|
||||
name = attrs.get(key)
|
||||
if name:
|
||||
osm_names.append(name)
|
||||
|
||||
for name_key in OSM_NAME_TAGS:
|
||||
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
|
||||
|
||||
for idx in (cth, wof):
|
||||
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
|
||||
|
||||
if candidates:
|
||||
max_sim = 0.0
|
||||
arg_max = None
|
||||
|
||||
normalized_wof_names = {}
|
||||
|
||||
for osm_name in osm_names:
|
||||
|
||||
contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
|
||||
for c in safe_decode(osm_name)))
|
||||
|
||||
for i in candidates:
|
||||
props = idx.get_properties(i)
|
||||
name = normalized_wof_names.get(i)
|
||||
if not name:
|
||||
name = props.get('name')
|
||||
if not name:
|
||||
continue
|
||||
for pattern, repl in cls.regex_replacements:
|
||||
name = pattern.sub(repl, name)
|
||||
normalized_wof_names[i] = name
|
||||
|
||||
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
|
||||
continue
|
||||
|
||||
if not contains_ideographs:
|
||||
sim = NeighborhoodDeduper.compare(osm_name, name, idf)
|
||||
else:
|
||||
# Many Han/Hangul characters are common, shouldn't use IDF
|
||||
sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
|
||||
|
||||
if sim > max_sim:
|
||||
max_sim = sim
|
||||
poly = idx.get_polygon(i)
|
||||
arg_max = (max_sim, props, poly.context, idx, i)
|
||||
|
||||
if arg_max:
|
||||
ranks.append(arg_max)
|
||||
|
||||
ranks.sort(key=operator.itemgetter(0), reverse=True)
|
||||
if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
|
||||
score, props, poly, idx, i = ranks[0]
|
||||
|
||||
existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
|
||||
existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)
|
||||
|
||||
skip_node = False
|
||||
|
||||
for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
|
||||
for poly_index, osm_props in enumerate(boundaries):
|
||||
containing_component = None
|
||||
name = osm_props.get('name')
|
||||
# Only exact name matches here since we're comparing OSM to OSM
|
||||
if name and name.lower() != attrs.get('name', '').lower():
|
||||
continue
|
||||
|
||||
if boundaries is existing_neighborhood_boundaries:
|
||||
containing_component = AddressFormatter.SUBURB
|
||||
skip_node = True
|
||||
break
|
||||
else:
|
||||
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]
|
||||
|
||||
containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)
|
||||
|
||||
if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
|
||||
skip_node = True
|
||||
break
|
||||
if skip_node:
|
||||
break
|
||||
|
||||
# Skip this element
|
||||
if skip_node:
|
||||
continue
|
||||
|
||||
if idx is cth:
|
||||
if props['component'] == AddressFormatter.SUBURB:
|
||||
attrs['polygon_type'] = 'neighborhood'
|
||||
elif props['component'] == AddressFormatter.CITY_DISTRICT:
|
||||
attrs['polygon_type'] = 'local_admin'
|
||||
else:
|
||||
continue
|
||||
source = 'osm_cth'
|
||||
else:
|
||||
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
|
||||
|
||||
source = 'osm_wof'
|
||||
if level == 'neighborhood':
|
||||
attrs['polygon_type'] = 'neighborhood'
|
||||
else:
|
||||
attrs['polygon_type'] = 'local_admin'
|
||||
|
||||
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
|
||||
component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
|
||||
attrs['component'] = component
|
||||
|
||||
attrs['source'] = source
|
||||
index.index_polygon(poly)
|
||||
index.add_polygon(poly, attrs)
|
||||
idx.matched[i] = True
|
||||
|
||||
num_polys += 1
|
||||
if num_polys % 1000 == 0 and num_polys > 0:
|
||||
logger.info('did {} neighborhoods'.format(num_polys))
|
||||
|
||||
for idx, source in ((cth, 'clickthathood'), (wof, 'wof')):
|
||||
for i in xrange(idx.i):
|
||||
props = idx.get_properties(i)
|
||||
poly = idx.get_polygon(i)
|
||||
if idx.matched[i]:
|
||||
continue
|
||||
props['source'] = source
|
||||
if idx is cth:
|
||||
component = props['component']
|
||||
if component == AddressFormatter.SUBURB:
|
||||
props['polygon_type'] = 'neighborhood'
|
||||
elif component == AddressFormatter.CITY_DISTRICT:
|
||||
props['polygon_type'] = 'local_admin'
|
||||
else:
|
||||
continue
|
||||
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
|
||||
component = AddressFormatter.SUBURB
|
||||
name = props.get('name')
|
||||
if not name:
|
||||
continue
|
||||
for pattern, repl in cls.regex_replacements:
|
||||
name = pattern.sub(repl, name)
|
||||
|
||||
props['name'] = name
|
||||
|
||||
if cls.quattroshapes_city_district_regex.match(name):
|
||||
component = AddressFormatter.CITY_DISTRICT
|
||||
|
||||
props['component'] = component
|
||||
props['polygon_type'] = 'neighborhood'
|
||||
else:
|
||||
# We don't actually care about local admin polygons unless they match OSM
|
||||
continue
|
||||
index.index_polygon(poly.context)
|
||||
index.add_polygon(poly.context, props)
|
||||
|
||||
return index
|
||||
|
||||
def setup(self):
|
||||
self.priorities = []
|
||||
|
||||
def index_polygon_properties(self, properties):
|
||||
self.priorities.append((self.level_priorities[properties['polygon_type']], self.source_priorities[properties['source']]))
|
||||
|
||||
def load_polygon_properties(self, d):
|
||||
self.priorities = [tuple(p) for p in json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))]
|
||||
|
||||
def save_polygon_properties(self, d):
|
||||
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
|
||||
|
||||
def priority(self, i):
|
||||
return self.priorities[i]
|
||||
|
||||
def get_candidate_polygons(self, lat, lon):
|
||||
candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
|
||||
return sorted(candidates, key=self.priority)
|
||||
|
||||
|
||||
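# Illustrative usage sketch (not part of the original file), assuming the
# RTreePolygonIndex base class exposes the same load/save interface used for
# the other indices in this script; the directory and coordinates are
# assumptions:
#
#   geocoder = NeighborhoodReverseGeocoder.load('/data/neighborhoods')
#   candidates = geocoder.get_candidate_polygons(40.6782, -73.9442)
#   # candidates come back ordered by the (polygon_type, source) priorities above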
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
|
||||
persistent_polygons = False
|
||||
cache_size = None
|
||||
|
||||
NAME = "wof:name"
|
||||
ASCII_NAME = "gn:asciiname"
|
||||
LEVEL = "wof:placetype"
|
||||
GEONAMES_ID = "gn:geonameid"
|
||||
SUPERSEDED = "wof:superseded_by"
|
||||
|
||||
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
|
||||
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
|
||||
|
||||
@classmethod
|
||||
def is_valid_neighbourhood(cls, geojson):
|
||||
validity = not geojson["properties"].get(cls.SUPERSEDED)
|
||||
for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
|
||||
validity = validity and bool(geojson["properties"].get(field))
|
||||
return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
|
||||
|
||||
@classmethod
|
||||
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
|
||||
index = cls(save_dir=output_dir, index_filename=index_filename)
|
||||
|
||||
for root, dirnames, filenames in os.walk(wof_dir):
|
||||
for fname in fnmatch.filter(filenames, "*.geojson"):
|
||||
with open(os.path.join(root, fname)) as f:
|
||||
geojson = json.load(f)
|
||||
if cls.is_valid_neighbourhood(geojson):
|
||||
properties = {
|
||||
"name": safe_decode(geojson["properties"].get(cls.NAME)),
|
||||
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
|
||||
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
|
||||
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
|
||||
}
|
||||
|
||||
poly_type = geojson['geometry']['type']
|
||||
if poly_type == 'Polygon':
|
||||
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
|
||||
index.index_polygon(poly)
|
||||
poly = index.simplify_polygon(poly)
|
||||
index.add_polygon(poly, dict(geojson['properties']))
|
||||
elif poly_type == 'MultiPolygon':
|
||||
polys = []
|
||||
for coords in geojson['geometry']['coordinates']:
|
||||
poly = cls.to_polygon(coords[0])
|
||||
polys.append(poly)
|
||||
index.index_polygon(poly)
|
||||
|
||||
multi_poly = index.simplify_polygon(MultiPolygon(polys))
|
||||
index.add_polygon(multi_poly, dict(geojson['properties']))
|
||||
|
||||
return index
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Handle argument parsing here
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-w', '--wof-dir',
|
||||
help='Path to WhosOnFirst dir')
|
||||
|
||||
parser.add_argument('-a', '--osm-admin-rtree-dir',
|
||||
help='Path to OSM admin rtree dir')
|
||||
|
||||
parser.add_argument('-c', '--country-rtree-dir',
|
||||
help='Path to country rtree dir')
|
||||
|
||||
parser.add_argument('-b', '--osm-neighborhood-borders-file',
|
||||
help='Path to OSM neighborhood borders file (with dependencies, .osm format)')
|
||||
|
||||
parser.add_argument('-n', '--osm-neighborhoods-file',
|
||||
help='Path to OSM neighborhoods file (no dependencies, .osm format)')
|
||||
|
||||
parser.add_argument('-o', '--out-dir',
|
||||
default=os.getcwd(),
|
||||
help='Output directory')
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
|
||||
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
|
||||
args.osm_neighborhoods_file,
|
||||
args.wof_dir,
|
||||
args.country_rtree_dir,
|
||||
args.osm_admin_rtree_dir,
|
||||
args.osm_neighborhood_borders_file,
|
||||
args.out_dir
|
||||
)
|
||||
else:
|
||||
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
|
||||
|
||||
index.save()
|
||||
0
scripts/geodata/numbers/__init__.py
Normal file
219
scripts/geodata/numbers/numex.py
Normal file
@@ -0,0 +1,219 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_encode
|
||||
from geodata.i18n.unicode_paths import DATA_DIR
|
||||
|
||||
|
||||
class InvalidNumexRuleException(Exception):
|
||||
pass
|
||||
|
||||
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'numex')
|
||||
|
||||
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
|
||||
|
||||
GENDER_MASCULINE = 'GENDER_MASCULINE'
|
||||
GENDER_FEMININE = 'GENDER_FEMININE'
|
||||
GENDER_NEUTER = 'GENDER_NEUTER'
|
||||
GENDER_NONE = 'GENDER_NONE'
|
||||
|
||||
gender_map = {
|
||||
'm': GENDER_MASCULINE,
|
||||
'f': GENDER_FEMININE,
|
||||
'n': GENDER_NEUTER,
|
||||
None: GENDER_NONE,
|
||||
}
|
||||
|
||||
|
||||
CATEGORY_PLURAL = 'CATEGORY_PLURAL'
|
||||
CATEGORY_DEFAULT = 'CATEGORY_DEFAULT'
|
||||
|
||||
valid_numex_keys = set(['name', 'value', 'type', 'left', 'right', 'gender', 'category', 'radix',
|
||||
'multiply_gte', 'exact_multiple_only', 'left_separator', 'right_separator'])
|
||||
|
||||
valid_ordinal_keys = set(['suffixes', 'gender', 'category'])
|
||||
|
||||
|
||||
category_map = {
|
||||
'plural': CATEGORY_PLURAL,
|
||||
None: CATEGORY_DEFAULT
|
||||
}
|
||||
|
||||
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
|
||||
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
|
||||
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
|
||||
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
|
||||
|
||||
left_context_map = {
|
||||
'add': LEFT_CONTEXT_ADD,
|
||||
'multiply': LEFT_CONTEXT_MULTIPLY,
|
||||
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
|
||||
None: LEFT_CONTEXT_NONE,
|
||||
}
|
||||
|
||||
RIGHT_CONTEXT_MULTIPLY = 'NUMEX_RIGHT_CONTEXT_MULTIPLY'
|
||||
RIGHT_CONTEXT_ADD = 'NUMEX_RIGHT_CONTEXT_ADD'
|
||||
RIGHT_CONTEXT_NONE = 'NUMEX_RIGHT_CONTEXT_NONE'
|
||||
|
||||
right_context_map = {
|
||||
'add': RIGHT_CONTEXT_ADD,
|
||||
'multiply': RIGHT_CONTEXT_MULTIPLY,
|
||||
None: RIGHT_CONTEXT_NONE,
|
||||
}
|
||||
|
||||
CARDINAL = 'NUMEX_CARDINAL_RULE'
|
||||
ORDINAL = 'NUMEX_ORDINAL_RULE'
|
||||
ORDINAL_INDICATOR = 'NUMEX_ORDINAL_INDICATOR_RULE'
|
||||
|
||||
rule_type_map = {
|
||||
'cardinal': CARDINAL,
|
||||
'ordinal': ORDINAL,
|
||||
'ordinal_indicator': ORDINAL_INDICATOR,
|
||||
}
|
||||
|
||||
numex_key_template = u'"{key}"'
|
||||
numex_rule_template = u'{{{left_context_type}, {right_context_type}, {rule_type}, {gender}, {category}, {radix}, {value}LL}}'
|
||||
|
||||
stopword_rule = u'NUMEX_STOPWORD_RULE'
|
||||
|
||||
ordinal_indicator_template = u'{{"{key}", {gender}, {category}, "{value}"}}'
|
||||
|
||||
stopwords_template = u'"{word}"'
|
||||
|
||||
language_template = u'{{"{language}", {whole_words_only}, {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
|
||||
|
||||
numex_rules_data_template = u'''
|
||||
char *numex_keys[] = {{
|
||||
{numex_keys}
|
||||
}};
|
||||
|
||||
numex_rule_t numex_rules[] = {{
|
||||
{numex_rules}
|
||||
}};
|
||||
|
||||
ordinal_indicator_t ordinal_indicator_rules[] = {{
|
||||
{ordinal_indicator_rules}
|
||||
}};
|
||||
|
||||
numex_language_source_t numex_languages[] = {{
|
||||
{languages}
|
||||
}};
|
||||
'''
|
||||
|
||||
|
||||
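# Illustrative sketch (not part of the original file) of the YAML input this
# parser expects; the rule values below are assumptions for demonstration only:
#
#   whole_words_only: true
#   rules:
#     - name: twenty
#       type: cardinal
#       value: 20
#   ordinal_indicators:
#     - gender: null
#       suffixes:
#         "1": ["st"]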
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
    all_keys = []
    all_rules = []

    all_ordinal_indicators = []
    all_stopwords = []

    all_languages = []

    out = open(outfile, 'w')

    for filename in os.listdir(dirname):
        path = os.path.join(dirname, filename)
        if not os.path.isfile(path) or not filename.endswith('.yaml'):
            continue

        language = filename.split('.yaml', 1)[0]

        data = yaml.load(open(path))

        whole_words_only = data.get('whole_words_only', False)

        rules = data.get('rules', [])
        rule_index = len(all_rules)

        for rule in rules:
            invalid_keys = set(rule.keys()) - valid_numex_keys
            if invalid_keys:
                raise InvalidNumexRuleException(u'Invalid keys: ({}) for language {}, rule: {}'.format(u','.join(invalid_keys), language, rule))
            gender = gender_map[rule.get('gender')]
            rule_type = rule_type_map[rule['type']]
            key = rule['name']
            value = rule['value']
            radix = rule.get('radix', 10)
            rule_category = rule.get('category')
            category = category_map.get(rule_category)
            if category is None:
                continue
            left_context_type = left_context_map[rule.get('left')]
            right_context_type = right_context_map[rule.get('right')]
            all_keys.append(unicode(numex_key_template.format(key=key)))
            all_rules.append(unicode(numex_rule_template.format(
                language=language,
                rule_type=rule_type,
                gender=gender,
                category=category,
                left_context_type=left_context_type,
                right_context_type=right_context_type,
                value=value,
                radix=radix
            )))

        ordinal_indicator_index = len(all_ordinal_indicators)
        ordinal_indicators = data.get('ordinal_indicators', [])
        num_ordinal_indicators = 0

        for rule in ordinal_indicators:
            gender = gender_map[rule.get('gender')]
            category = category_map[rule.get('category')]
            invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
            if invalid_ordinal_keys:
                raise InvalidNumexRuleException(u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'.format(u','.join(invalid_ordinal_keys), language, rule))

            for key, suffixes in rule['suffixes'].iteritems():
                for suffix in suffixes:
                    all_ordinal_indicators.append(unicode(ordinal_indicator_template.format(
                        key=key,
                        value=suffix,
                        gender=gender,
                        category=category
                    )))
                num_ordinal_indicators += len(suffixes)

        stopwords = data.get('stopwords', [])
        stopword_index = len(all_stopwords)
        num_stopwords = len(stopwords)

        for stopword in stopwords:
            all_keys.append(numex_key_template.format(key=unicode(stopword)))
            all_rules.append(stopword_rule)

        num_rules = len(rules) + len(stopwords)

        all_languages.append(unicode(language_template.format(
            language=language,
            whole_words_only=int(whole_words_only),
            rule_index=rule_index,
            num_rules=num_rules,
            ordinal_indicator_index=ordinal_indicator_index,
            num_ordinal_indicators=num_ordinal_indicators
        )))

    out.write(safe_encode(numex_rules_data_template.format(
        numex_keys=u''',
    '''.join(all_keys),
        numex_rules=u''',
    '''.join(all_rules),
        ordinal_indicator_rules=u''',
    '''.join(all_ordinal_indicators),
        stopwords=u''',
    '''.join(all_stopwords),
        languages=u''',
    '''.join(all_languages),
    )))

    out.close()


if __name__ == '__main__':
    parse_numex_rules(*sys.argv[1:])
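# Illustrative sketch (not part of the original file): how a single YAML rule maps
# onto one generated numex_rule_t line via numex_rule_template above. gender_map is
# defined elsewhere in this module, so the gender constant string used here is an
# assumption; the result is one line of the generated numex_rules[] array, e.g.
# u'{NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, ..., 10, 20LL}'.
def _example_rule_line():
    rule = {'name': 'twenty', 'type': 'cardinal', 'value': 20, 'right': 'add'}
    return numex_rule_template.format(
        left_context_type=left_context_map[rule.get('left')],
        right_context_type=right_context_map[rule.get('right')],
        rule_type=rule_type_map[rule['type']],
        gender='GENDER_NONE',  # assumed constant name, normally taken from gender_map
        category=category_map[rule.get('category')],
        radix=rule.get('radix', 10),
        value=rule['value'])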
108
scripts/geodata/numbers/ordinals.py
Normal file
@@ -0,0 +1,108 @@
import bisect
import math
import os
import operator
import random
import six
import sys
import yaml

from collections import defaultdict
from marisa_trie import BytesTrie

from geodata.text.phrases import PhraseFilter
from geodata.encoding import safe_encode, safe_decode
from geodata.i18n.unicode_paths import DATA_DIR

from geodata.numbers.numex import NUMEX_DATA_DIR


class OrdinalSuffixTrie(PhraseFilter):
    def __init__(self, ordinal_rules):
        self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
        if suffix_search:
            return suffix_search[0].split('|')
        else:
            return None


class OrdinalExpressions(object):
    def __init__(self, base_dir=NUMEX_DATA_DIR):
        self.cardinal_rules = {}
        self.cardinal_rules_ones = {}

        self.ordinal_rules = {}
        self.ordinal_suffix_rules = {}

        for filename in os.listdir(base_dir):
            if filename.endswith('.yaml'):
                lang = filename.split('.yaml')[0]
                f = open(os.path.join(base_dir, filename))
                data = yaml.load(f)

                rules = data.get('rules')
                if rules is not None and hasattr(rules, '__getslice__'):
                    cardinals = []
                    ordinals = defaultdict(list)
                    for rule in rules:
                        name = rule.get('name')
                        value = rule.get('value')
                        rule_type = rule.get('type')
                        if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
                            continue
                        gender = rule.get('gender', None)
                        category = rule.get('category', None)
                        if rule_type == 'ordinal':
                            ordinals[(value, gender, category)].append(name)
                        else:
                            cardinals.append(rule)
                            if value == 1:
                                self.cardinal_rules_ones[(lang, gender, category)] = name

                    self.cardinal_rules[lang] = cardinals
                    self.ordinal_rules[lang] = ordinals

                ordinal_indicators = data.get('ordinal_indicators')
                if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
                    for rule_set in ordinal_indicators:
                        gender = rule_set.get('gender', None)
                        category = rule_set.get('category', None)
                        self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])

    def get_suffixes(self, num, lang, gender=None, category=None):
        trie = self.ordinal_suffix_rules.get((lang, gender, category))
        if not trie:
            return None

        return trie.search_suffix(str(num))

    def get_suffix(self, num, lang, gender=None, category=None):
        suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
        if not suffixes:
            return None
        return random.choice(suffixes)

    def suffixed_number(self, num, lang, gender=None, category=None):
        suffix = self.get_suffix(num, lang, gender=gender, category=category)
        if not suffix:
            return None
        return six.u('{}{}').format(safe_decode(num), safe_decode(suffix))


ordinal_expressions = OrdinalExpressions()
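# Illustrative usage sketch (not part of the original file). Return values depend
# entirely on the per-language YAML data under NUMEX_DATA_DIR; 'en' and the suffixes
# shown in the comments are assumptions about what an English rule file might define.
def _example_ordinal_usage():
    suffix = ordinal_expressions.get_suffix(2, 'en')        # e.g. u'nd', if defined for English
    return ordinal_expressions.suffixed_number(3, 'en')     # e.g. u'3rd', if defined for English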
449
scripts/geodata/numbers/spellout.py
Normal file
@@ -0,0 +1,449 @@
|
||||
import bisect
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from geodata.numbers.numex import NUMEX_DATA_DIR
|
||||
|
||||
|
||||
class NumericExpressions(object):
|
||||
default_separator = ' '
|
||||
|
||||
def __init__(self, base_dir=NUMEX_DATA_DIR):
|
||||
self.cardinal_rules = {}
|
||||
self.cardinal_rules_sorted = {}
|
||||
self.cardinal_rules_ones = defaultdict(dict)
|
||||
self.cardinal_rules_ones_sorted = {}
|
||||
|
||||
self.default_separators = {}
|
||||
|
||||
self.ordinal_rules = {}
|
||||
self.ordinal_suffix_rules = {}
|
||||
|
||||
for filename in os.listdir(base_dir):
|
||||
if filename.endswith('.yaml'):
|
||||
lang = filename.split('.yaml')[0]
|
||||
f = open(os.path.join(base_dir, filename))
|
||||
data = yaml.load(f)
|
||||
|
||||
default_separator = data.get('default_separator')
|
||||
if default_separator is not None:
|
||||
self.default_separators[lang] = default_separator
|
||||
|
||||
rules = data.get('rules')
|
||||
if rules is not None and hasattr(rules, '__getslice__'):
|
||||
cardinals = defaultdict(list)
|
||||
ordinals = defaultdict(list)
|
||||
for rule in rules:
|
||||
name = rule.get('name')
|
||||
value = rule.get('value')
|
||||
rule_type = rule.get('type')
|
||||
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
|
||||
continue
|
||||
gender = rule.get('gender', None)
|
||||
category = rule.get('category', None)
|
||||
if rule_type == 'ordinal':
|
||||
ordinals[(value, gender, category)].append(rule)
|
||||
else:
|
||||
cardinals[(value, gender, category)].append(rule)
|
||||
if value == 1 and 'multiply_gte' in rule:
|
||||
self.cardinal_rules_ones[lang][rule['multiply_gte']] = rule
|
||||
|
||||
self.cardinal_rules[lang] = cardinals
|
||||
self.ordinal_rules[lang] = ordinals
|
||||
|
||||
self.cardinal_rules_sorted[lang] = sorted(set([v for v, g, c in cardinals]))
|
||||
self.cardinal_rules_ones_sorted[lang] = sorted(self.cardinal_rules_ones[lang].keys())
|
||||
|
||||
self.cardinal_rules_ones = dict(self.cardinal_rules_ones)
|
||||
|
||||
def spellout_cardinal(self, num, lang, gender=None, category=None, random_choice_cardinals=False):
|
||||
num = int(num)
|
||||
remainder = 0
|
||||
|
||||
if lang not in self.cardinal_rules:
|
||||
return None
|
||||
|
||||
rules = self.cardinal_rules.get(lang)
|
||||
cardinals = self.cardinal_rules_sorted.get(lang)
|
||||
if not rules or not cardinals:
|
||||
return None
|
||||
|
||||
default_separator = self.default_separators.get(lang, self.default_separator)
|
||||
|
||||
if num == 0:
|
||||
cardinal = rules.get((num, gender, category))
|
||||
if cardinal:
|
||||
if not random_choice_cardinals:
|
||||
cardinal = cardinal[0]
|
||||
else:
|
||||
cardinal = random.choice(cardinal)
|
||||
return cardinal['name']
|
||||
else:
|
||||
return None
|
||||
|
||||
cardinal_part = []
|
||||
|
||||
last_rule = {}
|
||||
left_multiply_rules = []
|
||||
|
||||
while num:
|
||||
i = bisect.bisect_left(cardinals, num)
|
||||
if i > len(cardinals) - 1:
|
||||
return None
|
||||
if i > 0 and cardinals[i] > num:
|
||||
val = cardinals[i - 1]
|
||||
else:
|
||||
val = cardinals[i]
|
||||
|
||||
multiple = num // val
|
||||
|
||||
if val == num:
|
||||
cardinal = rules.get((num, gender, category))
|
||||
else:
|
||||
cardinal = rules.get((val, None, None), [])
|
||||
|
||||
multiple_rule = None
|
||||
|
||||
if multiple > 1:
|
||||
multiple_val = rules.get((multiple, None, None))
|
||||
if multiple_val:
|
||||
if not random_choice_cardinals:
|
||||
multiple_rule = multiple_val[0]
|
||||
else:
|
||||
multiple_rule = random.choice(multiple_val)
|
||||
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
|
||||
ones_rules = self.cardinal_rules_ones_sorted[lang]
|
||||
j = bisect.bisect_right(ones_rules, val)
|
||||
if j > 0 and ones_rules[j - 1] <= num:
|
||||
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
|
||||
|
||||
use_multiple = multiple > 1
|
||||
|
||||
is_left_multiply = False
|
||||
did_left_multiply = False
|
||||
|
||||
if not use_multiple:
|
||||
rule = None
|
||||
if cardinal and not random_choice_cardinals:
|
||||
rule = cardinal[0]
|
||||
elif cardinal:
|
||||
rule = random.choice(cardinal)
|
||||
else:
|
||||
for rule in cardinal:
|
||||
left_multiply = rule.get('left') == 'multiply'
|
||||
if left_multiply:
|
||||
if not multiple_rule:
|
||||
left_multiply_rules.append(rule)
|
||||
is_left_multiply = True
|
||||
last_rule = rule
|
||||
rule = None
|
||||
break
|
||||
else:
|
||||
rule = None
|
||||
|
||||
if rule is not None:
|
||||
left_add = last_rule.get('left') == 'add'
|
||||
right_add = last_rule.get('right') == 'add'
|
||||
|
||||
if multiple_rule:
|
||||
if right_add and cardinal_part:
|
||||
cardinal_part.append(last_rule.get('left_separator', default_separator))
|
||||
cardinal_part.append(multiple_rule['name'])
|
||||
cardinal_part.append(rule.get('left_separator', default_separator))
|
||||
|
||||
if right_add:
|
||||
if not multiple_rule and cardinal_part:
|
||||
right_separator = last_rule.get('right_separator', default_separator)
|
||||
cardinal_part.append(right_separator)
|
||||
cardinal_part.append(rule['name'])
|
||||
elif left_add and cardinal_part:
|
||||
last = cardinal_part.pop()
|
||||
cardinal_part.append(rule['name'])
|
||||
left_separator = last_rule.get('left_separator', default_separator)
|
||||
cardinal_part.append(left_separator)
|
||||
cardinal_part.append(last)
|
||||
elif not left_add and not right_add:
|
||||
cardinal_part.append(rule['name'])
|
||||
|
||||
last_rule = rule
|
||||
|
||||
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
|
||||
left_multiply_rule = left_multiply_rules.pop()
|
||||
left_separator = left_multiply_rule.get('left_separator', default_separator)
|
||||
cardinal_part.append(left_separator)
|
||||
cardinal_part.append(left_multiply_rule['name'])
|
||||
did_left_multiply = True
|
||||
last_rule = left_multiply_rule
|
||||
|
||||
if not is_left_multiply and not did_left_multiply:
|
||||
num -= (multiple * val)
|
||||
elif not did_left_multiply:
|
||||
remainder = num % val
|
||||
num /= val
|
||||
else:
|
||||
num = remainder
|
||||
did_left_multiply = False
|
||||
|
||||
return six.u('').join(cardinal_part)
|
||||
|
||||
def roman_numeral(self, num):
|
||||
numeral = self.spellout_cardinal(num, 'la')
|
||||
if numeral is None:
|
||||
return None
|
||||
return numeral.upper()
|
||||
|
||||
def spellout_ordinal(self, num, lang, gender=None, category=None,
|
||||
random_choice_cardinals=False, random_choice_ordinals=False):
|
||||
num = int(num)
|
||||
remainder = 0
|
||||
|
||||
if lang not in self.cardinal_rules:
|
||||
return None
|
||||
|
||||
rules = self.ordinal_rules.get(lang)
|
||||
cardinal_rules = self.cardinal_rules.get(lang)
|
||||
cardinals = self.cardinal_rules_sorted.get(lang)
|
||||
if not rules or not cardinal_rules or not cardinals:
|
||||
return None
|
||||
|
||||
default_separator = self.default_separators.get(lang, self.default_separator)
|
||||
|
||||
expression = []
|
||||
|
||||
last_rule = {}
|
||||
left_multiply_rules = []
|
||||
|
||||
if num == 0 or (num, gender, category) in rules:
|
||||
ordinals = rules.get((num, gender, category))
|
||||
if ordinals:
|
||||
if not random_choice_ordinals:
|
||||
ordinal = ordinals[0]
|
||||
else:
|
||||
ordinal = random.choice(ordinals)
|
||||
return ordinal['name']
|
||||
else:
|
||||
return None
|
||||
|
||||
while num:
|
||||
i = bisect.bisect_left(cardinals, num)
|
||||
if i > len(cardinals) - 1:
|
||||
return None
|
||||
if i > 0 and cardinals[i] > num:
|
||||
val = cardinals[i - 1]
|
||||
else:
|
||||
val = cardinals[i]
|
||||
|
||||
if val == num and not remainder:
|
||||
if last_rule.get('right') == 'add':
|
||||
ordinals = rules.get((num, gender, category))
|
||||
if ordinals:
|
||||
if not random_choice_ordinals:
|
||||
ordinal = ordinals[0]
|
||||
else:
|
||||
ordinal = random.choice(ordinals)
|
||||
right_separator = last_rule.get('right_separator', default_separator)
|
||||
|
||||
return right_separator.join([six.u('').join(expression), ordinal['name']])
|
||||
else:
|
||||
return None
|
||||
elif last_rule.get('left') == 'add':
|
||||
last_num = last_rule['value']
|
||||
ordinals = rules.get((last_num, gender, category))
|
||||
if ordinals:
|
||||
if not random_choice_ordinals:
|
||||
ordinal = ordinals[0]
|
||||
else:
|
||||
ordinal = random.choice(ordinals)
|
||||
|
||||
last_rule = ordinal
|
||||
expression.pop()
|
||||
cardinals = cardinal_rules.get((num, None, None))
|
||||
if cardinals:
|
||||
if not random_choice_cardinals:
|
||||
rule = cardinals[0]
|
||||
else:
|
||||
rule = random.choice(cardinals)
|
||||
expression.append(rule['name'])
|
||||
else:
|
||||
return None
|
||||
last = ordinal['name']
|
||||
left_separator = last_rule.get('left_separator', default_separator)
|
||||
return left_separator.join([six.u('').join(expression), ordinal['name']])
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
ordinal = rules.get((val, None, None), [])
|
||||
cardinal = cardinal_rules.get((val, None, None), [])
|
||||
|
||||
multiple = num // val
|
||||
|
||||
multiple_rule = None
|
||||
|
||||
if multiple > 1:
|
||||
multiple_val = cardinal_rules.get((multiple, None, None))
|
||||
if multiple_val:
|
||||
if not random_choice_cardinals:
|
||||
multiple_rule = multiple_val[0]
|
||||
else:
|
||||
multiple_rule = random.choice(multiple_val)
|
||||
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
|
||||
ones_rules = self.cardinal_rules_ones_sorted[lang]
|
||||
j = bisect.bisect_right(ones_rules, val)
|
||||
if j > 0 and ones_rules[j - 1] <= num:
|
||||
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
|
||||
|
||||
use_multiple = multiple > 1
|
||||
|
||||
is_left_multiply = False
|
||||
did_left_multiply = False
|
||||
|
||||
if not use_multiple:
|
||||
rule = None
|
||||
if ordinal and not remainder:
|
||||
for rule in ordinal:
|
||||
if rule.get('right') == 'add':
|
||||
break
|
||||
else:
|
||||
rule = None
|
||||
|
||||
if not rule and cardinal and not random_choice_cardinals:
|
||||
rule = cardinal[0]
|
||||
elif not rule and cardinal:
|
||||
rule = random.choice(cardinal)
|
||||
else:
|
||||
rule = None
|
||||
have_ordinal = False
|
||||
if ordinal:
|
||||
for rule in ordinal:
|
||||
left_multiply = rule.get('left') == 'multiply'
|
||||
if left_multiply and rule.get('right') == 'add':
|
||||
if not multiple_rule:
|
||||
left_multiply_rules.append(rule)
|
||||
is_left_multiply = True
|
||||
last_rule = rule
|
||||
rule = None
|
||||
have_ordinal = True
|
||||
break
|
||||
else:
|
||||
rule = None
|
||||
|
||||
if not have_ordinal:
|
||||
for rule in cardinal:
|
||||
left_multiply = rule.get('left') == 'multiply'
|
||||
if left_multiply:
|
||||
if not multiple_rule:
|
||||
left_multiply_rules.append(rule)
|
||||
is_left_multiply = True
|
||||
last_rule = rule
|
||||
rule = None
|
||||
break
|
||||
else:
|
||||
rule = None
|
||||
|
||||
if rule is not None:
|
||||
left_add = last_rule.get('left') == 'add'
|
||||
right_add = last_rule.get('right') == 'add'
|
||||
|
||||
if multiple_rule:
|
||||
if right_add and expression:
|
||||
expression.append(last_rule.get('left_separator', default_separator))
|
||||
expression.append(multiple_rule['name'])
|
||||
expression.append(rule.get('left_separator', default_separator))
|
||||
|
||||
if right_add:
|
||||
if not multiple_rule and expression:
|
||||
right_separator = last_rule.get('right_separator', default_separator)
|
||||
expression.append(right_separator)
|
||||
expression.append(rule['name'])
|
||||
elif left_add and expression:
|
||||
last = expression.pop()
|
||||
expression.append(rule['name'])
|
||||
left_separator = last_rule.get('left_separator', default_separator)
|
||||
expression.append(left_separator)
|
||||
expression.append(last)
|
||||
elif not left_add and not right_add:
|
||||
expression.append(rule['name'])
|
||||
|
||||
last_rule = rule
|
||||
|
||||
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
|
||||
left_multiply_rule = left_multiply_rules.pop()
|
||||
|
||||
left_separator = left_multiply_rule.get('left_separator', default_separator)
|
||||
expression.append(left_separator)
|
||||
expression.append(left_multiply_rule['name'])
|
||||
did_left_multiply = True
|
||||
last_rule = left_multiply_rule
|
||||
|
||||
if not is_left_multiply and not did_left_multiply:
|
||||
num -= (multiple * val)
|
||||
elif not did_left_multiply:
|
||||
remainder = num % val
|
||||
num /= val
|
||||
else:
|
||||
num = remainder
|
||||
remainder = 0
|
||||
did_left_multiply = False
|
||||
|
||||
def spellout_cardinal_hundreds(self, num, lang, gender=None, category=None, splitter=six.u(' ')):
|
||||
if num % 100 >= 10:
|
||||
first_hundred = self.spellout_cardinal(num % 100, lang, gender=gender, category=category)
|
||||
elif num % 100 == 0:
|
||||
rules = self.cardinal_rules.get(lang)
|
||||
if not rules:
|
||||
return None
|
||||
|
||||
cardinals = rules.get((100, gender, category))
|
||||
if not cardinals:
|
||||
return None
|
||||
|
||||
for rule in cardinals:
|
||||
if rule.get('left') == 'multiply' and not rule.get('exact_multiple_only'):
|
||||
break
|
||||
else:
|
||||
rule = None
|
||||
|
||||
if not rule:
|
||||
return None
|
||||
|
||||
first_hundred = rule['name']
|
||||
else:
|
||||
rules = self.cardinal_rules.get(lang)
|
||||
if not rules:
|
||||
return None
|
||||
|
||||
tens_place = num % 10
|
||||
zero_rules = rules.get((0, gender, category))
|
||||
if not zero_rules:
|
||||
return None
|
||||
|
||||
tens_place_rules = rules.get((tens_place, gender, category))
|
||||
if not tens_place_rules:
|
||||
return None
|
||||
|
||||
zero_rule = random.choice(zero_rules)
|
||||
tens_rule = random.choice(tens_place_rules)
|
||||
|
||||
first_hundred = splitter.join([zero_rule['name'], tens_rule['name']])
|
||||
|
||||
if not first_hundred:
|
||||
return None
|
||||
|
||||
parts = [first_hundred]
|
||||
|
||||
for i in xrange(1, int(math.ceil(math.log(num, 100)))):
|
||||
part = self.spellout_cardinal(num / 100 ** i, lang, gender=gender, category=category)
|
||||
if not part:
|
||||
return None
|
||||
parts.append(part)
|
||||
return splitter.join(reversed(parts))
|
||||
|
||||
|
||||
numeric_expressions = NumericExpressions()
|
||||
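# Illustrative usage sketch (not part of the original file). The spelled-out forms
# depend entirely on the per-language YAML rules; 'en', 'la' and the outputs shown in
# the comments are assumptions.
def _example_spellout_usage():
    cardinal = numeric_expressions.spellout_cardinal(21, 'en')   # e.g. u'twenty-one'
    ordinal = numeric_expressions.spellout_ordinal(21, 'en')     # e.g. u'twenty-first'
    roman = numeric_expressions.roman_numeral(14)                # e.g. u'XIV', via the Latin ('la') rules
    return cardinal, ordinal, roman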
0
scripts/geodata/openaddresses/__init__.py
Normal file
33
scripts/geodata/openaddresses/config.py
Normal file
@@ -0,0 +1,33 @@
import os
import six
import yaml

this_dir = os.path.realpath(os.path.dirname(__file__))

OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                                'resources', 'parser', 'data_sets', 'openaddresses.yaml')


class OpenAddressesConfig(object):
    def __init__(self, path=OPENADDRESSES_PARSER_DATA_CONFIG):
        self.path = path

        config = yaml.load(open(path))
        self.config = config['global']
        self.country_configs = config['countries']

    @property
    def sources(self):
        for country, config in six.iteritems(self.country_configs):
            for file_config in config.get('files', []):
                filename = file_config['filename'].rsplit('.', 1)[0]

                yield country, filename

            for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
                for file_config in subdir_config.get('files', []):
                    filename = file_config['filename'].rsplit('.', 1)[0]

                    yield country, subdir, filename


openaddresses_config = OpenAddressesConfig()
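# Illustrative usage sketch (not part of the original file). Each yielded tuple mirrors
# the layout of openaddresses.yaml: (country, filename) for top-level files or
# (country, subdir, filename) for nested ones; the values in the comment are made up.
def _example_list_sources():
    # e.g. ('pl', 'lodzkie') or ('us', 'ca', 'alameda_county')
    return ['/'.join(source) for source in openaddresses_config.sources]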
114
scripts/geodata/openaddresses/download_openaddresses.py
Normal file
@@ -0,0 +1,114 @@
import argparse
import os
import requests
import six
import subprocess
import sys
import tempfile
import yaml

from six.moves.urllib_parse import urljoin, quote_plus, unquote_plus

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.openaddresses.config import openaddresses_config
from geodata.csv_utils import unicode_csv_reader
from geodata.file_utils import ensure_dir, download_file, unzip_file, cd, remove_file
from geodata.encoding import safe_encode, safe_decode

BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'

OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')

OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)


def download_and_unzip_file(url, out_dir):
    zip_filename = url.rsplit('/', 1)[-1].strip()
    zip_local_path = os.path.join(out_dir, zip_filename)

    success = download_file(url, zip_local_path) and unzip_file(zip_local_path, out_dir)

    if os.path.exists(zip_local_path):
        remove_file(zip_local_path)

    return success


def download_pre_release_downloads(out_dir):
    for url in openaddresses_config.config.get('pre_release_downloads', []):
        print(six.u('doing pre_release {}').format(safe_decode(url)))

        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(url))
            return False
    return True


def openaddresses_download_all_files(out_dir):
    temp_dir = tempfile.gettempdir()

    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    reader = unicode_csv_reader(open(local_state_file_path), delimiter='\t')
    headers = reader.next()

    source_index = headers.index('source')
    url_index = headers.index('processed')

    download_pre_release_downloads(out_dir)

    for row in reader:
        source = row[source_index].rsplit('.')[0]
        processed = row[url_index]
        if not processed or not processed.strip():
            continue

        print(six.u('doing {}').format(source))
        success = download_and_unzip_file(processed, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))

    remove_file(local_state_file_path)


def openaddresses_download_configured_files(out_dir):
    for path in openaddresses_config.sources:

        source = six.b('/').join([safe_encode(p) for p in path])
        filename = safe_encode(path[-1]) + six.b('.zip')
        zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])

        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)

        download_pre_release_downloads(out_dir)

        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '--out-dir',
                        required=True,
                        help='Output directory')

    parser.add_argument('--all', action='store_true',
                        default=False, help='Download all completed OpenAddresses files')

    args = parser.parse_args()
    ensure_dir(args.out_dir)

    if args.all:
        openaddresses_download_all_files(args.out_dir)
    else:
        openaddresses_download_configured_files(args.out_dir)
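# Illustrative usage sketch (not part of the original file; the source path below is
# hypothetical). A single processed zip can be fetched and extracted with the helper
# defined above:
#   ensure_dir('/tmp/openaddresses')
#   download_and_unzip_file(urljoin(OPENADDRESSES_LATEST_DIR, 'us/ca/alameda_county.zip'),
#                           '/tmp/openaddresses')
# Run as a script, it downloads either the configured sources or, with --all, every
# completed source listed in state.txt.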
698
scripts/geodata/openaddresses/formatter.py
Normal file
@@ -0,0 +1,698 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import ftfy
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from geodata.addresses.units import Unit
|
||||
from geodata.address_expansions.abbreviations import abbreviate
|
||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||
from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer, toponym_abbreviations_gazetteer
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
from geodata.addresses.components import AddressComponents
|
||||
from geodata.countries.constants import Countries
|
||||
from geodata.countries.names import country_names
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
from geodata.i18n.languages import get_country_languages
|
||||
from geodata.i18n.word_breaks import ideographic_scripts
|
||||
from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE, get_string_script
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
from geodata.openaddresses.config import openaddresses_config
|
||||
from geodata.places.config import place_config
|
||||
from geodata.postal_codes.phrases import PostalCodes
|
||||
from geodata.text.tokenize import tokenize
|
||||
from geodata.text.token_types import token_types
|
||||
from geodata.text.utils import is_numeric, is_numeric_strict
|
||||
|
||||
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
||||
|
||||
OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
|
||||
OPENADDRESSES_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
|
||||
|
||||
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
|
||||
unknown_regex = re.compile('\bunknown\b', re.I)
|
||||
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
|
||||
sin_numero_regex = re.compile('^\s*s\s*/\s*n\s*$', re.I)
|
||||
|
||||
russian_number_regex_str = safe_decode(r'(?:№\s*)?(?:(?:[\d]+\w?(?:[\-/](?:(?:[\d]+\w?)|\w))*)|(?:[\d]+\s*\w?)|(?:\b\w\b))')
|
||||
dom_korpus_stroyeniye_regex = re.compile(safe_decode('(?:(?:дом(?=\s)|д\.?)\s*)?{}(?:(?:\s*,|\s+)\s*(?:(?:корпус(?=\s)|к\.?)\s*){})?(?:(?:\s*,|\s+)\s*(?:(?:строение(?=\s)|с\.?)\s*){})?\s*$').format(russian_number_regex_str, russian_number_regex_str, russian_number_regex_str), re.I | re.U)
|
||||
uchastok_regex = re.compile(safe_decode('{}\s*(?:,?\s*участок\s+{}\s*)?$').format(russian_number_regex_str, russian_number_regex_str), re.I | re.U)
|
||||
bea_nomera_regex = re.compile(safe_decode('^\s*б\s*/\s*н\s*$'), re.I)
|
||||
fraction_regex = re.compile('^\s*[\d]+[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*$', re.I)
|
||||
number_space_letter_regex = re.compile('^[\d]+\s+[a-z]$', re.I)
|
||||
number_slash_number_regex = re.compile('^(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)$', re.I)
|
||||
number_fraction_regex = re.compile('^(?:[\d]+\s+)?(?:1[\s]*/[\s]*[234]|2[\s]*/[\s]*3)$')
|
||||
|
||||
colombian_standard_house_number_regex = re.compile('^(\d+[\s]*[a-z]?)\s+([a-z]?[\d]+[\s]*[a-z]?)?', re.I)
|
||||
|
||||
dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I)
|
||||
|
||||
SPANISH = 'es'
|
||||
PORTUGUESE = 'pt'
|
||||
RUSSIAN = 'ru'
|
||||
CHINESE = 'zh'
|
||||
|
||||
|
||||
class OpenAddressesFormatter(object):
|
||||
field_regex_replacements = {
|
||||
# All fields
|
||||
None: [
|
||||
(re.compile('<\s*null\s*>', re.I), u''),
|
||||
(re.compile('[\s]{2,}'), six.u(' ')),
|
||||
(re.compile('\`'), u"'"),
|
||||
(re.compile('\-?\*'), u""),
|
||||
],
|
||||
AddressFormatter.HOUSE_NUMBER: [
|
||||
# Most of the house numbers in Montreal start with "#"
|
||||
(re.compile('^#', re.UNICODE), u''),
|
||||
# Some house numbers have multiple hyphens
|
||||
(re.compile('[\-]{2,}'), u'-'),
|
||||
# Some house number ranges are split up like "12 -14"
|
||||
(re.compile('[\s]*\-[\s]*'), u'-'),
|
||||
]
|
||||
}
|
||||
|
||||
unit_type_regexes = {}
|
||||
|
||||
for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases):
|
||||
if dictionary_type == 'unit_types_numbered':
|
||||
unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2]
|
||||
pattern = re.compile(r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(safe_encode('|').join(unit_phrases)),
|
||||
re.I | re.UNICODE)
|
||||
unit_type_regexes[lang] = pattern
|
||||
|
||||
def __init__(self, components, country_rtree, debug=False):
|
||||
self.components = components
|
||||
self.country_rtree = country_rtree
|
||||
|
||||
self.debug = debug
|
||||
|
||||
self.formatter = AddressFormatter()
|
||||
|
||||
class validators:
|
||||
@classmethod
|
||||
def validate_postcode(cls, postcode):
|
||||
'''
|
||||
Postcodes that are all zeros are improperly-formatted NULL values
|
||||
'''
|
||||
return not all((c in ('0', '-', '.', ' ', ',') for c in postcode))
|
||||
|
||||
@classmethod
|
||||
def validate_street(cls, street):
|
||||
'''
|
||||
Streets should not be simple numbers. If they are it's probably a
|
||||
copy/paste error and should be the house number.
|
||||
'''
|
||||
return not is_numeric(street)
|
||||
|
||||
@classmethod
|
||||
def validate_house_number(cls, house_number):
|
||||
'''
House numbers don't necessarily have to be numeric, but in some of the
OpenAddresses data sets the house number field is equal to the capitalized
street name, so this at least provides protection against nonsensical values
for the house number, at the cost of maybe missing a few houses numbered "A", etc.

OpenAddresses also comes primarily from county GIS servers and the like, which
use a variety of database schemas and don't always handle NULLs well. While a
single zero is a valid house number, in OpenAddresses it is more likely an
error, and a value of more than one zero almost certainly is.
'''
|
||||
|
||||
try:
|
||||
house_number = int(house_number.strip())
|
||||
return house_number > 0
|
||||
except (ValueError, TypeError):
|
||||
house_number = house_number.strip()
|
||||
return house_number and (is_numeric(house_number) or fraction_regex.match(house_number) or number_space_letter_regex.match(house_number) or
|
||||
number_slash_number_regex.match(house_number) or number_fraction_regex.match(house_number)) and not all((c == '0' for c in house_number if c.isdigit()))
|
||||
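# Illustrative expectations for validate_house_number above (not part of the original
# file): plain positive numbers and simple fractions pass, all-zero values do not.
# assert OpenAddressesFormatter.validators.validate_house_number(u'123')
# assert OpenAddressesFormatter.validators.validate_house_number(u'1/2')
# assert not OpenAddressesFormatter.validators.validate_house_number(u'0000')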
|
||||
@classmethod
|
||||
def validate_house_number_sin_numero(cls, house_number):
|
||||
if sin_numero_regex.match(house_number):
|
||||
return True
|
||||
return cls.validate_house_number(house_number)
|
||||
|
||||
@classmethod
|
||||
def validate_russian_house_number(cls, house_number):
|
||||
if dom_korpus_stroyeniye_regex.match(house_number):
|
||||
return True
|
||||
elif uchastok_regex.match(house_number):
|
||||
return True
|
||||
elif bea_nomera_regex.match(house_number):
|
||||
return True
|
||||
return cls.validate_house_number(house_number)
|
||||
|
||||
@classmethod
|
||||
def validate_colombian_house_number(cls, house_number):
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def validate_chinese_house_number(cls, house_number):
|
||||
if not house_number:
|
||||
return False
|
||||
tokens = tokenize(house_number)
|
||||
|
||||
if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋', u'附')) for t, c in tokens):
|
||||
return True
|
||||
return cls.validate_house_number(house_number)
|
||||
|
||||
component_validators = {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
|
||||
AddressFormatter.ROAD: validators.validate_street,
|
||||
AddressFormatter.POSTCODE: validators.validate_postcode,
|
||||
}
|
||||
|
||||
language_validators = {
|
||||
SPANISH: {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
|
||||
},
|
||||
PORTUGUESE: {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
|
||||
},
|
||||
RUSSIAN: {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
|
||||
},
|
||||
CHINESE: {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
|
||||
}
|
||||
}
|
||||
|
||||
country_validators = {
|
||||
Countries.COLOMBIA: {
|
||||
AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number
|
||||
}
|
||||
}
|
||||
|
||||
chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)
|
||||
|
||||
@classmethod
|
||||
def format_chinese_house_number(cls, house_number):
|
||||
if not house_number:
|
||||
return house_number
|
||||
return cls.chinese_annex_regex.sub(u'\\1号', house_number)
|
||||
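# Illustrative behavior of format_chinese_house_number above (not part of the original
# file): u'12' -> u'12号', while u'12号' and u'12栋' are left unchanged because the
# negative lookahead skips numbers that already carry a suffix.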
|
||||
@classmethod
|
||||
def format_colombian_house_number(cls, house_number):
|
||||
house_number = house_number.strip()
|
||||
match = colombian_standard_house_number_regex.match(house_number)
|
||||
if match:
|
||||
separator = random.choice((u'-', u' - ', u' '))
|
||||
|
||||
cross_street, building_number = match.groups()
|
||||
|
||||
numbers = []
|
||||
if cross_street and u' ' in cross_street and random.choice((True, False)):
|
||||
cross_street = cross_street.replace(u' ', u'')
|
||||
|
||||
if cross_street:
|
||||
numbers.append(cross_street)
|
||||
|
||||
if building_number and u' ' in building_number and random.choice((True, False)):
|
||||
building_number = building_number.replace(u' ', u'')
|
||||
|
||||
if building_number:
|
||||
numbers.append(building_number)
|
||||
|
||||
if numbers:
|
||||
house_number = separator.join(numbers)
|
||||
house_number_prefixes = (u'#', u'no.', u'no', u'nº')
|
||||
if random.choice((True, False)) and not any((house_number.lower().startswith(p) for p in house_number_prefixes)):
|
||||
house_number = u' '.join([random.choice(house_number_prefixes), house_number])
|
||||
|
||||
return house_number
|
||||
|
||||
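# Illustrative behavior of format_colombian_house_number above (not part of the
# original file): for an input like u'93 45', possible outputs include u'# 93-45',
# u'93 - 45' or u'no. 93 45', since the separator and the optional "#"/"No." prefix
# are chosen at random on each call.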
def get_property(self, key, *configs):
|
||||
for config in configs:
|
||||
value = config.get(key, None)
|
||||
if value is not None:
|
||||
return value
|
||||
return None
|
||||
|
||||
def cldr_country_name(self, country_code, language, configs):
|
||||
cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
|
||||
|
||||
country_name = None
|
||||
|
||||
if random.random() < cldr_country_prob:
|
||||
localized, iso_3166, alpha2, alpha3 = values = range(4)
|
||||
localized_prob = float(self.get_property('localized_name_probability', *configs))
|
||||
iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
|
||||
alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
|
||||
alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))
|
||||
|
||||
probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])
|
||||
|
||||
country_type = weighted_choice(values, probs)
|
||||
|
||||
country_name = country_code.upper()
|
||||
if country_type == localized:
|
||||
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
|
||||
elif country_type == iso_3166:
|
||||
country_name = country_names.iso3166_name(country_code)
|
||||
elif country_type == alpha3:
|
||||
country_name = country_names.alpha3_code(country_code) or country_name
|
||||
|
||||
return country_name
|
||||
|
||||
@classmethod
|
||||
def cleanup_number(cls, num, strip_commas=False):
|
||||
num = num.strip()
|
||||
if strip_commas:
|
||||
num = num.replace(six.u(','), six.u(''))
|
||||
try:
|
||||
num_int = int(num)
|
||||
except (ValueError, TypeError):
|
||||
try:
|
||||
num_float = float(num)
|
||||
leading_zeros = 0
|
||||
for c in num:
|
||||
if c == six.u('0'):
|
||||
leading_zeros += 1
|
||||
else:
|
||||
break
|
||||
num = safe_decode(int(num_float))
|
||||
if leading_zeros:
|
||||
num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
return num
|
||||
|
||||
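# Illustrative behavior of cleanup_number above (not part of the original file):
# cleanup_number(u'1,234', strip_commas=True) -> u'1234'
# cleanup_number(u'007.0') -> u'007'   (float-like values truncated to int, leading zeros preserved)
# cleanup_number(u'12B') -> u'12B'     (non-numeric strings are returned stripped but otherwise unchanged)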
@classmethod
|
||||
def fix_component_encodings(cls, components):
|
||||
return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}
|
||||
|
||||
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
|
||||
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
|
||||
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
|
||||
abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
|
||||
separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
|
||||
abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))
|
||||
|
||||
add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
|
||||
add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
|
||||
osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
|
||||
non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
|
||||
house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
|
||||
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
|
||||
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
|
||||
|
||||
address_only_probability = float(self.get_property('address_only_probability', *configs))
|
||||
place_only_probability = float(self.get_property('place_only_probability', *configs))
|
||||
place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))
|
||||
|
||||
city_replacements = self.get_property('city_replacements', *configs)
|
||||
|
||||
override_country_dir = self.get_property('override_country_dir', *configs)
|
||||
|
||||
postcode_length = int(self.get_property('postcode_length', *configs) or 0)
|
||||
|
||||
drop_address_probability = place_only_probability + place_and_postcode_probability
|
||||
|
||||
ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])
|
||||
|
||||
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
|
||||
for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}
|
||||
|
||||
alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
|
||||
for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))}
|
||||
|
||||
config_language = self.get_property('language', *configs)
|
||||
|
||||
add_components = self.get_property('add', *configs)
|
||||
|
||||
fields = self.get_property('fields', *configs)
|
||||
if not fields:
|
||||
return
|
||||
|
||||
field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
|
||||
mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields) if hasattr(f.get('value_map'), 'get')}
|
||||
|
||||
f = open(path)
|
||||
reader = unicode_csv_reader(f)
|
||||
headers = reader.next()
|
||||
|
||||
header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
|
||||
latitude_index = headers.index('LAT')
|
||||
longitude_index = headers.index('LON')
|
||||
|
||||
# Clear cached polygons
|
||||
self.components.osm_admin_rtree.clear_cache()
|
||||
self.components.neighborhoods_rtree.clear_cache()
|
||||
|
||||
for row in reader:
|
||||
try:
|
||||
latitude = float(row[latitude_index])
|
||||
longitude = float(row[longitude_index])
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
|
||||
language = config_language
|
||||
|
||||
components = {}
|
||||
|
||||
skip_record = False
|
||||
|
||||
for i, key in six.iteritems(header_indices):
|
||||
value = row[i].strip()
|
||||
if not value and key in ignore_rows_missing_fields:
|
||||
skip_record = True
|
||||
break
|
||||
elif not value:
|
||||
continue
|
||||
|
||||
if key in mapped_values:
|
||||
value = mapped_values[key].get(value, value)
|
||||
|
||||
if key == AddressFormatter.ROAD and language == SPANISH:
|
||||
value = self.components.spanish_street_name(value)
|
||||
|
||||
if key == AddressFormatter.POSTCODE:
|
||||
value = self.cleanup_number(value)
|
||||
|
||||
if postcode_strip_non_digit_chars:
|
||||
value = six.u('').join((c for c in value if c.isdigit()))
|
||||
|
||||
if value and not is_numeric(value) and numeric_postcodes_only:
|
||||
continue
|
||||
else:
|
||||
if postcode_length:
|
||||
value = value.zfill(postcode_length)[:postcode_length]
|
||||
|
||||
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
|
||||
if add_osm_boundaries:
|
||||
continue
|
||||
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
|
||||
if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)):
|
||||
continue
|
||||
|
||||
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
|
||||
continue
|
||||
|
||||
for exp, sub_val in self.field_regex_replacements.get(key, []):
|
||||
value = exp.sub(sub_val, value)
|
||||
|
||||
for exp, sub_val in self.field_regex_replacements.get(None, []):
|
||||
value = exp.sub(sub_val, value)
|
||||
|
||||
value = value.strip(', -')
|
||||
|
||||
validator = self.country_validators.get(country_dir, {}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)))
|
||||
|
||||
if validator is not None and not validator(value):
|
||||
continue
|
||||
|
||||
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
|
||||
continue
|
||||
|
||||
for (pattern, alias) in alias_fields_containing.get(key, []):
|
||||
if pattern.search(value):
|
||||
if 'component' in alias:
|
||||
key = alias['component']
|
||||
|
||||
if value:
|
||||
components[key] = value
|
||||
|
||||
if skip_record:
|
||||
continue
|
||||
|
||||
if components:
|
||||
country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
|
||||
if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
|
||||
country = country_dir
|
||||
candidate_languages = get_country_languages(country)
|
||||
if not candidate_languages:
|
||||
continue
|
||||
candidate_languages = candidate_languages.items()
|
||||
|
||||
components = self.fix_component_encodings(components)
|
||||
|
||||
if language is None:
|
||||
language = AddressComponents.address_language(components, candidate_languages)
|
||||
|
||||
street = components.get(AddressFormatter.ROAD, None)
|
||||
if street is not None:
|
||||
street = street.strip()
|
||||
street = AddressComponents.cleaned_name(street)
|
||||
|
||||
if language == UNKNOWN_LANGUAGE:
|
||||
strip_unit_language = candidate_languages[0][0] if candidate_languages else None
|
||||
else:
|
||||
strip_unit_language = language
|
||||
|
||||
street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)
|
||||
|
||||
street = abbreviate(street_types_gazetteer, street, language,
|
||||
abbreviate_prob=abbreviate_street_prob,
|
||||
separate_prob=separate_street_prob)
|
||||
components[AddressFormatter.ROAD] = street
|
||||
|
||||
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
|
||||
if house_number:
|
||||
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
|
||||
|
||||
if language == CHINESE:
|
||||
house_number = self.format_chinese_house_number(house_number)
|
||||
|
||||
if country_dir == Countries.COLOMBIA:
|
||||
house_number = self.format_colombian_house_number(house_number)
|
||||
|
||||
if house_number is not None:
|
||||
components[AddressFormatter.HOUSE_NUMBER] = house_number
|
||||
|
||||
unit = components.get(AddressFormatter.UNIT, None)
|
||||
|
||||
street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES
|
||||
|
||||
postcode = components.get(AddressFormatter.POSTCODE, None)
|
||||
|
||||
if postcode:
|
||||
components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)
|
||||
|
||||
# If there's a postcode, we can still use just the city/state/postcode, otherwise discard
|
||||
if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
|
||||
if not postcode:
|
||||
continue
|
||||
components = self.components.drop_address(components)
|
||||
|
||||
# Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
|
||||
unit = components.get(AddressFormatter.UNIT, None)
|
||||
|
||||
if unit is not None:
|
||||
if is_numeric_strict(unit):
|
||||
unit = Unit.phrase(unit, language, country=country)
|
||||
elif non_numeric_units:
|
||||
unit = abbreviate(unit_types_gazetteer, unit, language,
|
||||
abbreviate_prob=abbreviate_unit_prob,
|
||||
separate_prob=separate_unit_prob)
|
||||
else:
|
||||
unit = None
|
||||
|
||||
if unit is not None:
|
||||
components[AddressFormatter.UNIT] = unit
|
||||
else:
|
||||
components.pop(AddressFormatter.UNIT)
|
||||
unit = None
|
||||
|
||||
# CLDR country name
|
||||
country_name = self.cldr_country_name(country, language, configs)
|
||||
if country_name:
|
||||
components[AddressFormatter.COUNTRY] = country_name
|
||||
|
||||
for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
|
||||
component = components.get(component_key, None)
|
||||
if component is not None:
|
||||
component = abbreviate(toponym_abbreviations_gazetteer, component, language,
|
||||
abbreviate_prob=abbreviate_toponym_prob)
|
||||
component = self.components.name_hyphens(component)
|
||||
components[component_key] = component
|
||||
|
||||
# Any components specified to be added by the config (usually state)
|
||||
if add_components:
|
||||
for k, v in six.iteritems(add_components):
|
||||
if k not in components:
|
||||
components[k] = v
|
||||
|
||||
# Get named states occasionally, added component is usually a state code
|
||||
address_state = self.components.state_name(components, country, language)
|
||||
if address_state:
|
||||
components[AddressFormatter.STATE] = address_state
|
||||
|
||||
state = components.get(AddressFormatter.STATE)
|
||||
if state:
|
||||
state = self.components.abbreviated_state(state, country, language)
|
||||
if state:
|
||||
components[AddressFormatter.STATE] = state
|
||||
|
||||
# This is expensive, so only turn on for files that don't supply their own city names
|
||||
# or for which those names are flawed
|
||||
osm_components = []
|
||||
|
||||
# Using population=0 instead of None means if there's no known population or
|
||||
# we don't need to add OSM components, we assume the population of the town is
|
||||
# very small and the place name shouldn't be used unqualified (i.e. needs information
|
||||
# like state name to disambiguate it)
|
||||
population = 0
|
||||
unambiguous_city = False
|
||||
if add_osm_boundaries or AddressFormatter.CITY not in components:
|
||||
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
|
||||
self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
|
||||
categorized = self.components.categorized_osm_components(country, osm_components)
|
||||
for component, label in categorized:
|
||||
if label == AddressFormatter.CITY:
|
||||
unambiguous_city = self.components.unambiguous_wikipedia(component, language)
|
||||
if 'population' in component:
|
||||
population = component['population']
|
||||
break
|
||||
|
||||
if AddressFormatter.CITY not in components and city_replacements:
|
||||
components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})
|
||||
|
||||
# The neighborhood index is cheaper so can turn on for whole countries
|
||||
neighborhood_components = []
|
||||
if add_osm_neighborhoods:
|
||||
neighborhood_components = self.components.neighborhood_components(latitude, longitude)
|
||||
self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city)
|
||||
|
||||
self.components.cleanup_boundary_names(components)
|
||||
self.components.country_specific_cleanup(components, country)
|
||||
|
||||
self.components.replace_name_affixes(components, language, country=country)
|
||||
|
||||
self.components.replace_names(components)
|
||||
|
||||
self.components.prune_duplicate_names(components)
|
||||
|
||||
self.components.remove_numeric_boundary_names(components)
|
||||
self.components.add_house_number_phrase(components, language, country=country)
|
||||
self.components.add_postcode_phrase(components, language, country=country)
|
||||
|
||||
# Component dropout
|
||||
all_osm_components = osm_components + neighborhood_components
|
||||
components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
|
||||
|
||||
self.components.add_genitives(components, language)
|
||||
|
||||
formatted = self.formatter.format_address(components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
yield (language, country, formatted)
|
||||
|
||||
if random.random() < address_only_probability and street:
|
||||
address_only_components = self.components.drop_places(components)
|
||||
address_only_components = self.components.drop_postcode(address_only_components)
|
||||
formatted = self.formatter.format_address(address_only_components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
yield (language, country, formatted)
|
||||
|
||||
rand_val = random.random()
|
||||
|
||||
if street and house_number and rand_val < drop_address_probability:
|
||||
components = self.components.drop_address(components)
|
||||
|
||||
if rand_val < place_and_postcode_probability:
|
||||
components = self.components.drop_postcode(components)
|
||||
|
||||
if components and (len(components) > 1 or add_osm_boundaries):
|
||||
formatted = self.formatter.format_address(components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
yield (language, country, formatted)
|
||||
|
||||
def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
|
||||
all_sources_valid = sources_only is None
|
||||
valid_sources = set()
|
||||
if not all_sources_valid:
|
||||
for source in sources_only:
|
||||
if source.startswith(base_dir):
|
||||
source = os.path.relpath(source, base_dir)
|
||||
|
||||
parts = source.strip('/ ').split('/')
|
||||
if len(parts) > 3:
|
||||
raise AssertionError('Sources may only have at maximum 3 parts')
|
||||
valid_sources.add(tuple(parts))
|
||||
|
||||
if tag_components:
|
||||
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
||||
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||
else:
|
||||
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_FILENAME), 'w')
|
||||
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||
|
||||
i = 0
|
||||
|
||||
for country_dir in sorted(openaddresses_config.country_configs.keys()):
|
||||
country_config = openaddresses_config.country_configs[country_dir]
|
||||
# Clear country cache for each new country
|
||||
self.country_rtree.clear_cache()
|
||||
|
||||
for file_config in country_config.get('files', []):
|
||||
filename = file_config['filename']
|
||||
|
||||
if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources):
|
||||
continue
|
||||
|
||||
print(six.u('doing {}/{}').format(country_dir, filename))
|
||||
|
||||
path = os.path.join(base_dir, country_dir, filename)
|
||||
configs = (file_config, country_config, openaddresses_config.config)
|
||||
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
formatted_address = tsv_string(formatted_address)
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
if tag_components:
|
||||
row = (language, country, formatted_address)
|
||||
else:
|
||||
row = (formatted_address,)
|
||||
|
||||
writer.writerow(row)
|
||||
i += 1
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print('did {} formatted addresses'.format(i))
|
||||
if self.debug:
|
||||
break
|
||||
|
||||
for subdir in sorted(country_config.get('subdirs', {}).keys()):
|
||||
subdir_config = country_config['subdirs'][subdir]
|
||||
subdir = safe_decode(subdir)
|
||||
for file_config in subdir_config.get('files', []):
|
||||
filename = file_config['filename']
|
||||
|
||||
if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources):
|
||||
continue
|
||||
|
||||
print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))
|
||||
|
||||
path = os.path.join(base_dir, country_dir, subdir, filename)
|
||||
|
||||
configs = (file_config, subdir_config, country_config, openaddresses_config.config)
|
||||
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
formatted_address = tsv_string(formatted_address)
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
if tag_components:
|
||||
row = (language, country, formatted_address)
|
||||
else:
|
||||
row = (formatted_address,)
|
||||
|
||||
writer.writerow(row)
|
||||
|
||||
i += 1
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print('did {} formatted addresses'.format(i))
|
||||
if self.debug:
|
||||
break
|
||||