Initial fork commit
scripts/geodata/language_id/__init__.py (new file, 0 lines)
scripts/geodata/language_id/create_language_training_data.py (new file, 100 lines)
@@ -0,0 +1,100 @@
import argparse
import logging
import os
import subprocess
import sys

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME, TOPONYM_LANGUAGE_DATA_FILENAME

LANGUAGES_ALL_FILE = 'languages.all'
LANGUAGES_RANDOM_FILE = 'languages.random'
LANGUAGES_TRAIN_FILE = 'languages.train'
LANGUAGES_CV_FILE = 'languages.cv'
LANGUAGES_TEST_FILE = 'languages.test'


def create_language_training_data(osm_dir, split_data=True, train_split=0.8, cv_split=0.1):
    language_all_path = os.path.join(osm_dir, LANGUAGES_ALL_FILE)

    # Concatenate the language-tagged OSM training files into languages.all
    ways_path = os.path.join(osm_dir, WAYS_LANGUAGE_DATA_FILENAME)

    if os.system(' '.join(['cat', ways_path, '>', language_all_path])) != 0:
        raise SystemError('Could not find {}'.format(ways_path))

    addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)

    if os.system(' '.join(['cat', addresses_path, '>>', language_all_path])) != 0:
        raise SystemError('Could not find {}'.format(addresses_path))

    formatted_path = os.path.join(osm_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)

    if os.system(' '.join(['cat', formatted_path, '>>', language_all_path])) != 0:
        raise SystemError('Could not find {}'.format(formatted_path))

    toponyms_path = os.path.join(osm_dir, TOPONYM_LANGUAGE_DATA_FILENAME)

    if os.system(' '.join(['cat', toponyms_path, '>>', language_all_path])) != 0:
        raise SystemError('Could not find {}'.format(toponyms_path))

    # Shuffle the combined file so the train/cv/test splits are random
    languages_random_path = os.path.join(osm_dir, LANGUAGES_RANDOM_FILE)

    if os.system(' '.join(['shuf', '--random-source=/dev/urandom', language_all_path, '>', languages_random_path])) != 0:
        raise SystemError('shuffle failed')

    languages_train_path = os.path.join(osm_dir, LANGUAGES_TRAIN_FILE)

    if split_data:
        languages_test_path = os.path.join(osm_dir, LANGUAGES_TEST_FILE)

        num_lines = sum(1 for line in open(languages_random_path))
        train_lines = int(train_split * num_lines)

        test_lines = num_lines - train_lines
        cv_lines = int(test_lines * (cv_split / (1.0 - train_split))) + 1

        # First split: training set vs. held-out lines
        subprocess.check_call(['split', '-l', str(train_lines), languages_random_path, os.path.join(osm_dir, 'language-split-')])
        subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_train_path])
        subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])

        languages_cv_path = os.path.join(osm_dir, LANGUAGES_CV_FILE)

        # Second split: carve the cross-validation set out of the held-out lines
        subprocess.check_call(['split', '-l', str(cv_lines), languages_test_path, os.path.join(osm_dir, 'language-split-')])
        subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_cv_path])
        subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
    else:
        subprocess.check_call(['mv', languages_random_path, languages_train_path])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # --no-split stores False, so args.no_split is True (i.e. do split) by default
    parser.add_argument('-n', '--no-split',
                        action='store_false',
                        default=True,
                        help='Do not split data into train/cv/test')

    parser.add_argument('-t', '--train-split',
                        type=float,
                        default=0.8,
                        help='Train split percentage as a float (default 0.8)')

    parser.add_argument('-c', '--cv-split',
                        type=float,
                        default=0.1,
                        help='Cross-validation split percentage as a float (default 0.1)')

    parser.add_argument('-o', '--osm-dir',
                        default=os.getcwd(),
                        help='OSM directory')

    args = parser.parse_args()
    if args.train_split + args.cv_split >= 1.0:
        raise ValueError('Train split + cross-validation split must be less than 1.0')

    if not os.path.exists(args.osm_dir):
        raise ValueError('OSM directory does not exist')

    create_language_training_data(args.osm_dir, split_data=args.no_split, train_split=args.train_split, cv_split=args.cv_split)
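A quick sanity check of the split arithmetic above (an editorial sketch, not part of the commit): with 1,000 shuffled lines and the default 0.8/0.1 parameters, the two-stage split/mv sequence yields roughly an 80/10/10 partition.

# Sketch: the arithmetic create_language_training_data uses for its splits
train_split, cv_split = 0.8, 0.1
num_lines = 1000
train_lines = int(train_split * num_lines)                          # 800
test_lines = num_lines - train_lines                                # 200
cv_lines = int(test_lines * (cv_split / (1.0 - train_split))) + 1   # 101
print(train_lines, cv_lines, test_lines - cv_lines)                 # 800 101 99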
scripts/geodata/language_id/disambiguation.py (new file, 176 lines)
@@ -0,0 +1,176 @@
import os
import six
import sys

from collections import defaultdict, OrderedDict

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'python')))

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.string_utils import wide_iter, wide_ord
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types

WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])

# For toponyms, we want to limit the countries we consider to those where
# the place names can themselves be considered training examples of the language
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
    'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
    'fr': set(['fr']),
    'it': set(['it']),
    'de': set(['de', 'at']),
    'nl': set(['nl']),
    'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
                've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
                'ni', 'hn']),
    'pt': set(['pt', 'br']),
}

char_scripts = get_chars_by_script()
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}

# Invert the script => languages mapping into language => scripts
lang_scripts = defaultdict(set)

for script, langs in six.iteritems(script_languages):
    for lang in langs:
        lang_scripts[lang].add(script)

lang_scripts = dict(lang_scripts)

UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
MAX_ASCII = 127

def get_string_script(s):
    '''Return (script, script_len, is_ascii) for the longest leading run
    of s in a single script; Common-script characters (digits, spaces,
    punctuation) inherit the preceding script.'''
    s = safe_decode(s)
    str_len = len(s)
    script = last_script = UNKNOWN_SCRIPT
    is_ascii = True
    script_len = 0
    for c in wide_iter(s):
        script = char_scripts[wide_ord(c)]

        if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
            script = last_script
        if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
            if script_len < str_len:
                # don't count trailing common-script characters toward this run
                for c in reversed(list(wide_iter(s[:script_len]))):
                    if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
                        script_len -= 1
                    else:
                        break
            break
        is_ascii = is_ascii and ord(c) <= MAX_ASCII
        script_len += 1
        if script != UNKNOWN_SCRIPT:
            last_script = script
    return (last_script, script_len, is_ascii)
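
# Example (a sketch; exact results depend on the loaded Unicode tables):
# digits, spaces and punctuation are Common script and inherit the
# surrounding script, so something like
#
#     get_string_script(u'Roma 123')    # -> ('Latin', 8, True)
#
# consumes the whole string as one Latin run, and callers advance by
# script_len to read the next run, as disambiguate_language_script does below.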

LATIN_SCRIPT = 'Latin'
UNKNOWN_LANGUAGE = 'unk'
AMBIGUOUS_LANGUAGE = 'xxx'


def disambiguate_language_script(text, languages):
    script_langs = {}
    read_len = 0
    while read_len < len(text):
        script, script_len, is_ascii = get_string_script(text[read_len:])
        if script != LATIN_SCRIPT:
            script_valid = [l for l, d in languages if l in script_languages.get(script, [])]
            script_langs[script] = set(script_valid)

            # If the whole string is one non-Latin script and only a single
            # candidate language uses that script, the script is decisive
            if script_len == len(text) and len(script_valid) == 1:
                return script_valid[0], script_langs

        read_len += script_len

    return UNKNOWN_LANGUAGE, script_langs
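
# Example (hypothetical candidates): languages are (language, is_default)
# pairs. For an all-Cyrillic string where Russian is the only candidate
# written in Cyrillic, the script alone decides:
#
#     disambiguate_language_script(u'Москва', [('ru', True), ('en', False)])
#     # -> ('ru', {'Cyrillic': set(['ru'])}), assuming the script tables
#     #    list 'ru' (and not 'en') under Cyrillic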

LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}


def has_non_latin_script(languages):
    for lang, is_default in languages:
        scripts = lang_scripts.get(lang, set())
        if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
            return True
    return False
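
# Example (a sketch, assuming the usual script tables): English is written
# only in Latin script, so has_non_latin_script([('en', True)]) is False,
# while has_non_latin_script([('ru', True)]) is True; Arabic and Cyrillic
# count even for languages that are commonly romanized, via
# LATIN_TRANSLITERATED_SCRIPTS.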


def disambiguate_language(text, languages, scripts_only=False):
    text = safe_decode(text)
    valid_languages = OrderedDict(languages)

    language_script, script_langs = disambiguate_language_script(text, languages)
    if language_script != UNKNOWN_LANGUAGE:
        return language_script

    num_defaults = sum(1 for lang, default in six.iteritems(valid_languages) if default)

    tokens = normalized_tokens(text)

    current_lang = None
    possible_lang = None

    seen_languages = set()

    for t, c, l, data in street_types_gazetteer.filter(tokens):
        if c == token_types.PHRASE:
            valid = OrderedDict()
            # gazetteer entries are pipe-delimited: language|dictionary|is_canonical|canonical
            data = [safe_decode(d).split(u'|') for d in data]
            potentials = set([l for l, d, i, c in data if l in valid_languages])
            potential_defaults = set([l for l in potentials if valid_languages[l]])

            phrase_len = sum(len(t_i[0]) for t_i in t)
            for lang, dictionary, is_canonical, canonical in data:
                is_canonical = int(is_canonical)
                is_stopword = dictionary == 'stopword'
                if lang not in valid_languages or (is_stopword and len(potentials) > 1):
                    continue
                is_default = valid_languages[lang]

                lang_valid = is_default or not seen_languages or lang in seen_languages

                if lang_valid and phrase_len > 1 and ((is_canonical and not is_stopword) or (is_default and (len(potentials) == 1 or len(potential_defaults) == 1))):
                    valid[lang] = 1
                elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
                    return AMBIGUOUS_LANGUAGE
                elif is_stopword and is_canonical and not is_default and lang in seen_languages:
                    valid[lang] = 1
                elif not seen_languages and len(potentials) == 1 and phrase_len > 1:
                    possible_lang = lang if possible_lang is None or possible_lang == lang else None

            if seen_languages and valid and not any(l in seen_languages for l in valid) and \
               (not any(valid_languages.get(l) for l in valid) or any(valid_languages.get(l) for l in seen_languages)):
                return AMBIGUOUS_LANGUAGE

            valid = list(valid.keys())

            if len(valid) == 1:
                current_lang = valid[0]
            else:
                valid_default = [l for l in valid if valid_languages.get(l)]
                if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang:
                    return AMBIGUOUS_LANGUAGE
                elif len(valid_default) == 1:
                    current_lang = valid_default[0]

            if any(current_lang not in langs for script, langs in six.iteritems(script_langs)):
                return AMBIGUOUS_LANGUAGE

            seen_languages.update(valid)

    if current_lang is not None:
        return current_lang
    elif possible_lang is not None:
        if not any(possible_lang not in langs for script, langs in six.iteritems(script_langs)):
            return possible_lang
        else:
            return AMBIGUOUS_LANGUAGE
    return UNKNOWN_LANGUAGE
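For orientation (a hypothetical example, not part of the commit): disambiguate_language takes a raw name plus the candidate (language, is_default) pairs for the containing region, and returns either a language code, UNKNOWN_LANGUAGE ('unk'), or AMBIGUOUS_LANGUAGE ('xxx'). Assuming the gazetteers and script tables are loaded:

# Hypothetical usage: 'Rue de Rivoli' carries the French street type 'rue'
lang = disambiguate_language(u'Rue de Rivoli', [('fr', True), ('en', False)])
# expected: 'fr' when the phrase is unambiguous among the candidates,
# otherwise 'unk' or 'xxx'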
scripts/geodata/language_id/sample.py (new file, 53 lines)
@@ -0,0 +1,53 @@
import random
import bisect

from collections import OrderedDict

'''
Top languages on the Interwebs. Not a probability distribution
as it doesn't sum to 1 and websites can be in more than one
language. Reference:

https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
'''
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([
    ('en', 0.555),
    ('ru', 0.059),
    ('de', 0.058),
    ('ja', 0.05),
    ('es', 0.046),
    ('fr', 0.04),
    ('zh', 0.028),
    ('pt', 0.025),
    ('it', 0.019),
    ('pl', 0.017),
    ('tr', 0.015),
    ('nl', 0.013),
    ('fa', 0.009),
    ('ar', 0.008),
    ('ko', 0.007),
])


def cdf(probs):
    '''Normalize weights by their total and return the cumulative distribution.'''
    total = float(sum(probs))

    result = []
    cumulative = 0.0
    for w in probs:
        cumulative += w
        result.append(cumulative / total)
    return result
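
# e.g. cdf([0.5, 0.3, 0.2]) -> [0.5, 0.8, 1.0]; weights are normalized by
# their sum, so the final entry is always 1.0 even though the raw web
# weights above sum to less than 1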

MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())


def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    '''Inverse-CDF sampling: draw a uniform number and find its bucket.'''
    assert len(keys) == len(cdf)

    sample = random.random()
    idx = bisect.bisect(cdf, sample)
    return keys[idx]
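A quick empirical check of the sampler (an editorial sketch, not part of the commit): over many draws, each language should appear in proportion to its normalized weight, so 'en' should lead at roughly 0.555 of a total weight of about 0.95, i.e. around 58% of samples.

# Sketch: sample repeatedly and inspect the empirical distribution
from collections import Counter

counts = Counter(sample_random_language() for _ in range(100000))
print(counts.most_common(3))  # expect 'en' first, at ~58% of draws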