[languages] Adding country_and_languages to the language rtree itself
This commit is contained in:
@@ -1,69 +0,0 @@
|
|||||||
import operator
|
|
||||||
|
|
||||||
from geodata.language_id.disambiguation import disambiguate_language, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES
|
|
||||||
|
|
||||||
|
|
||||||
def country_and_languages(language_rtree, latitude, longitude):
    """Return the country code and candidate languages for a coordinate.

    Queries ``language_rtree.point_in_poly(latitude, longitude,
    return_all=True)`` and merges the ``languages`` entries of every
    returned polygon, keeping the first occurrence of each language code.

    Returns a ``(country, languages, props)`` tuple:

    * ``country`` -- lower-cased ``qs_iso_cc`` of the first polygon.
    * ``languages`` -- list of ``{'lang': ..., 'default': ...}`` dicts sorted
      so that entries with the highest ``default`` value come first.
    * ``props`` -- the raw list returned by ``point_in_poly``.

    Returns ``(None, None, None)`` when no polygon contains the point.
    """
    props = language_rtree.point_in_poly(latitude, longitude, return_all=True)
    if not props:
        return None, None, None

    country = props[0]['qs_iso_cc'].lower()
    languages = []
    seen_langs = set()

    # Becomes True once a polygon with admin_level > 0 contributes a default
    # language; every language first seen after that which is not itself
    # such a regional default is demoted to default=0.
    have_regional = False

    for poly in props:
        for entry in poly['languages']:
            lang = entry['lang']
            if lang in seen_langs:
                continue
            seen_langs.add(lang)

            if poly['admin_level'] > 0 and entry['default']:
                have_regional = True
            elif have_regional:
                # Build a new dict rather than mutating the polygon's own
                # properties.
                entry = {'lang': lang, 'default': 0}
            languages.append(entry)

    # Python's builtin sort is stable, so if there are two defaults, the first remains first
    # Since polygons are returned from the index ordered from smallest admin level to largest,
    # it means the default language of the region overrides the country default
    default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True)
    return country, default_languages, props
|
|
||||||
|
|
||||||
|
|
||||||
def best_country_and_language(language_rtree, latitude, longitude, name):
    """Pick the country and the most likely language of ``name`` at a point.

    Candidate languages come from ``country_and_languages``.  Returns a
    ``(country, language)`` tuple, or ``(None, None)`` when the point matches
    no polygon, has no candidate languages, or the language stays unknown
    and there is not exactly one default language to fall back on.
    """
    country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
    if not (country and candidate_languages):
        return None, None

    # Candidates are sorted defaults-first, so the first one is the default.
    default_lang = candidate_languages[0]['lang']

    # A single candidate needs no disambiguation and none of the tallies below.
    if len(candidate_languages) == 1:
        return country, default_lang

    num_defaults = len({c['lang'] for c in candidate_languages if c.get('default')})

    # Count default languages separately for regional (admin_level > 0) and
    # country-level polygons, and collect the country-level language codes.
    regional_defaults = 0
    country_defaults = 0
    country_langs = set()
    for poly in language_props:
        if poly['admin_level'] > 0:
            regional_defaults += sum(1 for c in poly['languages'] if c.get('default'))
        else:
            country_defaults += sum(1 for c in poly['languages'] if c.get('default'))
            country_langs.update(c['lang'] for c in poly['languages'])

    lang = disambiguate_language(name, [(c['lang'], c['default']) for c in candidate_languages])

    if lang == UNKNOWN_LANGUAGE and num_defaults == 1:
        return country, default_lang
    elif lang != UNKNOWN_LANGUAGE:
        # A well-represented country-level language that differs from the
        # default is reported as unknown when the country has several
        # defaults and a regional default also applies.
        if (lang != default_lang and lang in country_langs and
                country_defaults > 1 and regional_defaults > 0 and
                lang in WELL_REPRESENTED_LANGUAGES):
            return country, UNKNOWN_LANGUAGE
        return country, lang
    else:
        return None, None
|
|
||||||
@@ -24,6 +24,7 @@ class PointIndex(object):
|
|||||||
INDEX_FILENAME = 'index.json'
|
INDEX_FILENAME = 'index.json'
|
||||||
|
|
||||||
def __init__(self, index=None, save_dir=None,
|
def __init__(self, index=None, save_dir=None,
|
||||||
|
points=None,
|
||||||
points_db=None,
|
points_db=None,
|
||||||
points_db_path=None,
|
points_db_path=None,
|
||||||
index_path=None,
|
index_path=None,
|
||||||
@@ -45,6 +46,11 @@ class PointIndex(object):
|
|||||||
else:
|
else:
|
||||||
self.index = index
|
self.index = index
|
||||||
|
|
||||||
|
if not points:
|
||||||
|
self.points = []
|
||||||
|
else:
|
||||||
|
self.points = points
|
||||||
|
|
||||||
if not points_db_path:
|
if not points_db_path:
|
||||||
points_db_path = os.path.join(save_dir or '.', self.POINTS_DB_DIR)
|
points_db_path = os.path.join(save_dir or '.', self.POINTS_DB_DIR)
|
||||||
|
|
||||||
@@ -57,14 +63,12 @@ class PointIndex(object):
|
|||||||
|
|
||||||
self.i = 0
|
self.i = 0
|
||||||
|
|
||||||
def create_index(self, overwrite=False):
    """Reset ``self.index`` to an empty ``defaultdict(list)``.

    ``overwrite`` is accepted for interface compatibility but is not used
    by this method.
    """
    self.index = defaultdict(list)
|
|
||||||
|
|
||||||
def index_point(self, lat, lon):
|
def index_point(self, lat, lon):
|
||||||
code = geohash.encode(lat, lon)[:self.precision]
|
code = geohash.encode(lat, lon)[:self.precision]
|
||||||
|
|
||||||
for key in [code] + geohash.neighbors(code):
|
for key in [code] + geohash.neighbors(code):
|
||||||
self.index[key].append((self.i, lat, lon))
|
self.index[key].append(self.i)
|
||||||
|
self.points.append((lat, lon))
|
||||||
|
|
||||||
def add_point(self, lat, lon, properties, cache=False, include_only_properties=None):
|
def add_point(self, lat, lon, properties, cache=False, include_only_properties=None):
|
||||||
if include_only_properties is not None:
|
if include_only_properties is not None:
|
||||||
@@ -96,6 +100,9 @@ class PointIndex(object):
|
|||||||
def properties_key(self, i):
|
def properties_key(self, i):
|
||||||
return 'props:{}'.format(i)
|
return 'props:{}'.format(i)
|
||||||
|
|
||||||
|
def get_properties(self, i):
    """Return the value stored in ``self.points_db`` for point ``i``.

    The lookup key is built by ``self.properties_key(i)``; the value is
    returned exactly as ``points_db.Get`` provides it.
    """
    return self.points_db.Get(self.properties_key(i))
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
self.save_index()
|
self.save_index()
|
||||||
self.save_properties(os.path.join(self.save_dir, self.PROPS_FILENAME))
|
self.save_properties(os.path.join(self.save_dir, self.PROPS_FILENAME))
|
||||||
@@ -121,16 +128,21 @@ class PointIndex(object):
|
|||||||
|
|
||||||
def point_distances(self, latitude, longitude):
|
def point_distances(self, latitude, longitude):
|
||||||
candidates = self.get_candidate_points(latitude, longitude)
|
candidates = self.get_candidate_points(latitude, longitude)
|
||||||
return [(i, lat, lon, haversine_distance(latitude, longitude, lat, lon)) for i, lat, lon in candidates]
|
|
||||||
|
|
||||||
def nearest_n_points(self, latitude, longitude, n=2):
|
return [(i, self.points[i][0], self.points[i][1],
|
||||||
|
haversine_distance(latitude, longitude, *self.points[i])) for i in candidates]
|
||||||
|
|
||||||
|
def all_nearby_points(self, latitude, longitude):
|
||||||
distances = self.point_distances(latitude, longitude)
|
distances = self.point_distances(latitude, longitude)
|
||||||
if not distances:
|
if not distances:
|
||||||
return None
|
return []
|
||||||
return sorted(distances, key=operator.itemgetter(-1))[:n]
|
return sorted(distances, key=operator.itemgetter(-1))
|
||||||
|
|
||||||
|
def nearest_n_points(self, latitude, longitude, n=2):
    """Return the first ``n`` results of ``self.all_nearby_points``.

    The result is a slice of whatever ``all_nearby_points(latitude,
    longitude)`` returns, so it holds fewer than ``n`` items when fewer are
    available.
    """
    return self.all_nearby_points(latitude, longitude)[:n]
|
||||||
|
|
||||||
def nearest_point(self, latitude, longitude):
|
def nearest_point(self, latitude, longitude):
|
||||||
distances = self.nearest_n_points(latitude, longitude, n=1)
|
distances = self.all_nearby_points(latitude, longitude)
|
||||||
if not distances:
|
if not distances:
|
||||||
return None
|
return None
|
||||||
return distances[0]
|
return distances[0]
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import argparse
|
import argparse
|
||||||
|
import operator
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import ujson as json
|
import ujson as json
|
||||||
@@ -8,6 +9,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|||||||
|
|
||||||
from geodata.polygons.index import *
|
from geodata.polygons.index import *
|
||||||
from geodata.i18n.languages import *
|
from geodata.i18n.languages import *
|
||||||
|
from geodata.language_id.disambiguation import disambiguate_language, AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES
|
||||||
|
|
||||||
country_language_dir = os.path.join(LANGUAGES_DIR, 'countries')
|
country_language_dir = os.path.join(LANGUAGES_DIR, 'countries')
|
||||||
regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional')
|
regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional')
|
||||||
@@ -158,6 +160,72 @@ class LanguagePolygonIndex(RTreePolygonIndex):
|
|||||||
candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()
|
candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()
|
||||||
return sorted(candidates, key=self.admin_level, reverse=True)
|
return sorted(candidates, key=self.admin_level, reverse=True)
|
||||||
|
|
||||||
|
def country_and_languages(self, latitude, longitude):
    """Return the country code and candidate languages for a coordinate.

    Calls ``self.point_in_poly(latitude, longitude, return_all=True)`` and
    merges the ``languages`` entries of every returned polygon, keeping the
    first occurrence of each language code.

    Returns a ``(country, languages, props)`` tuple:

    * ``country`` -- lower-cased ``qs_iso_cc`` of the first polygon.
    * ``languages`` -- list of ``{'lang': ..., 'default': ...}`` dicts sorted
      so that entries with the highest ``default`` value come first.
    * ``props`` -- the raw list returned by ``point_in_poly``.

    Returns ``(None, None, None)`` when no polygon contains the point.
    """
    props = self.point_in_poly(latitude, longitude, return_all=True)
    if not props:
        return None, None, None

    country = props[0]['qs_iso_cc'].lower()
    languages = []
    seen_langs = set()

    # Becomes True once a polygon with admin_level > 0 contributes a default
    # language; every language first seen after that which is not itself
    # such a regional default is demoted to default=0.
    have_regional = False

    for poly in props:
        for entry in poly['languages']:
            lang = entry['lang']
            if lang in seen_langs:
                continue
            seen_langs.add(lang)

            if poly['admin_level'] > 0 and entry['default']:
                have_regional = True
            elif have_regional:
                # Build a new dict rather than mutating the polygon's own
                # properties.
                entry = {'lang': lang, 'default': 0}
            languages.append(entry)

    # Python's builtin sort is stable, so if there are two defaults, the first remains first
    # Since polygons are returned from the index ordered from smallest admin level to largest,
    # it means the default language of the region overrides the country default
    default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True)
    return country, default_languages, props
|
||||||
|
|
||||||
|
def best_country_and_language(self, latitude, longitude, name):
    """Pick the country and the most likely language of ``name`` at a point.

    Candidate languages come from ``self.country_and_languages``.  Returns a
    ``(country, language)`` tuple, or ``(None, None)`` when the point matches
    no polygon or has no candidate languages.  ``language`` may be
    ``UNKNOWN_LANGUAGE`` or ``AMBIGUOUS_LANGUAGE`` when the name cannot be
    resolved to a single language.
    """
    country, candidate_languages, language_props = self.country_and_languages(latitude, longitude)
    if not (country and candidate_languages):
        return None, None

    # Candidates are sorted defaults-first, so the first one is the default.
    default_lang = candidate_languages[0]['lang']

    # A single candidate needs no disambiguation and none of the tallies below.
    if len(candidate_languages) == 1:
        return country, default_lang

    num_defaults = len({c['lang'] for c in candidate_languages if c.get('default')})

    # Count default languages separately for regional (admin_level > 0) and
    # country-level polygons, and collect the country-level language codes.
    regional_defaults = 0
    country_defaults = 0
    country_langs = set()
    for poly in language_props:
        if poly['admin_level'] > 0:
            regional_defaults += sum(1 for c in poly['languages'] if c.get('default'))
        else:
            country_defaults += sum(1 for c in poly['languages'] if c.get('default'))
            country_langs.update(c['lang'] for c in poly['languages'])

    lang = disambiguate_language(name, [(c['lang'], c['default']) for c in candidate_languages])

    if lang == UNKNOWN_LANGUAGE and num_defaults == 1:
        return country, default_lang
    elif lang == AMBIGUOUS_LANGUAGE:
        return country, lang
    elif lang != UNKNOWN_LANGUAGE:
        # A well-represented country-level language that differs from the
        # default is reported as unknown when the country has several
        # defaults and a regional default also applies.
        if (lang != default_lang and lang in country_langs and
                country_defaults > 1 and regional_defaults > 0 and
                lang in WELL_REPRESENTED_LANGUAGES):
            return country, UNKNOWN_LANGUAGE
        return country, lang
    else:
        # Still unknown and no unique default to fall back on.
        return country, lang
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Handle argument parsing here
|
# Handle argument parsing here
|
||||||
|
|||||||
Reference in New Issue
Block a user