diff --git a/scripts/geodata/language_id/polygon_lookup.py b/scripts/geodata/language_id/polygon_lookup.py deleted file mode 100644 index 32281d3a..00000000 --- a/scripts/geodata/language_id/polygon_lookup.py +++ /dev/null @@ -1,69 +0,0 @@ -import operator - -from geodata.language_id.disambiguation import disambiguate_language, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES - - -def country_and_languages(language_rtree, latitude, longitude): - props = language_rtree.point_in_poly(latitude, longitude, return_all=True) - if not props: - return None, None, None - - country = props[0]['qs_iso_cc'].lower() - languages = [] - language_set = set() - - have_regional = False - - for p in props: - for l in p['languages']: - lang = l['lang'] - if lang not in language_set: - language_set.add(lang) - if p['admin_level'] > 0 and l['default']: - have_regional = True - elif have_regional: - l = {'lang': l['lang'], 'default': 0} - languages.append(l) - - # Python's builtin sort is stable, so if there are two defaults, the first remains first - # Since polygons are returned from the index ordered from smallest admin level to largest, - # it means the default language of the region overrides the country default - default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True) - return country, default_languages, props - - -def best_country_and_language(language_rtree, latitude, longitude, name): - country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude) - if not (country and candidate_languages): - return None, None - - num_langs = len(candidate_languages) - default_langs = set([l['lang'] for l in candidate_languages if l.get('default')]) - num_defaults = len(default_langs) - - regional_defaults = 0 - country_defaults = 0 - regional_langs = set() - country_langs = set() - for p in language_props: - if p['admin_level'] > 0: - regional_defaults += sum((1 for lang in p['languages'] if lang.get('default'))) - regional_langs |= set([l['lang'] for l in p['languages']]) - else: - country_defaults += sum((1 for lang in p['languages'] if lang.get('default'))) - country_langs |= set([l['lang'] for l in p['languages']]) - - if num_langs == 1: - return country, candidate_languages[0]['lang'] - else: - lang = disambiguate_language(name, [(l['lang'], l['default']) for l in candidate_languages]) - default_lang = candidate_languages[0]['lang'] - - if lang == UNKNOWN_LANGUAGE and num_defaults == 1: - return country, default_lang - elif lang != UNKNOWN_LANGUAGE: - if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES: - return country, UNKNOWN_LANGUAGE - return country, lang - else: - return None, None diff --git a/scripts/geodata/points/index.py b/scripts/geodata/points/index.py index 6470a7a4..b6a56a7f 100644 --- a/scripts/geodata/points/index.py +++ b/scripts/geodata/points/index.py @@ -24,6 +24,7 @@ class PointIndex(object): INDEX_FILENAME = 'index.json' def __init__(self, index=None, save_dir=None, + points=None, points_db=None, points_db_path=None, index_path=None, @@ -45,6 +46,11 @@ class PointIndex(object): else: self.index = index + if not points: + self.points = [] + else: + self.points = points + if not points_db_path: points_db_path = os.path.join(save_dir or '.', self.POINTS_DB_DIR) @@ -57,14 +63,12 @@ class PointIndex(object): self.i = 0 - def create_index(self, overwrite=False): - self.index = defaultdict(list) - def index_point(self, lat, lon): code = geohash.encode(lat, lon)[:self.precision] for key in [code] + geohash.neighbors(code): - self.index[key].append((self.i, lat, lon)) + self.index[key].append(self.i) + self.points.append((lat, lon)) def add_point(self, lat, lon, properties, cache=False, include_only_properties=None): if include_only_properties is not None: @@ -96,6 +100,9 @@ class PointIndex(object): def properties_key(self, i): return 'props:{}'.format(i) + def get_properties(self, i): + return self.points_db.Get(self.properties_key(i)) + def save(self): self.save_index() self.save_properties(os.path.join(self.save_dir, self.PROPS_FILENAME)) @@ -121,16 +128,21 @@ class PointIndex(object): def point_distances(self, latitude, longitude): candidates = self.get_candidate_points(latitude, longitude) - return [(i, lat, lon, haversine_distance(latitude, longitude, lat, lon)) for i, lat, lon in candidates] - def nearest_n_points(self, latitude, longitude, n=2): + return [(i, self.points[i][0], self.points[i][1], + haversine_distance(latitude, longitude, *self.points[i])) for i in candidates] + + def all_nearby_points(self, latitude, longitude): distances = self.point_distances(latitude, longitude) if not distances: - return None - return sorted(distances, key=operator.itemgetter(-1))[:n] + return [] + return sorted(distances, key=operator.itemgetter(-1)) + + def nearest_n_points(self, latitude, longitude, n=2): + return self.all_nearby_points(latitude, longitude)[:n] def nearest_point(self, latitude, longitude): - distances = self.nearest_n_points(latitude, longitude, n=1) + distances = self.all_nearby_points(latitude, longitude) if not distances: return None return distances[0] diff --git a/scripts/geodata/polygons/language_polys.py b/scripts/geodata/polygons/language_polys.py index 8d73d441..539ecf95 100644 --- a/scripts/geodata/polygons/language_polys.py +++ b/scripts/geodata/polygons/language_polys.py @@ -1,4 +1,5 @@ import argparse +import operator import os import sys import ujson as json @@ -8,6 +9,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) from geodata.polygons.index import * from geodata.i18n.languages import * +from geodata.language_id.disambiguation import disambiguate_language, AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES country_language_dir = os.path.join(LANGUAGES_DIR, 'countries') regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional') @@ -158,6 +160,72 @@ class LanguagePolygonIndex(RTreePolygonIndex): candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys() return sorted(candidates, key=self.admin_level, reverse=True) + def country_and_languages(self, latitude, longitude): + props = self.point_in_poly(latitude, longitude, return_all=True) + if not props: + return None, None, None + + country = props[0]['qs_iso_cc'].lower() + languages = [] + language_set = set() + + have_regional = False + + for p in props: + for l in p['languages']: + lang = l['lang'] + if lang not in language_set: + language_set.add(lang) + if p['admin_level'] > 0 and l['default']: + have_regional = True + elif have_regional: + l = {'lang': l['lang'], 'default': 0} + languages.append(l) + + # Python's builtin sort is stable, so if there are two defaults, the first remains first + # Since polygons are returned from the index ordered from smallest admin level to largest, + # it means the default language of the region overrides the country default + default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True) + return country, default_languages, props + + def best_country_and_language(self, latitude, longitude, name): + country, candidate_languages, language_props = self.country_and_languages(latitude, longitude) + if not (country and candidate_languages): + return None, None + + num_langs = len(candidate_languages) + default_langs = set([l['lang'] for l in candidate_languages if l.get('default')]) + num_defaults = len(default_langs) + + regional_defaults = 0 + country_defaults = 0 + regional_langs = set() + country_langs = set() + for p in language_props: + if p['admin_level'] > 0: + regional_defaults += sum((1 for lang in p['languages'] if lang.get('default'))) + regional_langs |= set([l['lang'] for l in p['languages']]) + else: + country_defaults += sum((1 for lang in p['languages'] if lang.get('default'))) + country_langs |= set([l['lang'] for l in p['languages']]) + + if num_langs == 1: + return country, candidate_languages[0]['lang'] + else: + lang = disambiguate_language(name, [(l['lang'], l['default']) for l in candidate_languages]) + default_lang = candidate_languages[0]['lang'] + + if lang == UNKNOWN_LANGUAGE and num_defaults == 1: + return country, default_lang + elif lang == AMBIGUOUS_LANGUAGE: + return country, lang + elif lang != UNKNOWN_LANGUAGE: + if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES: + return country, UNKNOWN_LANGUAGE + return country, lang + else: + return country, lang + if __name__ == '__main__': # Handle argument parsing here