Files
libpostal-addrss/scripts/geodata/polygons/language_polys.py
2025-09-06 22:03:29 -04:00

244 lines
9.4 KiB
Python

import argparse
import operator
import os
import sys
import ujson as json
# Absolute, symlink-resolved directory containing this script.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Put the directory two levels above this file on sys.path so the `geodata`
# package imported below can be found. The path is anchored at this_dir rather
# than at the process's current working directory, so the script behaves the
# same no matter where it is launched from.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.polygons.index import *
from geodata.i18n.languages import *
from geodata.language_id.disambiguation import disambiguate_language, AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES
# Sub-directories of LANGUAGES_DIR (a name brought in by one of the star
# imports above -- presumably geodata.i18n.languages; confirm). Judging by the
# names these hold per-country and regional language data. Neither variable is
# referenced again in the part of the file shown here.
country_language_dir = os.path.join(LANGUAGES_DIR, 'countries')
regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional')
class LanguagePolygonIndex(RTreePolygonIndex):
    """Polygon index whose polygons carry the languages spoken inside them.

    Polygons are read from admin 0 / admin 1 / admin 1 region shapefiles and
    each one is stored with a ``languages`` list (``{'lang', 'default'}``
    dicts) and a numeric ``admin_level`` (0 for countries, 1 for admin 1
    polygons and admin 1 regions).
    """

    DEFAULT_POLYS_FILENAME = 'polygons.geojson'
    ADMIN_LEVELS_FILENAME = 'admin_levels.json'

    # Property names of interest on each polygon.
    # NOTE(review): presumably consumed by the base class to filter stored
    # properties -- confirm against RTreePolygonIndex.
    include_only_properties = {
        'qs_a0',
        'qs_iso_cc',
        'qs_a1',
        'qs_a1_lc',
        'qs_a1r',
        'qs_a1r_lc',
        'qs_level',
        'languages',
        'admin_level',
    }

    @classmethod
    def create_from_shapefiles(cls,
                               admin0_shapefile,
                               admin1_shapefile,
                               admin1_region_file,
                               output_dir,
                               index_filename=None,
                               polys_filename=DEFAULT_POLYS_FILENAME):
        """Build an index from the three admin shapefiles.

        :param admin0_shapefile: path to the admin 0 (country) shapefile
        :param admin1_shapefile: path to the admin 1 shapefile
        :param admin1_region_file: path to the admin 1 region shapefile
        :param output_dir: passed to the constructor as ``save_dir``
        :param index_filename: passed through to the constructor
        :param polys_filename: accepted for interface compatibility; not used
            by this method
        :return: the populated index instance

        Records without a usable geometry, with an unrecognised ``qs_level``,
        with a geometry type other than Polygon/MultiPolygon, or (for admin 1
        levels) without any regional language entry are skipped.
        """
        init_languages()
        index = cls(save_dir=output_dir, index_filename=index_filename)

        # qs_level -> (name property, code property, numeric admin level)
        level_keys = {
            'adm0': (None, None, 0),
            'adm1': ('qs_a1', 'qs_a1_lc', 1),
            'adm1_region': ('qs_a1r', 'qs_a1r_lc', 1),
        }

        # Ordering of the files is important here as we want to match
        # the most granular admin polygon first for regional languages. Currently
        # most regional languages as they would apply to street signage are
        # regional in terms of an admin 1 level (states, provinces, regions)
        for input_file in (admin0_shapefile, admin1_region_file, admin1_shapefile):
            # Context manager closes the collection once the file is consumed.
            with fiona.open(input_file) as records:
                for rec in records:
                    geometry = rec.get('geometry') if rec else None
                    if not geometry or 'type' not in geometry:
                        continue

                    # Work on a copy so the source record is left untouched.
                    properties = dict(rec['properties'])
                    country = properties['qs_iso_cc'].lower()

                    level = level_keys.get(properties['qs_level'])
                    if level is None:
                        continue
                    name_key, code_key, level_num = level

                    if level_num == 0:
                        languages = get_country_languages(country).items()
                    else:
                        # Look the region up by name first, then by code.
                        regional = (
                            get_regional_languages(country, name_key,
                                                   properties.get(name_key))
                            or get_regional_languages(country, code_key,
                                                      properties.get(code_key))
                        )
                        if not regional:
                            continue

                        if any(regional.values()):
                            # The region declares at least one default
                            # language: use the regional languages alone.
                            languages = regional.items()
                        else:
                            # No regional default: start from the country's
                            # languages and layer the regional ones on top.
                            # NOTE(review): get_country_languages may hand back
                            # a shared mapping; copy before updating so regional
                            # entries cannot leak into later lookups -- confirm.
                            merged = get_country_languages(country).copy()
                            merged.update(regional)
                            languages = merged.items()

                    properties['languages'] = [{'lang': lang, 'default': default}
                                               for lang, default in languages]
                    properties['admin_level'] = level_num

                    poly_type = geometry['type']
                    coordinates = geometry['coordinates']
                    if poly_type == 'Polygon':
                        # Only the first ring of the polygon is used.
                        poly = cls.to_polygon(coordinates[0])
                        index.index_polygon(poly)
                        index.add_polygon(index.simplify_polygon(poly), properties)
                    elif poly_type == 'MultiPolygon':
                        polys = []
                        for coords in coordinates:
                            # Only the first ring of each member polygon is used.
                            poly = cls.to_polygon(coords[0])
                            polys.append(poly)
                            index.index_polygon(poly)
                        multi_poly = index.simplify_polygon(MultiPolygon(polys))
                        index.add_polygon(multi_poly, properties)
                    # Any other geometry type is ignored.

        return index

    @classmethod
    def create_with_quattroshapes(cls, quattroshapes_dir,
                                  output_dir,
                                  index_filename=None,
                                  polys_filename=DEFAULT_POLYS_FILENAME):
        """Build an index from the ``qs_adm0.shp``, ``qs_adm1.shp`` and
        ``qs_adm1_region.shp`` files found in *quattroshapes_dir*.

        All other arguments are forwarded to :meth:`create_from_shapefiles`.
        """
        admin0_filename = os.path.join(quattroshapes_dir, 'qs_adm0.shp')
        admin1_filename = os.path.join(quattroshapes_dir, 'qs_adm1.shp')
        admin1r_filename = os.path.join(quattroshapes_dir, 'qs_adm1_region.shp')

        return cls.create_from_shapefiles(admin0_filename, admin1_filename, admin1r_filename,
                                          output_dir, index_filename=index_filename,
                                          polys_filename=polys_filename)

    # NOTE(review): the next four methods look like hooks invoked by the base
    # class during construction / add / load / save -- confirm.

    def setup(self):
        """Start with an empty list of per-polygon admin levels."""
        self.admin_levels = []

    def index_polygon_properties(self, properties):
        """Record the ``admin_level`` of a polygon's properties."""
        self.admin_levels.append(properties['admin_level'])

    def load_polygon_properties(self, d):
        """Read the admin level list from ``ADMIN_LEVELS_FILENAME`` in *d*."""
        with open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)) as f:
            self.admin_levels = json.load(f)

    def save_polygon_properties(self, d):
        """Write the admin level list to ``ADMIN_LEVELS_FILENAME`` in *d*."""
        # Closing the handle explicitly guarantees the JSON is flushed to disk.
        with open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w') as f:
            json.dump(self.admin_levels, f)

    def admin_level(self, i):
        """Return the admin level recorded at position *i*."""
        return self.admin_levels[i]

    def get_candidate_polygons(self, lat, lon):
        """Return candidate polygon ids for a point, highest admin level first.

        Ids come from ``self.index.intersection`` queried with the degenerate
        box ``(lon, lat, lon, lat)``; duplicates are dropped keeping first
        occurrence. The sort is stable, so ids with equal admin level keep the
        order the index returned them in.
        """
        candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()
        return sorted(candidates, key=self.admin_level, reverse=True)

    def country_and_languages(self, latitude, longitude):
        """Return ``(country, languages, props)`` for a coordinate.

        ``props`` is whatever ``point_in_poly(..., return_all=True)`` returns;
        ``country`` is the lower-cased ``qs_iso_cc`` of its first entry and
        ``languages`` is the de-duplicated list of language dicts with
        defaults sorted first. Returns ``(None, None, None)`` when no polygon
        matches.
        """
        props = self.point_in_poly(latitude, longitude, return_all=True)
        if not props:
            return None, None, None

        country = props[0]['qs_iso_cc'].lower()
        languages = []
        seen = set()
        have_regional = False

        for p in props:
            for entry in p['languages']:
                lang = entry['lang']
                if lang in seen:
                    continue
                seen.add(lang)

                if p['admin_level'] > 0 and entry['default']:
                    have_regional = True
                elif have_regional:
                    # A regional default was already seen: every later language
                    # that is not itself a regional default is demoted. A new
                    # dict is built so the stored properties are not modified.
                    entry = {'lang': lang, 'default': 0}
                languages.append(entry)

        # Python's builtin sort is stable, so if there are two defaults, the
        # first remains first. get_candidate_polygons orders candidates by
        # descending admin level, so (assuming point_in_poly preserves that
        # order) regional polygons are visited before the country polygon and
        # the default language of the region overrides the country default.
        default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True)
        return country, default_languages, props

    def best_country_and_language(self, latitude, longitude, name):
        """Return ``(country, language)`` for *name* located at a coordinate.

        The language is the single candidate when there is only one; otherwise
        the result of ``disambiguate_language`` on *name*, with two
        adjustments: an unknown result falls back to the top candidate when
        exactly one default exists, and a non-default, well-represented
        country language is reported as unknown when the country has several
        defaults and a regional default is present. Returns ``(None, None)``
        when the coordinate matches nothing or no languages are known.
        """
        country, candidate_languages, language_props = self.country_and_languages(latitude, longitude)
        if not (country and candidate_languages):
            return None, None

        if len(candidate_languages) == 1:
            return country, candidate_languages[0]['lang']

        num_defaults = len(set(l['lang'] for l in candidate_languages if l.get('default')))

        # Tally defaults per level and collect the country-level languages.
        regional_defaults = 0
        country_defaults = 0
        country_langs = set()
        for p in language_props:
            defaults_here = sum(1 for l in p['languages'] if l.get('default'))
            if p['admin_level'] > 0:
                regional_defaults += defaults_here
            else:
                country_defaults += defaults_here
                country_langs.update(l['lang'] for l in p['languages'])

        lang = disambiguate_language(name, [(l['lang'], l['default']) for l in candidate_languages])
        default_lang = candidate_languages[0]['lang']

        if lang == UNKNOWN_LANGUAGE:
            # With a single default there is nothing to be ambiguous about.
            return country, (default_lang if num_defaults == 1 else lang)

        if lang == AMBIGUOUS_LANGUAGE:
            return country, lang

        overridden_by_region = (
            lang != default_lang
            and lang in country_langs
            and country_defaults > 1
            and regional_defaults > 0
            and lang in WELL_REPRESENTED_LANGUAGES
        )
        if overridden_by_region:
            return country, UNKNOWN_LANGUAGE
        return country, lang
if __name__ == '__main__':
    # Command-line entry point: build the language polygon index from a
    # quattroshapes directory and save it to the output directory.
    parser = argparse.ArgumentParser()

    # Required: without it os.path.join() would be handed None and fail with
    # an unhelpful error instead of a usage message.
    parser.add_argument('-q', '--quattroshapes-dir',
                        required=True,
                        help='Path to quattroshapes dir')

    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    args = parser.parse_args()

    index = LanguagePolygonIndex.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir)
    index.save()