From fc91471434c01266a6dfc28afe94efb1f3440511 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 9 Apr 2017 02:15:42 -0400 Subject: [PATCH] [osm/boundaries] check polygons with an ISO3166-2 as well in the country polygon index in case the country polygon is funky --- scripts/geodata/countries/constants.py | 8 ++++++++ scripts/geodata/osm/admin_boundaries.py | 2 +- scripts/geodata/osm/fetch_osm_address_data.sh | 3 ++- scripts/geodata/polygons/reverse_geocode.py | 13 ++++++++++++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/scripts/geodata/countries/constants.py b/scripts/geodata/countries/constants.py index 0f2923aa..b22454d4 100644 --- a/scripts/geodata/countries/constants.py +++ b/scripts/geodata/countries/constants.py @@ -1,3 +1,5 @@ +import pycountry + class Countries(object): AFGHANISTAN = 'af' @@ -252,3 +254,9 @@ class Countries(object): FORMER_SOVIET_UNION_COUNTRIES = set([RUSSIA, UKRAINE, BELARUS, KAZAKHSTAN, AZERBAIJAN, KYRGYZSTAN, GEORGIA, UZBEKISTAN, ARMENIA, TAJIKISTAN, MOLDOVA, TURKMENISTAN, LATVIA, LITHUANIA, ESTONIA]) CJK_COUNTRIES = set([CHINA, JAPAN, SOUTH_KOREA, TAIWAN, HONG_KONG, MACAO]) + + all_country_iso_codes = set([c.alpha2.lower() for c in pycountry.countries]) + + @classmethod + def is_valid_country_code(cls, alpha2_code): + return alpha2_code and alpha2_code.lower() in cls.all_country_iso_codes diff --git a/scripts/geodata/osm/admin_boundaries.py b/scripts/geodata/osm/admin_boundaries.py index 9bd4e69e..0a9c942f 100644 --- a/scripts/geodata/osm/admin_boundaries.py +++ b/scripts/geodata/osm/admin_boundaries.py @@ -315,7 +315,7 @@ class OSMBuildingPolygonReader(OSMPolygonReader): class OSMCountryPolygonReader(OSMPolygonReader): def include_polygon(self, props): - return 'ISO3166-1:alpha2' in props or (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids + return 'ISO3166-1:alpha2' in props or 'ISO3166-2' in props or (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids 
class OSMNeighborhoodPolygonReader(OSMPolygonReader): diff --git a/scripts/geodata/osm/fetch_osm_address_data.sh b/scripts/geodata/osm/fetch_osm_address_data.sh index 7534616a..82f814f5 100755 --- a/scripts/geodata/osm/fetch_osm_address_data.sh +++ b/scripts/geodata/osm/fetch_osm_address_data.sh @@ -123,6 +123,7 @@ PLANET_BORDERS="planet-borders.osm" PLANET_ADMIN_BORDERS_OSM="planet-admin-borders.osm" VALID_COUNTRY_KEYS="ISO3166-1:alpha2=" +VALID_ADMIN1_KEYS="ISO3166-2=" ADMIN1_LANGUAGE_EXCEPTION_IDS=$(grep "osm" $ADMIN1_FILE | sed 's/^.*relation:\([0-9][0-9]*\).*$/@id=\1/' | xargs echo | sed 's/\s/ or /g') VALID_ADMIN_BORDER_KEYS="boundary=administrative or boundary=town or boundary=city_limit or boundary=civil_parish or boundary=civil or boundary=ceremonial or boundary=postal_district or place=island or place=city or place=town or place=village or place=hamlet or place=municipality or place=settlement" @@ -147,7 +148,7 @@ osmconvert $PLANET_BORDERS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANE rm $PLANET_BORDERS_O5M osmfilter $PLANET_BORDERS_LATLONS --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" -o=$PLANET_BORDERS rm $PLANET_BORDERS_LATLONS -osmfilter $PLANET_O5M --keep="$VALID_COUNTRY_KEYS or $ADMIN1_LANGUAGE_EXCEPTION_IDS" --drop-author --drop-version -o=$PLANET_COUNTRIES +osmfilter $PLANET_O5M --keep="$VALID_COUNTRY_KEYS or $VALID_ADMIN1_KEYS or $ADMIN1_LANGUAGE_EXCEPTION_IDS" --drop-author --drop-version -o=$PLANET_COUNTRIES echo "Filtering for neighborhoods" PLANET_LOCALITIES="planet-localities.osm" diff --git a/scripts/geodata/polygons/reverse_geocode.py b/scripts/geodata/polygons/reverse_geocode.py index c8a57c9b..bd4e570b 100644 --- a/scripts/geodata/polygons/reverse_geocode.py +++ b/scripts/geodata/polygons/reverse_geocode.py @@ -28,6 +28,7 @@ this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) from geodata.coordinates.conversion import latlon_to_decimal +from 
geodata.countries.constants import Countries from geodata.encoding import safe_decode from geodata.file_utils import ensure_dir, download_file from geodata.i18n.unicode_properties import get_chars_by_script @@ -298,6 +299,7 @@ class OSMReverseGeocoder(RTreePolygonIndex): 'name:*', 'ISO3166-1:alpha2', 'ISO3166-1:alpha3', + 'ISO3166-2', 'int_name', 'official_name', 'official_name:*', @@ -492,7 +494,16 @@ class OSMCountryReverseGeocoder(OSMReverseGeocoder): if country: break else: - return None, [] + # See if there's an ISO3166-2 code that matches + # in case the country polygon is wacky + for c in osm_components: + admin1 = c.get('ISO3166-2') + if admin1: + # If so, and if the country is valid, use that + country = admin1[:2] + if not Countries.is_valid_country_code(country.lower()): + return None, [] + break country = country.lower()