From d15737b319bf0cb96496380221eec267d1894b7f Mon Sep 17 00:00:00 2001
From: Al
Date: Mon, 13 Jul 2015 21:08:08 -0400
Subject: [PATCH] [osm] Validating lat/lon in OSM training data

get_language_names() now parses value['lat'] / value['lon'] with the new
latlon_to_floats() helper and returns (None, None) when parsing raises,
instead of calling float() on the raw values. The DEFAULT_PLANET_URL
constant is removed.

---
Reviewer notes on this revision (ignored by git am):
 - DMS branches now apply the direction sign, which was computed but
   never used.
 - A leading '-' on the degrees is applied after minutes/seconds are added.
 - Longitude patterns: the optional '-' and the fraction now apply to both
   alternatives, so values such as "122.4194 W" match.
 - Decimal-with-direction patterns accept one-digit degrees and an omitted
   fraction, and are unicode patterns like the DMS ones.
 - Unused have_lat / have_lon flags dropped; the duplicated latitude and
   longitude logic moved into _coordinate_to_float().
 - safe_decode, direction_sign, degrees_to_decimal, beginning_re and end_re
   are not added by this patch; they are assumed to exist in the target
   module -- confirm.
 - The "index" line still carries the blob ids of the original patch.

 .../geodata/osm/osm_address_training_data.py  | 68 ++++++++++++++++++-
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index d444157a..a7dacd35 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -40,8 +40,6 @@ PLANET_WAYS_OUTPUT_FILE = 'planet-ways.tsv'
 PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
 PLANET_VENUES_OUTPUT_FILE = 'planet-venues.tsv'
 
-DEFAULT_PLANET_URL = 'http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf'
-
 ALL_OSM_TAGS = set(['node', 'way', 'relation'])
 
 ONLY_WAYS = set(['way'])
@@ -240,6 +238,65 @@ def normalize_osm_name_tag(tag, script=False):
 WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
 
 
+# Coordinate patterns. They are unicode patterns so that the degree signs in
+# the character classes are single code points rather than encoded bytes.
+#
+# Degrees/minutes/seconds with an optional trailing direction letter.
+# NOTE(review): under re.I the optional seconds marker "s" also matches a
+# trailing "S", which is then not captured as the direction -- confirm.
+latitude_dms_regex = re.compile(ur'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$', re.I | re.UNICODE)
+longitude_dms_regex = re.compile(ur'^(-?(?:1[0-8][0-9]|0?[0-9]{1,2}))[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$', re.I | re.UNICODE)
+
+# Decimal degrees followed by a mandatory direction letter, e.g. "122.4194 W".
+latitude_decimal_with_direction_regex = re.compile(ur'^(-?[0-9]{1,2}(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I | re.UNICODE)
+longitude_decimal_with_direction_regex = re.compile(ur'^(-?(?:1[0-8][0-9]|0?[0-9]{1,2})(?:\.[0-9]+)?)[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I | re.UNICODE)
+
+
+def _coordinate_to_float(value, dms_regex, decimal_with_direction_regex):
+    """Convert one normalized coordinate string to signed decimal degrees."""
+    dms = dms_regex.match(value)
+    if dms:
+        d, m, s, c = dms.groups()
+        # Convert the magnitude and apply the signs afterwards, so minutes and
+        # seconds never pull a negative degree value back toward zero.
+        decimal = float(degrees_to_decimal(d.lstrip(u'-'), m or 0, s or 0))
+        if d.startswith(u'-'):
+            decimal = -decimal
+        # The direction letter is optional in the DMS patterns.
+        return decimal * (direction_sign(c) if c else 1)
+
+    with_direction = decimal_with_direction_regex.match(value)
+    if with_direction:
+        d, c = with_direction.groups()
+        return float(d) * direction_sign(c)
+
+    # Plain number: drop whatever beginning_re / end_re match, then parse.
+    value = re.sub(beginning_re, u'', value)
+    value = re.sub(end_re, u'', value)
+    return float(value)
+
+
+def latlon_to_floats(latitude, longitude):
+    """Parse a latitude/longitude pair into a tuple of floats.
+
+    Each value may be a plain decimal, a decimal followed by a direction
+    letter, or degrees/minutes/seconds; a decimal comma is accepted.
+
+    :param latitude: raw latitude value
+    :param longitude: raw longitude value
+    :returns: (latitude, longitude) as floats
+    :raises ValueError: if a value cannot be converted by float()
+    """
+    latitude = safe_decode(latitude).strip(u' ,;|').replace(u',', u'.')
+    longitude = safe_decode(longitude).strip(u' ,;|').replace(u',', u'.')
+
+    latitude = _coordinate_to_float(latitude, latitude_dms_regex,
+                                    latitude_decimal_with_direction_regex)
+    longitude = _coordinate_to_float(longitude, longitude_dms_regex,
+                                     longitude_decimal_with_direction_regex)
+    return latitude, longitude
+
+
 def country_and_languages(language_rtree, latitude, longitude):
     props = language_rtree.point_in_poly(latitude, longitude)
     if not props or not props.get('languages'):
@@ -258,7 +315,12 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
     tag_first_component = tag_prefix.split(':')[0]
     tag_last_component = tag_prefix.split(':')[-1]
 
-    country, default_languages = country_and_languages(language_rtree, float(value['lat']), float(value['lon']))
+    try:
+        latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
+    except Exception:
+        return None, None
+
+    country, default_languages = country_and_languages(language_rtree, latitude, longitude)
     if not (country and default_languages):
         return None, None
 