diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 16dc9f62..bb4df628 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -14,7 +14,7 @@ from geodata.countries.country_names import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.sample import sample_random_language
 from geodata.names.normalization import name_affixes
-from geodata.osm.extract import osm_address_components
+from geodata.osm.components import osm_address_components
 from geodata.states.state_abbreviations import state_abbreviations
 
 
diff --git a/scripts/geodata/osm/components.py b/scripts/geodata/osm/components.py
new file mode 100644
index 00000000..d1761d10
--- /dev/null
+++ b/scripts/geodata/osm/components.py
@@ -0,0 +1,87 @@
+import os
+import yaml
+
+from geodata.address_formatting.formatter import AddressFormatter
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                  'resources', 'boundaries', 'osm')
+
+
+class OSMAddressComponents(object):
+    '''
+    Keeps a map of OSM keys and values to the standard components
+    of an address like city, state, etc. used for address formatting.
+    When we reverse geocode a point, it will fall into a number of
+    polygons, and we simply need to assign the names of said polygons
+    to an address field.
+
+    Per-country mappings are loaded from the <country_code>.yaml files
+    in boundaries_dir. global_keys supplies the mapping for any property
+    that a country's file has no entries for.
+    '''
+
+    ADMIN_LEVEL = 'admin_level'
+
+    # These keys are country-independent
+    global_keys = {
+        'place': {
+            'country': AddressFormatter.COUNTRY,
+            'state': AddressFormatter.STATE,
+            'region': AddressFormatter.STATE,
+            'province': AddressFormatter.STATE,
+            'county': AddressFormatter.STATE_DISTRICT,
+            'island': AddressFormatter.ISLAND,
+            'islet': AddressFormatter.ISLAND,
+            'municipality': AddressFormatter.CITY,
+            'city': AddressFormatter.CITY,
+            'town': AddressFormatter.CITY,
+            'township': AddressFormatter.CITY,
+            'village': AddressFormatter.CITY,
+            'hamlet': AddressFormatter.CITY,
+            'borough': AddressFormatter.CITY_DISTRICT,
+            'suburb': AddressFormatter.SUBURB,
+            'quarter': AddressFormatter.SUBURB,
+            'neighbourhood': AddressFormatter.SUBURB
+        }
+    }
+
+    def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
+        '''
+        Load and validate every .yaml file found in boundaries_dir.
+
+        Raises ValueError if a file maps an OSM value to something that
+        is not in AddressFormatter.address_formatter_fields.
+        '''
+        self.config = {}
+
+        for filename in os.listdir(boundaries_dir):
+            if not filename.endswith('.yaml'):
+                continue
+
+            country_code = filename.rsplit('.yaml', 1)[0]
+            # Close the file as soon as it is parsed. safe_load is enough
+            # for these plain mappings; an empty file parses to None.
+            with open(os.path.join(boundaries_dir, filename)) as f:
+                data = yaml.safe_load(f) or {}
+            for prop, values in data.items():
+                for k, v in values.items():
+                    if v not in AddressFormatter.address_formatter_fields:
+                        raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(filename, prop, k, v))
+            self.config[country_code] = data
+
+    def get_component(self, country, prop, value):
+        '''
+        Return the address component mapped to an OSM prop/value pair
+        for the given country, or None if nothing is mapped.
+
+        global_keys is consulted only when the country has no entries
+        at all for prop; the two mappings are never merged.
+        '''
+        props = self.config.get(country, {}).get(prop, {})
+        if not props and prop in self.global_keys:
+            props = self.global_keys[prop]
+        return props.get(value, None)
+
+osm_address_components = OSMAddressComponents()
diff --git a/scripts/geodata/osm/extract.py b/scripts/geodata/osm/extract.py
index 7c470829..ed2e62ec 100644
--- a/scripts/geodata/osm/extract.py
+++ b/scripts/geodata/osm/extract.py
@@ -6,30 +6,20 @@ Extracts nodes/ways/relations, their metadata and dependencies
 from .osm XML files.
 '''
 
-import os
 import re
 import six
-import sys
 import urllib
-import ujson as json
-import yaml
 import HTMLParser
 
 from collections import OrderedDict
 from lxml import etree
 
-this_dir = os.path.realpath(os.path.dirname(__file__))
-sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
-from geodata.address_formatting.formatter import AddressFormatter
 from geodata.csv_utils import unicode_csv_reader
 from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
-
-OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
-                                  'resources', 'boundaries', 'osm')
-
 from geodata.encoding import safe_decode
 
+
 WAY_OFFSET = 10 ** 15
 RELATION_OFFSET = 2 * 10 ** 15
 
@@ -178,61 +168,3 @@ def parse_osm_number_range(value):
         else:
             numbers.extend(non_breaking_dash_regex.split(safe_decode(val)))
     return numbers
-
-
-class OSMAddressComponents(object):
-    '''
-    Keeps a map of OSM keys and values to the standard components
-    of an address like city, state, etc. used for address formatting.
-    When we reverse geocode a point, it will fall into a number of
-    polygons, and we simply need to assign the names of said polygons
-    to an address field.
-    '''
-
-    ADMIN_LEVEL = 'admin_level'
-
-    # These keys are country-independent
-    global_keys = {
-        'place': {
-            'country': AddressFormatter.COUNTRY,
-            'state': AddressFormatter.STATE,
-            'region': AddressFormatter.STATE,
-            'province': AddressFormatter.STATE,
-            'county': AddressFormatter.STATE_DISTRICT,
-            'island': AddressFormatter.ISLAND,
-            'islet': AddressFormatter.ISLAND,
-            'municipality': AddressFormatter.CITY,
-            'city': AddressFormatter.CITY,
-            'town': AddressFormatter.CITY,
-            'township': AddressFormatter.CITY,
-            'village': AddressFormatter.CITY,
-            'hamlet': AddressFormatter.CITY,
-            'borough': AddressFormatter.CITY_DISTRICT,
-            'suburb': AddressFormatter.SUBURB,
-            'quarter': AddressFormatter.SUBURB,
-            'neighbourhood': AddressFormatter.SUBURB
-        }
-    }
-
-    def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
-        self.config = {}
-
-        for filename in os.listdir(boundaries_dir):
-            if not filename.endswith('.yaml'):
-                continue
-
-            country_code = filename.rsplit('.yaml', 1)[0]
-            data = yaml.load(open(os.path.join(boundaries_dir, filename)))
-            for prop, values in data.iteritems():
-                for k, v in values.iteritems():
-                    if v not in AddressFormatter.address_formatter_fields:
-                        raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(filename, prop, k, v))
-            self.config[country_code] = data
-
-    def get_component(self, country, prop, value):
-        props = self.config.get(country, {}).get(prop, {})
-        if not props and prop in self.global_keys:
-            props = self.global_keys[prop]
-        return props.get(value, None)
-
-osm_address_components = OSMAddressComponents()