diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py
index eefe295e..ac78209c 100644
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -98,11 +98,28 @@ class OSMAddressFormatter(object):
         ])
     )

-    def __init__(self, components):
+    # OSM tag key -> tag value -> zone type for points inside a polygon tagged that way
+    zones = {
+        'landuse': {
+            'retail': AddressComponents.zones.COMMERCIAL,
+            'commercial': AddressComponents.zones.COMMERCIAL,
+            'industrial': AddressComponents.zones.INDUSTRIAL,
+            'residential': AddressComponents.zones.RESIDENTIAL,
+        },
+        'amenity': {
+            'university': AddressComponents.zones.UNIVERSITY,
+            'college': AddressComponents.zones.UNIVERSITY,
+        }
+    }
+
+    def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None):
         # Instance of AddressComponents, contains structures for reverse geocoding, etc.
         self.components = components
         self.language_rtree = components.language_rtree

+        self.subdivisions_rtree = subdivisions_rtree
+        self.buildings_rtree = buildings_rtree
+
         self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG))
         self.formatter = AddressFormatter()

@@ -137,6 +154,59 @@ class OSMAddressFormatter(object):
         address_components = {k: v for k, v in six.iteritems(address_components) if k in AddressFormatter.address_formatter_fields}
         return address_components

+    def subdivision_components(self, latitude, longitude):
+        '''
+        All subdivision polygons containing the point, or an empty
+        list when no subdivisions index was supplied.
+        '''
+        if self.subdivisions_rtree is None:
+            return []
+        return self.subdivisions_rtree.point_in_poly(latitude, longitude, return_all=True)
+
+    def zone(self, subdivisions):
+        '''
+        First zone type matched by any subdivision's tags (see zones),
+        or None if nothing matches.
+        '''
+        for subdiv in subdivisions:
+            for k, v in six.iteritems(self.zones):
+                zone = v.get(subdiv.get(k))
+                if zone:
+                    return zone
+        return None
+
+    def building_components(self, latitude, longitude):
+        '''
+        All building polygons containing the point, or an empty
+        list when no buildings index was supplied.
+        '''
+        if self.buildings_rtree is None:
+            return []
+        return self.buildings_rtree.point_in_poly(latitude, longitude, return_all=True)
+
+    def num_floors(self, buildings, key='building:levels'):
+        '''
+        Largest integer value of the given tag across the buildings,
+        or None if no building has a parseable value.
+        '''
+        max_floors = None
+        for b in buildings:
+            num_floors = b.get(key)
+            if num_floors is not None:
+                try:
+                    num_floors = int(num_floors)
+                except (ValueError, TypeError):
+                    try:
+                        # Values like "2.5" are truncated toward zero
+                        num_floors = int(float(num_floors))
+                    except (ValueError, TypeError, OverflowError):
+                        continue
+
+                # First parseable value seeds the maximum
+                if max_floors is None or num_floors > max_floors:
+                    max_floors = num_floors
+        return max_floors
+
     def abbreviated_street(self, street, language):
         '''
         Street abbreviations
@@ -330,7 +400,23 @@ class OSMAddressFormatter(object):

         revised_tags = self.normalize_address_components(tags)

-        address_components, country, language = self.components.expanded(revised_tags, latitude, longitude)
+        num_floors = None
+        num_basements = None
+        zone = None
+
+        # Floor counts come from enclosing building polygons, zone from enclosing subdivisions
+        building_components = self.building_components(latitude, longitude)
+        if building_components:
+            num_floors = self.num_floors(building_components)
+            num_basements = self.num_floors(building_components, key='building:levels:underground')
+
+        subdivision_components = self.subdivision_components(latitude, longitude)
+        if subdivision_components:
+            zone = self.zone(subdivision_components)
+
+        address_components, country, language = self.components.expanded(
+            revised_tags, latitude, longitude,
+            num_floors=num_floors, num_basements=num_basements, zone=zone)

         if not address_components:
             return None, None, None
diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index ac4aae9a..e00fa833 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -447,23 +447,31 @@ if __name__ == '__main__':
                         default=tempfile.gettempdir(),
                         help='Temp directory to use')

-    parser.add_argument('-g', '--language-rtree-dir',
+    parser.add_argument('--language-rtree-dir',
                         required=True,
                         help='Language RTree directory')

-    parser.add_argument('-r', '--rtree-dir',
+    parser.add_argument('--rtree-dir',
                         default=None,
                         help='OSM reverse geocoder RTree directory')

-    parser.add_argument('-q', '--quattroshapes-rtree-dir',
+    parser.add_argument('--quattroshapes-rtree-dir',
                         default=None,
                         help='Quattroshapes reverse geocoder RTree directory')

-    parser.add_argument('-d', '--geonames-db',
+    parser.add_argument('--subdivisions-rtree-dir',
+                        default=None,
+                        help='Subdivisions reverse geocoder RTree directory')
+
+    parser.add_argument('--buildings-rtree-dir',
+                        default=None,
+                        help='Buildings reverse geocoder RTree directory')
+
+    parser.add_argument('--geonames-db',
                         default=None,
                         help='GeoNames db file')

-    parser.add_argument('-n', '--neighborhoods-rtree-dir',
+    parser.add_argument('--neighborhoods-rtree-dir',
                         default=None,
                         help='Neighborhoods reverse geocoder RTree directory')

@@ -486,6 +494,14 @@ if __name__ == '__main__':
     if args.quattroshapes_rtree_dir:
         quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)

+    subdivisions_rtree = None
+    if args.subdivisions_rtree_dir:
+        subdivisions_rtree = OSMSubdivisionReverseGeocoder.load(args.subdivisions_rtree_dir)
+
+    buildings_rtree = None
+    if args.buildings_rtree_dir:
+        buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
+
     geonames = None

     if args.geonames_db:
@@ -509,11 +525,11 @@ if __name__ == '__main__':

     if args.address_file and args.format:
         components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components)
+        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree)
         osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
     if args.address_file and args.limited_addresses:
         components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components, splitter=u' ')
+        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, splitter=u' ')
         osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
     if args.venues_file:
         build_venue_training_data(language_rtree, args.venues_file, args.out_dir)