'''
geodata.osm.extract
-------------------

Extracts nodes/ways/relations, their metadata and dependencies
from .osm XML files.
'''

from collections import OrderedDict
from lxml import etree


# IDs at or above these thresholds are reinterpreted by parse_osm as
# ways / relations regardless of the XML tag they were stored under.
# NOTE(review): presumably matches files where ways and relations were
# converted to nodes with offset IDs -- confirm against the producer.
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15

ALL_OSM_TAGS = set(['node', 'way', 'relation'])
WAYS_RELATIONS = set(['way', 'relation'])


def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
    Parse a file in .osm format iteratively, generating 3-tuples of
    (key, attrs, deps) like:
    ('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('way:4444', OrderedDict([('name', 'Main Street')]), [1, 2, 3, 4])

    :param filename: path to the .osm XML file.
    :param allowed_types: collection of element types to yield
        ('node', 'way', 'relation'). When it contains exactly one type
        the key is the bare integer ID rather than a 'type:id' string.
    :param dependencies: if True, deps is a list: the 'nd' refs for a
        way, or (ref, role) tuples for the way/relation members of a
        relation. If False, deps is always None.

    attrs holds the element's XML attributes (minus 'id') followed by
    its <tag k=... v=...> children.

    The file is closed when the generator is exhausted or closed.
    '''
    single_type = len(allowed_types) == 1

    # Binary mode lets the XML parser handle the encoding declaration
    # itself; the with-block guarantees the handle is released.
    with open(filename, 'rb') as f:
        for _, elem in etree.iterparse(f):
            # int() auto-promotes to long on Python 2, so this covers the
            # offset IDs on both Python 2 and Python 3.
            elem_id = int(elem.attrib.pop('id', 0))
            item_type = elem.tag

            # Offset IDs override the element's own tag.
            if WAY_OFFSET <= elem_id < RELATION_OFFSET:
                elem_id -= WAY_OFFSET
                item_type = 'way'
            elif elem_id >= RELATION_OFFSET:
                elem_id -= RELATION_OFFSET
                item_type = 'relation'

            if item_type in allowed_types:
                attrs = OrderedDict(elem.attrib)
                deps = [] if dependencies else None

                # Iterate the element directly; getchildren() is deprecated.
                for child in elem:
                    if child.tag == 'tag':
                        attrs[child.attrib['k']] = child.attrib['v']
                    elif not dependencies:
                        continue
                    elif item_type == 'way' and child.tag == 'nd':
                        deps.append(int(child.attrib['ref']))
                    elif (item_type == 'relation' and child.tag == 'member' and
                            child.attrib.get('type') in ('way', 'relation')):
                        deps.append((int(child.attrib['ref']),
                                     child.attrib.get('role')))

                if single_type:
                    key = elem_id
                else:
                    key = '{}:{}'.format(item_type, elem_id)
                yield key, attrs, deps

            # Release each finished top-level OSM element and drop the
            # already-processed siblings before it from the parent, so the
            # tree does not grow with the size of the file.
            if elem.tag in ALL_OSM_TAGS:
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]