[osm] Separating address formatter into its own module, adding some documentation of the various training sets with examples
 scripts/geodata/address_formatting/__init__.py  |   0  (new file)
 scripts/geodata/address_formatting/formatter.py | 213  (new file)

scripts/geodata/address_formatting/formatter.py (new file)
@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
import os
import pystache
import re
import subprocess
import yaml

from postal.text.tokenize import tokenize, tokenize_raw, token_types
from collections import OrderedDict
from itertools import ifilter

FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'


class AddressFormatter(object):
    '''
    Approximate Python port of lokku's Geo::Address::Formatter

    Usage:
        address_formatter = AddressFormatter()
        components = {
            'house': u'Anticafé',
            'addr:housenumber': '2',
            'addr:street': u'Calle de la Unión',
            'addr:postcode': '28013',
            'addr:city': u'Madrid',
        }
        address_formatter.format_address('es', components)
    '''

    MINIMAL_COMPONENT_KEYS = [
        ('road', 'house_number'),
        ('road', 'house'),
        ('road', 'postcode')
    ]

    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')

    splitter = ' | '

    aliases = OrderedDict([
        ('name', 'house'),
        ('addr:housename', 'house'),
        ('addr:housenumber', 'house_number'),
        ('addr:house_number', 'house_number'),
        ('addr:street', 'road'),
        ('addr:city', 'city'),
        ('addr:locality', 'city'),
        ('addr:municipality', 'city'),
        ('addr:hamlet', 'village'),
        ('addr:suburb', 'suburb'),
        ('addr:neighbourhood', 'suburb'),
        ('addr:neighborhood', 'suburb'),
        ('addr:district', 'suburb'),
        ('addr:state', 'state'),
        ('addr:province', 'state'),
        ('addr:region', 'state'),
        ('addr:postal_code', 'postcode'),
        ('addr:postcode', 'postcode'),
        ('addr:country', 'country'),
        ('street', 'road'),
        ('street_name', 'road'),
        ('residential', 'road'),
        ('hamlet', 'village'),
        ('neighborhood', 'suburb'),
        ('neighbourhood', 'suburb'),
        ('city_district', 'suburb'),
        ('state_code', 'state'),
        ('country_name', 'country'),
    ])

    def __init__(self, scratch_dir='/tmp', splitter=None):
        if splitter is not None:
            self.splitter = splitter

        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
        self.clone_repo()
        self.load_config()

    def clone_repo(self):
        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])

    def load_config(self):
        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
                                                  'conf/countries/worldwide.yaml')))

    def component_aliases(self):
        self.aliases = OrderedDict()
        self.aliases.update(self.osm_aliases)
        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
                                                     'conf', 'components.yaml')))
        for c in components:
            name = c['name']
            for a in c.get('aliases', []):
                self.aliases[a] = name

    def replace_aliases(self, components):
        for k in components.keys():
            new_key = self.aliases.get(k)
            if new_key and new_key not in components:
                components[new_key] = components.pop(k)

    def country_template(self, c):
        return self.config.get(c, self.config['default'])

    def render_template(self, template, components, tagged=False):
        def render_first(text):
            text = pystache.render(text, **components)
            splits = (e.strip() for e in text.split('||'))
            selected = next(ifilter(bool, splits), '')
            return selected

        output = pystache.render(template, first=render_first,
                                 **components).strip()

        values = self.whitespace_component_regex.split(output)

        output = self.splitter.join([
            self.strip_component(val, tagged=tagged)
            for val in values
        ])

        return output

    def minimal_components(self, components):
        for component_list in self.MINIMAL_COMPONENT_KEYS:
            if all((c in components for c in component_list)):
                return True
        return False

    def apply_replacements(self, template, components):
        if not template.get('replace'):
            return
        for key in components.keys():
            value = components[key]
            for regex, replacement in template['replace']:
                value = re.sub(regex, replacement, value)
            components[key] = value

    def post_replacements(self, template, text):
        components = []
        seen = set()
        for component in text.split(self.splitter):
            component = component.strip()
            if component not in seen:
                components.append(component)
                seen.add(component)
        text = self.splitter.join(components)
        post_format_replacements = template.get('postformat_replace')
        if post_format_replacements:
            for regex, replacement in post_format_replacements:
                text = re.sub(regex, replacement, text)
        return text

    def strip_component(self, value, tagged=False):
        if not tagged:
            start = end = 0
            tokens = tokenize_raw(value)
            for token_start, token_length, token_type in tokens:
                start = token_start
                if token_type < token_types.PERIOD.value:
                    break

            for token_start, token_length, token_type in reversed(tokens):
                end = token_start + token_length
                if token_type < token_types.PERIOD.value:
                    break

            return value[start:end]
        else:
            i = j = 0
            tokens = value.split()
            for i, t in enumerate(tokens):
                if '/' in t:
                    break

            for j, t in enumerate(reversed(tokens)):
                if '/' in t:
                    break

            if j == 0:
                j = None
            else:
                j = -j
            return u' '.join(tokens[i:j])

    def format_address(self, country, components, minimal_only=True, tag_components=True):
        template = self.config.get(country.upper())
        if not template:
            return None
        template_text = template['address_template']
        self.replace_aliases(components)

        if not self.minimal_components(components):
            if minimal_only:
                return None
            if 'fallback_template' in template:
                template_text = template['fallback_template']
            else:
                template_text = self.config['default']['fallback_template']

        self.apply_replacements(template, components)

        if tag_components:
            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
                                        for t, c in tokenize(v)])
                          for k, v in components.iteritems()}

        text = self.render_template(template_text, components, tagged=tag_components)

        text = self.post_replacements(template, text)
        return text
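Aside (illustration only, not part of this commit): render_template above passes a 'first' lambda into pystache, which implies the cloned address-formatting templates contain {{#first}} ... {{/first}} sections whose bodies list '||'-separated alternatives; the lambda renders the section body and keeps the first alternative that comes out non-empty. A minimal, self-contained sketch of that mechanism, with an invented template and component dict, and assuming pystache's default behavior of rendering missing tags as empty strings:

# Sketch only -- the template and components below are invented for illustration.
import pystache
from itertools import ifilter

components = {'city': u'Madrid', 'postcode': u'28013'}

def render_first(text):
    # Mirrors AddressFormatter.render_template's inner helper: render the raw
    # section body, then keep the first non-empty '||'-separated alternative.
    rendered = pystache.render(text, **components)
    return next(ifilter(bool, (alt.strip() for alt in rendered.split('||'))), u'')

template = u'{{#first}}{{village}} || {{city}}{{/first}}, {{postcode}}'
print(pystache.render(template, first=render_first, **components))
# Expected output: Madrid, 28013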

osm_address_training_data.py
@@ -1,16 +1,47 @@
 # -*- coding: utf-8 -*-
+'''
+osm_address_training_data.py
+----------------------------
+
+This script generates several training sets from OpenStreetMap addresses,
+streets, venues and toponyms.
+
+Note: the combined size of all the files created by this script exceeds 100GB
+so if training these models, it is wise to use a server-grade machine with
+plenty of disk space. The following commands can be used in parallel to create
+all the training sets:
+
+Ways:
+python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Venues:
+python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Address streets:
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Limited formatted addresses:
+python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (tagged):
+python osm_address_training_data.py -a -f $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (untagged):
+python osm_address_training_data.py -a -f -u $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Toponyms:
+python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+'''
 import argparse
 import csv
 import os
 import operator
-import pystache
 import re
-import subprocess
 import sys
 import tempfile
 import urllib
 import ujson as json
-import yaml
 import HTMLParser
 
 from collections import defaultdict, OrderedDict
@@ -22,10 +53,10 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
-from address_normalizer.text.tokenize import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
+from geodata.address_formatting.formatter import AddressFormatter
 from geodata.polygons.language_polys import *
 from geodata.i18n.unicode_paths import DATA_DIR
 
@@ -34,8 +65,6 @@ from geodata.file_utils import *
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
-FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'
-
 WAY_OFFSET = 10 ** 15
 RELATION_OFFSET = 2 * 10 ** 15
 
@@ -127,189 +156,6 @@ def read_osm_json(filename):
         yield key, json.loads(attrs)
 
 
-class AddressFormatter(object):
-    ''' Approximate Python port of lokku's Geo::Address::Formatter '''
-
-    MINIMAL_COMPONENT_KEYS = [
-        ('road', 'house_number'),
-        ('road', 'house'),
-        ('road', 'postcode')
-    ]
-
-    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
-
-    splitter = ' | '
-
-    aliases = OrderedDict([
-        ('name', 'house'),
-        ('addr:housename', 'house'),
-        ('addr:housenumber', 'house_number'),
-        ('addr:street', 'road'),
-        ('addr:city', 'city'),
-        ('addr:locality', 'city'),
-        ('addr:municipality', 'city'),
-        ('addr:hamlet', 'village'),
-        ('addr:suburb', 'suburb'),
-        ('addr:neighbourhood', 'suburb'),
-        ('addr:neighborhood', 'suburb'),
-        ('addr:district', 'suburb'),
-        ('addr:state', 'state'),
-        ('addr:province', 'state'),
-        ('addr:region', 'state'),
-        ('addr:postal_code', 'postcode'),
-        ('addr:postcode', 'postcode'),
-        ('addr:country', 'country'),
-        ('street', 'road'),
-        ('street_name', 'road'),
-        ('residential', 'road'),
-        ('hamlet', 'village'),
-        ('neighborhood', 'suburb'),
-        ('neighbourhood', 'suburb'),
-        ('city_district', 'suburb'),
-        ('state_code', 'state'),
-        ('country_name', 'country'),
-    ])
-
-    def __init__(self, scratch_dir='/tmp', splitter=None):
-        if splitter is not None:
-            self.splitter = splitter
-
-        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
-        self.clone_repo()
-        self.load_config()
-
-    def clone_repo(self):
-        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
-        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
-
-    def load_config(self):
-        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
-                                                  'conf/countries/worldwide.yaml')))
-
-    def component_aliases(self):
-        self.aliases = OrderedDict()
-        self.aliases.update(self.osm_aliases)
-        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
-                                                     'conf', 'components.yaml')))
-        for c in components:
-            name = c['name']
-            for a in c.get('aliases', []):
-                self.aliases[a] = name
-
-    def replace_aliases(self, components):
-        for k in components.keys():
-            new_key = self.aliases.get(k)
-            if new_key and new_key not in components:
-                components[new_key] = components.pop(k)
-
-    def country_template(self, c):
-        return self.config.get(c, self.config['default'])
-
-    def render_template(self, template, components, tagged=False):
-        def render_first(text):
-            text = pystache.render(text, **components)
-            splits = (e.strip() for e in text.split('||'))
-            selected = next(ifilter(bool, splits), '')
-            return selected
-
-        output = pystache.render(template, first=render_first,
-                                 **components).strip()
-
-        values = self.whitespace_component_regex.split(output)
-
-        output = self.splitter.join([
-            self.strip_component(val, tagged=tagged)
-            for val in values
-        ])
-
-        return output
-
-    def minimal_components(self, components):
-        for component_list in self.MINIMAL_COMPONENT_KEYS:
-            if all((c in components for c in component_list)):
-                return True
-        return False
-
-    def apply_replacements(self, template, components):
-        if not template.get('replace'):
-            return
-        for key in components.keys():
-            value = components[key]
-            for regex, replacement in template['replace']:
-                value = re.sub(regex, replacement, value)
-            components[key] = value
-
-    def post_replacements(self, template, text):
-        components = []
-        seen = set()
-        for component in text.split(self.splitter):
-            component = component.strip()
-            if component not in seen:
-                components.append(component)
-                seen.add(component)
-        text = self.splitter.join(components)
-        post_format_replacements = template.get('postformat_replace')
-        if post_format_replacements:
-            for regex, replacement in post_format_replacements:
-                text = re.sub(regex, replacement, text)
-        return text
-
-    def strip_component(self, value, tagged=False):
-        i = j = 0
-        if not tagged:
-            tokens = tokenize(value)
-            for i, (c, t) in enumerate(tokens):
-                if c.value < token_types.PERIOD.value:
-                    break
-
-            for j, (c, t) in enumerate(reversed(tokens)):
-                if c.value < token_types.PERIOD.value:
-                    break
-            tokens = [t for c, t in tokens]
-        else:
-            tokens = value.split()
-            for i, t in enumerate(tokens):
-                if '/' in t:
-                    break
-
-            for j, t in enumerate(reversed(tokens)):
-                if '/' in t:
-                    break
-        if j == 0:
-            j = None
-        else:
-            j = -j
-        return u' '.join(tokens[i:j])
-
-    def format_address(self, country, components, minimal_only=True, tag_components=True):
-        template = self.config.get(country.upper())
-        if not template:
-            return None
-        template_text = template['address_template']
-        self.replace_aliases(components)
-
-        if not self.minimal_components(components):
-            if minimal_only:
-                return None
-            if 'fallback_template' in template:
-                template_text = template['fallback_template']
-            else:
-                template_text = self.config['default']['fallback_template']
-
-        self.apply_replacements(template, components)
-
-        if tag_components:
-            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
-                                        for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-        else:
-            components = {k: u' '.join([t for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-
-        text = self.render_template(template_text, components, tagged=tag_components)
-
-        text = self.post_replacements(template, text)
-        return text
 
 
 def normalize_osm_name_tag(tag, script=False):
     norm = tag.rsplit(':', 1)[-1]
@@ -462,6 +308,16 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
 
 
 def build_ways_training_data(language_rtree, infile, out_dir):
+    '''
+    Creates a training set for language classification using most OSM ways
+    (streets) under a fairly lengthy osmfilter definition which attempts to
+    identify all roads/ways designated for motor vehicle traffic, which
+    is more-or-less what we'd expect to see in addresses.
+
+    The fields are {language, country, street name}. Example:
+
+    ar ma ﺵﺍﺮﻋ ﻑﺎﻟ ﻮﻟﺩ ﻊﻤﻳﺭ
+    '''
     i = 0
     f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
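Aside (not part of this commit): the ways set described above, like the toponym and addr:street sets documented further down, is written as {language, country, name} rows, so a downstream consumer can be very small. A hedged sketch, assuming the 'tsv_no_quote' dialect registered elsewhere in this script produces plain tab-delimited output, and using a hypothetical filename in place of WAYS_LANGUAGE_DATA_FILENAME (whose value is not shown in this diff):

# Hypothetical consumer counting training rows per language.
import csv
from collections import Counter

rows_per_language = Counter()
with open('ways_language_training_data.tsv', 'rb') as f:  # hypothetical path
    for row in csv.reader(f, delimiter='\t'):
        if len(row) != 3:
            continue  # skip malformed rows
        language, country, name = row
        rows_per_language[language] += 1

print(rows_per_language.most_common(10))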
@@ -493,8 +349,9 @@ def strip_keys(value, ignore_keys):
 def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
     '''
     Creates formatted address training data for supervised sequence labeling (or potentially
-    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. The tagged
-    version produces a TSV file that looks like:
+    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
+
+    Example:
 
     cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
 
@@ -506,9 +363,9 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
     This information can potentially be used downstream by the sequence model as these
     breaks may be present at prediction time.
 
-    For the untagged version, lines simply look like:
+    Example:
 
-    The Dignity | 363 Regents Park Road | London N3 1DH
+    sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
 
     This may be useful in learning word representations, statistical phrases, morphology
     or other models requiring only the sequence of words.
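Aside (not part of this commit): a minimal sketch of how one tagged line like the Czech example above could be split back into (token, label) pairs for a sequence labeler. It assumes the language and country columns are tab-separated, as in the other training sets written via csv.writer, and treats the standalone '|' separators as field-boundary markers rather than tokens:

# -*- coding: utf-8 -*-
# Hypothetical parser for a single tagged training line of the form shown above.
line = u'cs\tcz\tGorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country'

language, country, address = line.split(u'\t', 2)

token_label_pairs = []
for token in address.split():
    if token == u'|':
        continue  # component boundary, not a token
    text, label = token.rsplit(u'/', 1)
    token_label_pairs.append((text, label))

# token_label_pairs[0] == (u'Gorkého', u'road')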
@@ -578,6 +435,18 @@ POSTAL_KEYS = (
 
 
 def build_address_format_training_data_limited(language_rtree, infile, out_dir):
+    '''
+    Creates a special kind of formatted address training data from OSM's addr:* tags,
+    designed for use in language classification. These records are similar
+    to the untagged formatted records but include the language and country
+    (suitable for concatenation with the rest of the language training data),
+    and remove several fields like country which usually do not contain helpful
+    information for classifying the language.
+
+    Example:
+
+    nb no Olaf Ryes Plass 8 | Oslo
+    '''
     i = 0
 
     formatter = AddressFormatter()
@@ -648,6 +517,18 @@ def normalize_wikipedia_title(title):
 
 
 def build_toponym_training_data(language_rtree, infile, out_dir):
+    '''
+    Data set of toponyms by language and country which should assist
+    in language classification. OSM tends to use the native language
+    by default (e.g. Москва instead of Moscow). Toponyms get messy
+    due to factors like colonialism, historical names, name borrowing
+    and the shortness of the names generally. In these cases
+    we're more strict as to what constitutes a valid language for a
+    given country.
+
+    Example:
+    ja jp 東京都
+    '''
     i = 0
     f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -667,10 +548,8 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
         name_language = defaultdict(list)
 
-        all_langs = country_languages[country]
         official = official_languages[country]
 
-        num_langs = len(candidate_languages)
         default_langs = set([l for l, default in official.iteritems() if default])
 
         regional_langs = list(chain(*(p['languages'] for p in language_props if p.get('admin_level', 0) > 0)))
@@ -684,6 +563,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
         default_langs -= WELL_REPRESENTED_LANGUAGES
 
         valid_languages = set([l['lang'] for l in candidate_languages])
+
+        '''
+        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc. for which we have a lot of data.
+        WELL_REPRESENTED_LANGUAGE_COUNTRIES are more-or-less the "origin" countries for said languages where
+        we can take the place names as examples of the language itself (e.g. place names in France are examples
+        of French, whereas place names in much of Francophone Africa tend to get their names from languages
+        other than French, even though French is the official language).
+        '''
         valid_languages -= set([lang for lang in valid_languages if lang in WELL_REPRESENTED_LANGUAGES and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
 
         valid_languages |= default_langs
@@ -728,6 +615,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
 
 def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
+    '''
+    Creates a training set similar to the ways data but using addr:street tags instead.
+    These may be slightly closer to what we'd see in real live addresses, containing
+    variations, some abbreviations (although this is discouraged in OSM), etc.
+
+    Example record:
+    eu es Errebal kalea
+    '''
     i = 0
     f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -809,6 +704,11 @@ if __name__ == '__main__':
                         default=False,
                         help='Save formatted addresses (slow)')
 
+    parser.add_argument('-u', '--untagged',
+                        action='store_true',
+                        default=False,
+                        help='Save untagged formatted addresses (slow)')
+
     parser.add_argument('-l', '--limited-addresses',
                         action='store_true',
                         default=False,
@@ -842,7 +742,7 @@ if __name__ == '__main__':
     if args.address_file and not args.format_only and not args.limited_addresses:
         build_address_training_data(language_rtree, args.address_file, args.out_dir)
     if args.address_file and args.format_only:
-        build_address_format_training_data(language_rtree, args.address_file, args.out_dir)
+        build_address_format_training_data(language_rtree, args.address_file, args.out_dir, tag_components=not args.untagged)
     if args.address_file and args.limited_addresses:
         build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
     if args.venues_file: