From 84cf21df88969ecbce3c22482f5ad1ce06471974 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sun, 20 Sep 2015 19:23:13 -0400
Subject: [PATCH] [osm] Separating address formatter into its own module,
 adding some documentation of the various training sets with examples

---
 .../geodata/address_formatting/__init__.py    |   0
 .../geodata/address_formatting/formatter.py   | 213 +++++++++++++
 .../geodata/osm/osm_address_training_data.py  | 292 ++++++------------
 3 files changed, 309 insertions(+), 196 deletions(-)
 create mode 100644 scripts/geodata/address_formatting/__init__.py
 create mode 100644 scripts/geodata/address_formatting/formatter.py

diff --git a/scripts/geodata/address_formatting/__init__.py b/scripts/geodata/address_formatting/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/address_formatting/formatter.py b/scripts/geodata/address_formatting/formatter.py
new file mode 100644
index 00000000..46f5f054
--- /dev/null
+++ b/scripts/geodata/address_formatting/formatter.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+import os
+import pystache
+import re
+import subprocess
+import yaml
+
+from postal.text.tokenize import tokenize, tokenize_raw, token_types
+from collections import OrderedDict
+from itertools import ifilter
+
+FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'
+
+
+class AddressFormatter(object):
+    '''
+    Approximate Python port of lokku's Geo::Address::Formatter
+
+    Usage:
+        address_formatter = AddressFormatter()
+        components = {
+            'house': u'Anticafé',
+            'addr:housenumber': '2',
+            'addr:street': u'Calle de la Unión',
+            'addr:postcode': '28013',
+            'addr:city': u'Madrid',
+        }
+        address_formatter.format_address('es', components)
+    '''
+
+    MINIMAL_COMPONENT_KEYS = [  # one tuple fully present is enough (see minimal_components)
+        ('road', 'house_number'),
+        ('road', 'house'),
+        ('road', 'postcode')
+    ]
+    # Matches a run of line breaks plus any whitespace that follows it
+    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
+    # Default separator between rendered components; __init__ can override it
+    splitter = ' | '
+    # Alternate key -> canonical component key, applied by replace_aliases
+    aliases = OrderedDict([
+        ('name', 'house'),
+        ('addr:housename', 'house'),
+        ('addr:housenumber', 'house_number'),
+        ('addr:house_number', 'house_number'),
+        ('addr:street', 'road'),
+        ('addr:city', 'city'),
+        ('addr:locality', 'city'),
+        ('addr:municipality', 'city'),
+        ('addr:hamlet', 'village'),
+        ('addr:suburb', 'suburb'),
+        ('addr:neighbourhood', 'suburb'),
+        ('addr:neighborhood', 'suburb'),
+        ('addr:district', 'suburb'),
+        ('addr:state', 'state'),
+        ('addr:province', 'state'),
+        ('addr:region', 'state'),
+        ('addr:postal_code', 'postcode'),
+        ('addr:postcode', 'postcode'),
+        ('addr:country', 'country'),
+        ('street', 'road'),
+        ('street_name', 'road'),
+        ('residential', 'road'),
+        ('hamlet', 'village'),
+        ('neighborhood', 'suburb'),
+        ('neighbourhood', 'suburb'),
+        ('city_district', 'suburb'),
+        ('state_code', 'state'),
+        ('country_name', 'country'),
+    ])
+
+    def __init__(self, scratch_dir='/tmp', splitter=None):
+        '''Clone the templates repo under scratch_dir and load its config.'''
+        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
+        if splitter is not None:
+            self.splitter = splitter
+        self.clone_repo()
+        self.load_config()
+
+    def clone_repo(self):
+        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])  # drop any previous checkout
+        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])  # raises CalledProcessError on failure
+
+    def load_config(self):
+        with open(os.path.join(self.formatter_repo_path, 'conf/countries/worldwide.yaml')) as f:
+            self.config = yaml.load(f)  # NOTE(review): external data; prefer yaml.safe_load
+
+    def component_aliases(self):
+        # Seed from the current aliases: no 'osm_aliases' attribute is ever defined
+        self.aliases = OrderedDict(self.aliases)
+        path = os.path.join(self.formatter_repo_path, 'conf', 'components.yaml')
+        with open(path) as f:  # load_all is lazy, so consume it before closing
+            for c in yaml.load_all(f):
+                name = c['name']
+                for a in c.get('aliases', []):
+                    self.aliases[a] = name
+
+    def replace_aliases(self, components):
+        for alias in list(components):  # snapshot: keys are renamed in place
+            canonical = self.aliases.get(alias)
+            if canonical and canonical not in components:
+                components[canonical] = components.pop(alias)
+
+    def country_template(self, c):  # c: key into the loaded config
+        return self.config.get(c, self.config['default'])  # 'default' is evaluated eagerly, so it must exist
+
+    def render_template(self, template, components, tagged=False):
+        '''
+        Render a mustache template with components, then join the resulting
+        lines (split on runs of line breaks) with the splitter after trimming
+        each one via strip_component.
+        '''
+        def render_first(text):
+            # Exposed to templates as 'first': renders the enclosed text and
+            # keeps the first non-empty '||'-separated alternative
+            rendered = pystache.render(text, **components)
+            candidates = (part.strip() for part in rendered.split('||'))
+            return next(ifilter(bool, candidates), '')
+
+        rendered = pystache.render(template, first=render_first, **components)
+        lines = self.whitespace_component_regex.split(rendered.strip())
+
+        trimmed = [self.strip_component(line, tagged=tagged) for line in lines]
+        return self.splitter.join(trimmed)
+
+    def minimal_components(self, components):
+        # True when at least one required key combination is fully present
+        return any(
+            all(key in components for key in required)
+            for required in self.MINIMAL_COMPONENT_KEYS)
+
+    def apply_replacements(self, template, components):
+        # Run every (regex, replacement) rule, in order, over each value in place
+        rules = template.get('replace')
+        if not rules:
+            return
+        for key in list(components):
+            for pattern, substitute in rules:
+                components[key] = re.sub(pattern, substitute, components[key])
+
+    def post_replacements(self, template, text):
+        '''
+        Drop repeated components (the first occurrence wins), then apply the
+        template's 'postformat_replace' (regex, replacement) rules in order.
+        '''
+        parts = (part.strip() for part in text.split(self.splitter))
+        # OrderedDict.fromkeys de-duplicates while keeping first-seen order
+        text = self.splitter.join(OrderedDict.fromkeys(parts))
+
+        rules = template.get('postformat_replace') or ()
+        for pattern, substitute in rules:
+            text = re.sub(pattern, substitute, text)
+
+        return text
+
+    def strip_component(self, value, tagged=False):
+        '''
+        Trim leading and trailing tokens that carry no address content.
+
+        Untagged values keep the span from the first to the last token whose
+        type is below token_types.PERIOD (presumably the non-punctuation
+        types; confirm against postal's token_types). Tagged values keep the
+        span from the first to the last whitespace-separated token containing
+        a '/'. An empty string is returned when no token qualifies, however
+        many tokens the value has.
+        '''
+        if not tagged:
+            # tokenize_raw yields (offset, length, type) triples; slicing the
+            # original string keeps the interior spacing exactly as rendered
+            spans = [(token_start, token_start + token_length)
+                     for token_start, token_length, token_type in tokenize_raw(value)
+                     if token_type < token_types.PERIOD.value]
+
+            if not spans:
+                # Also covers a value made of one unqualified token
+                return value[:0]
+            return value[spans[0][0]:spans[-1][1]]
+
+        # Tagged tokens look like token/label; bare template literals do not
+        tokens = value.split()
+        labeled = [n for n, token in enumerate(tokens) if '/' in token]
+        if not labeled:
+            # Same rule as above: nothing labeled means nothing to keep
+            return u''
+
+        return u' '.join(tokens[labeled[0]:labeled[-1] + 1])
+
+    def format_address(self, country, components, minimal_only=True, tag_components=True):
+        '''
+        Render components with the template for country (looked up upper-cased).
+        Returns None for an unknown country, or when minimal_only is set and no
+        MINIMAL_COMPONENT_KEYS combination is present. Note that components is
+        modified in place by alias renaming and the template's replacements.
+        '''
+        template = self.config.get(country.upper())
+        if not template:
+            return None
+        template_text = template['address_template']
+        self.replace_aliases(components)
+        if not self.minimal_components(components):
+            if minimal_only:
+                return None
+            source = template if 'fallback_template' in template else self.config['default']
+            template_text = source['fallback_template']
+
+        self.apply_replacements(template, components)
+        if tag_components:
+            # Label every token as token/key, with spaces in the key turned into '_'
+            components = dict(
+                (k, u' '.join(u'{}/{}'.format(t, k.replace(' ', '_')) for t, c in tokenize(v)))
+                for k, v in components.iteritems())
+        rendered = self.render_template(template_text, components, tagged=tag_components)
+        return self.post_replacements(template, rendered)
diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 71c90709..6cceb2c2 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -1,16 +1,47 @@
 # -*- coding: utf-8 -*-
+'''
+osm_address_training_data.py
+----------------------------
+
+This script generates several training sets from OpenStreetMap addresses,
+streets, venues and toponyms.
+
+Note: the combined size of all the files created by this script exceeds 100GB,
+so if training these models, it is wise to use a server-grade machine with
+plenty of disk space. The following commands can be run in parallel to create
+all the training sets:
+
+Ways:
+python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Venues:
+python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Address streets:
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Limited formatted addresses:
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -l --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (tagged):
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (untagged):
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Toponyms:
+python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+'''
+
 import argparse
 import csv
 import os
 import operator
-import pystache
 import re
-import subprocess
 import sys
 import tempfile
 import urllib
 import ujson as json
-import yaml
 import HTMLParser
 
 from collections import defaultdict, OrderedDict
@@ -22,10 +53,10 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
-from address_normalizer.text.tokenize import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
+from geodata.address_formatting.formatter import AddressFormatter
 from geodata.polygons.language_polys import *
 from geodata.i18n.unicode_paths import DATA_DIR
 
@@ -34,8 +65,6 @@ from geodata.file_utils import *
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
-FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'
-
 WAY_OFFSET = 10 ** 15
 RELATION_OFFSET = 2 * 10 ** 15
 
@@ -127,189 +156,6 @@ def read_osm_json(filename):
         yield key, json.loads(attrs)
 
 
-class AddressFormatter(object):
-    ''' Approximate Python port of lokku's Geo::Address::Formatter '''
-    MINIMAL_COMPONENT_KEYS = [
-        ('road', 'house_number'),
-        ('road', 'house'),
-        ('road', 'postcode')
-    ]
-
-    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
-
-    splitter = ' | '
-
-    aliases = OrderedDict([
-        ('name', 'house'),
-        ('addr:housename', 'house'),
-        ('addr:housenumber', 'house_number'),
-        ('addr:street', 'road'),
-        ('addr:city', 'city'),
-        ('addr:locality', 'city'),
-        ('addr:municipality', 'city'),
-        ('addr:hamlet', 'village'),
-        ('addr:suburb', 'suburb'),
-        ('addr:neighbourhood', 'suburb'),
-        ('addr:neighborhood', 'suburb'),
-        ('addr:district', 'suburb'),
-        ('addr:state', 'state'),
-        ('addr:province', 'state'),
-        ('addr:region', 'state'),
-        ('addr:postal_code', 'postcode'),
-        ('addr:postcode', 'postcode'),
-        ('addr:country', 'country'),
-        ('street', 'road'),
-        ('street_name', 'road'),
-        ('residential', 'road'),
-        ('hamlet', 'village'),
-        ('neighborhood', 'suburb'),
-        ('neighbourhood', 'suburb'),
-        ('city_district', 'suburb'),
-        ('state_code', 'state'),
-        ('country_name', 'country'),
-    ])
-
-    def __init__(self, scratch_dir='/tmp', splitter=None):
-        if splitter is not None:
-            self.splitter = splitter
-
-        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
-        self.clone_repo()
-        self.load_config()
-
-    def clone_repo(self):
-        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
-        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
-
-    def load_config(self):
-        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
-                                'conf/countries/worldwide.yaml')))
-
-    def component_aliases(self):
-        self.aliases = OrderedDict()
-        self.aliases.update(self.osm_aliases)
-        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
-                                   'conf', 'components.yaml')))
-        for c in components:
-            name = c['name']
-            for a in c.get('aliases', []):
-                self.aliases[a] = name
-
-    def replace_aliases(self, components):
-        for k in components.keys():
-            new_key = self.aliases.get(k)
-            if new_key and new_key not in components:
-                components[new_key] = components.pop(k)
-
-    def country_template(self, c):
-        return self.config.get(c, self.config['default'])
-
-    def render_template(self, template, components, tagged=False):
-        def render_first(text):
-            text = pystache.render(text, **components)
-            splits = (e.strip() for e in text.split('||'))
-            selected = next(ifilter(bool, splits), '')
-            return selected
-
-        output = pystache.render(template, first=render_first,
-                                 **components).strip()
-
-        values = self.whitespace_component_regex.split(output)
-
-        output = self.splitter.join([
-            self.strip_component(val, tagged=tagged)
-            for val in values
-        ])
-
-        return output
-
-    def minimal_components(self, components):
-        for component_list in self.MINIMAL_COMPONENT_KEYS:
-            if all((c in components for c in component_list)):
-                return True
-        return False
-
-    def apply_replacements(self, template, components):
-        if not template.get('replace'):
-            return
-        for key in components.keys():
-            value = components[key]
-            for regex, replacement in template['replace']:
-                value = re.sub(regex, replacement, value)
-                components[key] = value
-
-    def post_replacements(self, template, text):
-        components = []
-        seen = set()
-        for component in text.split(self.splitter):
-            component = component.strip()
-            if component not in seen:
-                components.append(component)
-                seen.add(component)
-        text = self.splitter.join(components)
-        post_format_replacements = template.get('postformat_replace')
-        if post_format_replacements:
-            for regex, replacement in post_format_replacements:
-                text = re.sub(regex, replacement, text)
-        return text
-
-    def strip_component(self, value, tagged=False):
-        i = j = 0
-        if not tagged:
-            tokens = tokenize(value)
-            for i, (c, t) in enumerate(tokens):
-                if c.value < token_types.PERIOD.value:
-                    break
-
-            for j, (c, t) in enumerate(reversed(tokens)):
-                if c.value < token_types.PERIOD.value:
-                    break
-            tokens = [t for c, t in tokens]
-        else:
-            tokens = value.split()
-            for i, t in enumerate(tokens):
-                if '/' in t:
-                    break
-
-            for j, t in enumerate(reversed(tokens)):
-                if '/' in t:
-                    break
-        if j == 0:
-            j = None
-        else:
-            j = -j
-        return u' '.join(tokens[i:j])
-
-    def format_address(self, country, components, minimal_only=True, tag_components=True):
-        template = self.config.get(country.upper())
-        if not template:
-            return None
-        template_text = template['address_template']
-        self.replace_aliases(components)
-
-        if not self.minimal_components(components):
-            if minimal_only:
-                return None
-            if 'fallback_template' in template:
-                template_text = template['fallback_template']
-            else:
-                template_text = self.config['default']['fallback_template']
-
-        self.apply_replacements(template, components)
-
-        if tag_components:
-            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
-                                        for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-        else:
-            components = {k: u' '.join([t for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-
-        text = self.render_template(template_text, components, tagged=tag_components)
-
-        text = self.post_replacements(template, text)
-        return text
-
 
 def normalize_osm_name_tag(tag, script=False):
     norm = tag.rsplit(':', 1)[-1]
@@ -462,6 +308,16 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
 
 
 def build_ways_training_data(language_rtree, infile, out_dir):
+    '''
+    Creates a training set for language classification using most OSM ways
+    (streets) under a fairly lengthy osmfilter definition which attempts to
+    identify all roads/ways designated for motor vehicle traffic, which
+    is more-or-less what we'd expect to see in addresses.
+
+    The fields are {language, country, street name}. Example:
+
+    ar      ma      شارع فال ولد عمير
+    '''
     i = 0
     f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -493,8 +349,9 @@ def strip_keys(value, ignore_keys):
 def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
     '''
     Creates formatted address training data for supervised sequence labeling (or potentially 
-    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. The tagged
-    version produces a TSV file that looks like:
+    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
+
+    Example:
 
     cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
 
@@ -506,9 +363,9 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
     This information can potentially be used downstream by the sequence model as these
     breaks may be present at prediction time.
 
-    For the untagged version, lines simply look like:
+    Example:
 
-    The Dignity | 363 Regents Park Road | London N3 1DH
+    sr      rs      Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
 
     This may be useful in learning word representations, statistical phrases, morphology
     or other models requiring only the sequence of words.
@@ -578,6 +435,18 @@ POSTAL_KEYS = (
 
 
 def build_address_format_training_data_limited(language_rtree, infile, out_dir):
+    '''
+    Creates a special kind of formatted address training data from OSM's addr:* tags,
+    designed for use in language classification. These records are similar
+    to the untagged formatted records but include the language and country
+    (suitable for concatenation with the rest of the language training data),
+    and remove several fields like country which usually do not contain helpful
+    information for classifying the language.
+
+    Example:
+
+    nb      no      Olaf Ryes Plass 8 | Oslo
+    '''
     i = 0
 
     formatter = AddressFormatter()
@@ -648,6 +517,18 @@ def normalize_wikipedia_title(title):
 
 
 def build_toponym_training_data(language_rtree, infile, out_dir):
+    '''
+    Data set of toponyms by language and country which should assist
+    in language classification. OSM tends to use the native language
+    by default (e.g. Москва instead of Moscow). Toponyms get messy
+    due to factors like colonialism, historical names, name borrowing
+    and the shortness of the names generally. In these cases
+    we're more strict as to what constitutes a valid language for a
+    given country.
+
+    Example:
+    ja      jp      東京都
+    '''
     i = 0
     f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -667,10 +548,8 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
         name_language = defaultdict(list)
 
-        all_langs = country_languages[country]
         official = official_languages[country]
 
-        num_langs = len(candidate_languages)
         default_langs = set([l for l, default in official.iteritems() if default])
 
         regional_langs = list(chain(*(p['languages'] for p in language_props if p.get('admin_level', 0) > 0)))
@@ -684,6 +563,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
             default_langs -= WELL_REPRESENTED_LANGUAGES
 
         valid_languages = set([l['lang'] for l in candidate_languages])
+
+        '''
+        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc. for which we have a lot of data.
+        WELL_REPRESENTED_LANGUAGE_COUNTRIES are more-or-less the "origin" countries for said languages where
+        we can take the place names as examples of the language itself (e.g. place names in France are examples
+        of French, whereas place names in much of Francophone Africa tend to get their names from languages
+        other than French, even though French is the official language).
+        '''
         valid_languages -= set([lang for lang in valid_languages if lang in WELL_REPRESENTED_LANGUAGES and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
 
         valid_languages |= default_langs
@@ -728,6 +615,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
 
 def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
+    '''
+    Creates a training set similar to the ways data but using addr:street tags instead.
+    These may be slightly closer to what we'd see in real-life addresses, containing
+    variations, some abbreviations (although this is discouraged in OSM), etc.
+
+    Example record:
+    eu      es      Errebal kalea
+    '''
     i = 0
     f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -809,6 +704,11 @@ if __name__ == '__main__':
                         default=False,
                         help='Save formatted addresses (slow)')
 
+    parser.add_argument('-u', '--untagged',
+                        action='store_true',
+                        default=False,
+                        help='Save untagged formatted addresses (slow)')
+
     parser.add_argument('-l', '--limited-addresses',
                         action='store_true',
                         default=False,
@@ -842,7 +742,7 @@ if __name__ == '__main__':
     if args.address_file and not args.format_only and not args.limited_addresses:
         build_address_training_data(language_rtree, args.address_file, args.out_dir)
     if args.address_file and args.format_only:
-        build_address_format_training_data(language_rtree, args.address_file, args.out_dir)
+        build_address_format_training_data(language_rtree, args.address_file, args.out_dir, tag_components=not args.untagged)
     if args.address_file and args.limited_addresses:
         build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
     if args.venues_file: