[osm] Separating address formatter into its own module, adding some documentation of the various training sets with examples
 scripts/geodata/address_formatting/__init__.py  |   0  (new file)
 scripts/geodata/address_formatting/formatter.py | 213  (new file)

scripts/geodata/address_formatting/formatter.py (new file)
@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
import os
import pystache
import re
import subprocess
import yaml

from postal.text.tokenize import tokenize, tokenize_raw, token_types
from collections import OrderedDict
from itertools import ifilter

FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'


class AddressFormatter(object):
    '''
    Approximate Python port of lokku's Geo::Address::Formatter

    Usage:
        address_formatter = AddressFormatter()
        components = {
            'house': u'Anticafé',
            'addr:housenumber': '2',
            'addr:street': u'Calle de la Unión',
            'addr:postcode': '28013',
            'addr:city': u'Madrid',
        }
        address_formatter.format_address('es', components)
    '''

    MINIMAL_COMPONENT_KEYS = [
        ('road', 'house_number'),
        ('road', 'house'),
        ('road', 'postcode')
    ]

    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')

    splitter = ' | '

    aliases = OrderedDict([
        ('name', 'house'),
        ('addr:housename', 'house'),
        ('addr:housenumber', 'house_number'),
        ('addr:house_number', 'house_number'),
        ('addr:street', 'road'),
        ('addr:city', 'city'),
        ('addr:locality', 'city'),
        ('addr:municipality', 'city'),
        ('addr:hamlet', 'village'),
        ('addr:suburb', 'suburb'),
        ('addr:neighbourhood', 'suburb'),
        ('addr:neighborhood', 'suburb'),
        ('addr:district', 'suburb'),
        ('addr:state', 'state'),
        ('addr:province', 'state'),
        ('addr:region', 'state'),
        ('addr:postal_code', 'postcode'),
        ('addr:postcode', 'postcode'),
        ('addr:country', 'country'),
        ('street', 'road'),
        ('street_name', 'road'),
        ('residential', 'road'),
        ('hamlet', 'village'),
        ('neighborhood', 'suburb'),
        ('neighbourhood', 'suburb'),
        ('city_district', 'suburb'),
        ('state_code', 'state'),
        ('country_name', 'country'),
    ])

    def __init__(self, scratch_dir='/tmp', splitter=None):
        if splitter is not None:
            self.splitter = splitter

        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
        self.clone_repo()
        self.load_config()

    def clone_repo(self):
        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])

    def load_config(self):
        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
                                                  'conf/countries/worldwide.yaml')))

    def component_aliases(self):
        self.aliases = OrderedDict()
        self.aliases.update(self.osm_aliases)
        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
                                                     'conf', 'components.yaml')))
        for c in components:
            name = c['name']
            for a in c.get('aliases', []):
                self.aliases[a] = name

    def replace_aliases(self, components):
        for k in components.keys():
            new_key = self.aliases.get(k)
            if new_key and new_key not in components:
                components[new_key] = components.pop(k)

    def country_template(self, c):
        return self.config.get(c, self.config['default'])

    def render_template(self, template, components, tagged=False):
        def render_first(text):
            text = pystache.render(text, **components)
            splits = (e.strip() for e in text.split('||'))
            selected = next(ifilter(bool, splits), '')
            return selected

        output = pystache.render(template, first=render_first,
                                 **components).strip()

        values = self.whitespace_component_regex.split(output)

        output = self.splitter.join([
            self.strip_component(val, tagged=tagged)
            for val in values
        ])

        return output

    def minimal_components(self, components):
        for component_list in self.MINIMAL_COMPONENT_KEYS:
            if all((c in components for c in component_list)):
                return True
        return False

    def apply_replacements(self, template, components):
        if not template.get('replace'):
            return
        for key in components.keys():
            value = components[key]
            for regex, replacement in template['replace']:
                value = re.sub(regex, replacement, value)
            components[key] = value

    def post_replacements(self, template, text):
        components = []
        seen = set()
        for component in text.split(self.splitter):
            component = component.strip()
            if component not in seen:
                components.append(component)
                seen.add(component)
        text = self.splitter.join(components)
        post_format_replacements = template.get('postformat_replace')
        if post_format_replacements:
            for regex, replacement in post_format_replacements:
                text = re.sub(regex, replacement, text)
        return text

    def strip_component(self, value, tagged=False):
        if not tagged:
            start = end = 0
            tokens = tokenize_raw(value)
            for token_start, token_length, token_type in tokens:
                start = token_start
                if token_type < token_types.PERIOD.value:
                    break

            for token_start, token_length, token_type in reversed(tokens):
                end = token_start + token_length
                if token_type < token_types.PERIOD.value:
                    break

            return value[start:end]
        else:
            i = j = 0
            tokens = value.split()
            for i, t in enumerate(tokens):
                if '/' in t:
                    break

            for j, t in enumerate(reversed(tokens)):
                if '/' in t:
                    break

            if j == 0:
                j = None
            else:
                j = -j
            return u' '.join(tokens[i:j])

    def format_address(self, country, components, minimal_only=True, tag_components=True):
        template = self.config.get(country.upper())
        if not template:
            return None
        template_text = template['address_template']
        self.replace_aliases(components)

        if not self.minimal_components(components):
            if minimal_only:
                return None
            if 'fallback_template' in template:
                template_text = template['fallback_template']
            else:
                template_text = self.config['default']['fallback_template']

        self.apply_replacements(template, components)

        if tag_components:
            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
                                        for t, c in tokenize(v)])
                          for k, v in components.iteritems()}

        text = self.render_template(template_text, components, tagged=tag_components)

        text = self.post_replacements(template, text)
        return text
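Aside (illustration only, not part of this commit): render_template above passes a 'first' lambda into pystache, which implies the cloned address-formatting templates contain {{#first}} ... {{/first}} sections whose bodies list '||'-separated alternatives; the lambda renders the section body and keeps the first alternative that comes out non-empty. A minimal, self-contained sketch of that mechanism, with an invented template and component dict, and assuming pystache's default behavior of rendering missing tags as empty strings:

# Sketch only -- the template and components below are invented for illustration.
import pystache
from itertools import ifilter

components = {'city': u'Madrid', 'postcode': u'28013'}

def render_first(text):
    # Mirrors AddressFormatter.render_template's inner helper: render the raw
    # section body, then keep the first non-empty '||'-separated alternative.
    rendered = pystache.render(text, **components)
    return next(ifilter(bool, (alt.strip() for alt in rendered.split('||'))), u'')

template = u'{{#first}}{{village}} || {{city}}{{/first}}, {{postcode}}'
print(pystache.render(template, first=render_first, **components))
# Expected output: Madrid, 28013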

osm_address_training_data.py
@@ -1,16 +1,47 @@
 # -*- coding: utf-8 -*-
+'''
+osm_address_training_data.py
+----------------------------
+
+This script generates several training sets from OpenStreetMap addresses,
+streets, venues and toponyms.
+
+Note: the combined size of all the files created by this script exceeds 100GB
+so if training these models, it is wise to use a server-grade machine with
+plenty of disk space. The following commands can be used in parallel to create
+all the training sets:
+
+Ways:
+python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Venues:
+python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Address streets:
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Limited formatted addresses:
+python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (tagged):
+python osm_address_training_data.py -a -f $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Formatted addresses (untagged):
+python osm_address_training_data.py -a -f -u $(OSM_DIR)/planet-addresses.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+
+Toponyms:
+python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+'''
 import argparse
 import csv
 import os
 import operator
-import pystache
 import re
-import subprocess
 import sys
 import tempfile
 import urllib
 import ujson as json
-import yaml
 import HTMLParser
 
 from collections import defaultdict, OrderedDict
@@ -22,10 +53,10 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
-from address_normalizer.text.tokenize import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
+from geodata.address_formatting.formatter import AddressFormatter
 from geodata.polygons.language_polys import *
 from geodata.i18n.unicode_paths import DATA_DIR
 
@@ -34,8 +65,6 @@ from geodata.file_utils import *
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
-FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'
-
 WAY_OFFSET = 10 ** 15
 RELATION_OFFSET = 2 * 10 ** 15
 
@@ -127,189 +156,6 @@ def read_osm_json(filename):
         yield key, json.loads(attrs)
 
 
-class AddressFormatter(object):
-    ''' Approximate Python port of lokku's Geo::Address::Formatter '''
-
-    MINIMAL_COMPONENT_KEYS = [
-        ('road', 'house_number'),
-        ('road', 'house'),
-        ('road', 'postcode')
-    ]
-
-    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
-
-    splitter = ' | '
-
-    aliases = OrderedDict([
-        ('name', 'house'),
-        ('addr:housename', 'house'),
-        ('addr:housenumber', 'house_number'),
-        ('addr:street', 'road'),
-        ('addr:city', 'city'),
-        ('addr:locality', 'city'),
-        ('addr:municipality', 'city'),
-        ('addr:hamlet', 'village'),
-        ('addr:suburb', 'suburb'),
-        ('addr:neighbourhood', 'suburb'),
-        ('addr:neighborhood', 'suburb'),
-        ('addr:district', 'suburb'),
-        ('addr:state', 'state'),
-        ('addr:province', 'state'),
-        ('addr:region', 'state'),
-        ('addr:postal_code', 'postcode'),
-        ('addr:postcode', 'postcode'),
-        ('addr:country', 'country'),
-        ('street', 'road'),
-        ('street_name', 'road'),
-        ('residential', 'road'),
-        ('hamlet', 'village'),
-        ('neighborhood', 'suburb'),
-        ('neighbourhood', 'suburb'),
-        ('city_district', 'suburb'),
-        ('state_code', 'state'),
-        ('country_name', 'country'),
-    ])
-
-    def __init__(self, scratch_dir='/tmp', splitter=None):
-        if splitter is not None:
-            self.splitter = splitter
-
-        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
-        self.clone_repo()
-        self.load_config()
-
-    def clone_repo(self):
-        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
-        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
-
-    def load_config(self):
-        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
-                                                  'conf/countries/worldwide.yaml')))
-
-    def component_aliases(self):
-        self.aliases = OrderedDict()
-        self.aliases.update(self.osm_aliases)
-        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
-                                                     'conf', 'components.yaml')))
-        for c in components:
-            name = c['name']
-            for a in c.get('aliases', []):
-                self.aliases[a] = name
-
-    def replace_aliases(self, components):
-        for k in components.keys():
-            new_key = self.aliases.get(k)
-            if new_key and new_key not in components:
-                components[new_key] = components.pop(k)
-
-    def country_template(self, c):
-        return self.config.get(c, self.config['default'])
-
-    def render_template(self, template, components, tagged=False):
-        def render_first(text):
-            text = pystache.render(text, **components)
-            splits = (e.strip() for e in text.split('||'))
-            selected = next(ifilter(bool, splits), '')
-            return selected
-
-        output = pystache.render(template, first=render_first,
-                                 **components).strip()
-
-        values = self.whitespace_component_regex.split(output)
-
-        output = self.splitter.join([
-            self.strip_component(val, tagged=tagged)
-            for val in values
-        ])
-
-        return output
-
-    def minimal_components(self, components):
-        for component_list in self.MINIMAL_COMPONENT_KEYS:
-            if all((c in components for c in component_list)):
-                return True
-        return False
-
-    def apply_replacements(self, template, components):
-        if not template.get('replace'):
-            return
-        for key in components.keys():
-            value = components[key]
-            for regex, replacement in template['replace']:
-                value = re.sub(regex, replacement, value)
-            components[key] = value
-
-    def post_replacements(self, template, text):
-        components = []
-        seen = set()
-        for component in text.split(self.splitter):
-            component = component.strip()
-            if component not in seen:
-                components.append(component)
-                seen.add(component)
-        text = self.splitter.join(components)
-        post_format_replacements = template.get('postformat_replace')
-        if post_format_replacements:
-            for regex, replacement in post_format_replacements:
-                text = re.sub(regex, replacement, text)
-        return text
-
-    def strip_component(self, value, tagged=False):
-        i = j = 0
-        if not tagged:
-            tokens = tokenize(value)
-            for i, (c, t) in enumerate(tokens):
-                if c.value < token_types.PERIOD.value:
-                    break
-
-            for j, (c, t) in enumerate(reversed(tokens)):
-                if c.value < token_types.PERIOD.value:
-                    break
-            tokens = [t for c, t in tokens]
-        else:
-            tokens = value.split()
-            for i, t in enumerate(tokens):
-                if '/' in t:
-                    break
-
-            for j, t in enumerate(reversed(tokens)):
-                if '/' in t:
-                    break
-        if j == 0:
-            j = None
-        else:
-            j = -j
-        return u' '.join(tokens[i:j])
-
-    def format_address(self, country, components, minimal_only=True, tag_components=True):
-        template = self.config.get(country.upper())
-        if not template:
-            return None
-        template_text = template['address_template']
-        self.replace_aliases(components)
-
-        if not self.minimal_components(components):
-            if minimal_only:
-                return None
-            if 'fallback_template' in template:
-                template_text = template['fallback_template']
-            else:
-                template_text = self.config['default']['fallback_template']
-
-        self.apply_replacements(template, components)
-
-        if tag_components:
-            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
-                                        for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-        else:
-            components = {k: u' '.join([t for c, t in tokenize(v)])
-                          for k, v in components.iteritems()}
-
-        text = self.render_template(template_text, components, tagged=tag_components)
-
-        text = self.post_replacements(template, text)
-        return text
 
 
 def normalize_osm_name_tag(tag, script=False):
     norm = tag.rsplit(':', 1)[-1]
@@ -462,6 +308,16 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
 
 
 def build_ways_training_data(language_rtree, infile, out_dir):
+    '''
+    Creates a training set for language classification using most OSM ways
+    (streets) under a fairly lengthy osmfilter definition which attempts to
+    identify all roads/ways designated for motor vehicle traffic, which
+    is more-or-less what we'd expect to see in addresses.
+
+    The fields are {language, country, street name}. Example:
+
+    ar ma ﺵﺍﺮﻋ ﻑﺎﻟ ﻮﻟﺩ ﻊﻤﻳﺭ
+    '''
     i = 0
     f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
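Aside (not part of this commit): the ways set described above, like the toponym and addr:street sets documented further down, is written as {language, country, name} rows, so a downstream consumer can be very small. A hedged sketch, assuming the 'tsv_no_quote' dialect registered elsewhere in this script produces plain tab-delimited output, and using a hypothetical filename in place of WAYS_LANGUAGE_DATA_FILENAME (whose value is not shown in this diff):

# Hypothetical consumer counting training rows per language.
import csv
from collections import Counter

rows_per_language = Counter()
with open('ways_language_training_data.tsv', 'rb') as f:  # hypothetical path
    for row in csv.reader(f, delimiter='\t'):
        if len(row) != 3:
            continue  # skip malformed rows
        language, country, name = row
        rows_per_language[language] += 1

print(rows_per_language.most_common(10))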
@@ -493,8 +349,9 @@ def strip_keys(value, ignore_keys):
 def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
     '''
     Creates formatted address training data for supervised sequence labeling (or potentially
-    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. The tagged
-    version produces a TSV file that looks like:
+    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
+
+    Example:
 
     cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
 
@@ -506,9 +363,9 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
     This information can potentially be used downstream by the sequence model as these
     breaks may be present at prediction time.
 
-    For the untagged version, lines simply look like:
+    Example:
 
-    The Dignity | 363 Regents Park Road | London N3 1DH
+    sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
 
     This may be useful in learning word representations, statistical phrases, morphology
     or other models requiring only the sequence of words.
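Aside (not part of this commit): a minimal sketch of how one tagged line like the Czech example above could be split back into (token, label) pairs for a sequence labeler. It assumes the language and country columns are tab-separated, as in the other training sets written via csv.writer, and treats the standalone '|' separators as field-boundary markers rather than tokens:

# -*- coding: utf-8 -*-
# Hypothetical parser for a single tagged training line of the form shown above.
line = u'cs\tcz\tGorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country'

language, country, address = line.split(u'\t', 2)

token_label_pairs = []
for token in address.split():
    if token == u'|':
        continue  # component boundary, not a token
    text, label = token.rsplit(u'/', 1)
    token_label_pairs.append((text, label))

# token_label_pairs[0] == (u'Gorkého', u'road')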
@@ -578,6 +435,18 @@ POSTAL_KEYS = (
 
 
 def build_address_format_training_data_limited(language_rtree, infile, out_dir):
+    '''
+    Creates a special kind of formatted address training data from OSM's addr:* tags,
+    designed for use in language classification. These records are similar
+    to the untagged formatted records but include the language and country
+    (suitable for concatenation with the rest of the language training data),
+    and remove several fields like country which usually do not contain helpful
+    information for classifying the language.
+
+    Example:
+
+    nb no Olaf Ryes Plass 8 | Oslo
+    '''
     i = 0
 
     formatter = AddressFormatter()
@@ -648,6 +517,18 @@ def normalize_wikipedia_title(title):
 
 
 def build_toponym_training_data(language_rtree, infile, out_dir):
+    '''
+    Data set of toponyms by language and country which should assist
+    in language classification. OSM tends to use the native language
+    by default (e.g. Москва instead of Moscow). Toponyms get messy
+    due to factors like colonialism, historical names, name borrowing
+    and the shortness of the names generally. In these cases
+    we're more strict as to what constitutes a valid language for a
+    given country.
+
+    Example:
+    ja jp 東京都
+    '''
     i = 0
     f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -667,10 +548,8 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
         name_language = defaultdict(list)
 
-        all_langs = country_languages[country]
         official = official_languages[country]
 
-        num_langs = len(candidate_languages)
         default_langs = set([l for l, default in official.iteritems() if default])
 
         regional_langs = list(chain(*(p['languages'] for p in language_props if p.get('admin_level', 0) > 0)))
@@ -684,6 +563,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
         default_langs -= WELL_REPRESENTED_LANGUAGES
 
         valid_languages = set([l['lang'] for l in candidate_languages])
+
+        '''
+        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc. for which we have a lot of data.
+        WELL_REPRESENTED_LANGUAGE_COUNTRIES are more-or-less the "origin" countries for said languages where
+        we can take the place names as examples of the language itself (e.g. place names in France are examples
+        of French, whereas place names in much of Francophone Africa tend to get their names from languages
+        other than French, even though French is the official language).
+        '''
         valid_languages -= set([lang for lang in valid_languages if lang in WELL_REPRESENTED_LANGUAGES and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
 
         valid_languages |= default_langs
@@ -728,6 +615,14 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
 
 def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
+    '''
+    Creates a training set similar to the ways data but using addr:street tags instead.
+    These may be slightly closer to what we'd see in real live addresses, containing
+    variations, some abbreviations (although this is discouraged in OSM), etc.
+
+    Example record:
+    eu es Errebal kalea
+    '''
     i = 0
     f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
     writer = csv.writer(f, 'tsv_no_quote')
@@ -809,6 +704,11 @@ if __name__ == '__main__':
                         default=False,
                         help='Save formatted addresses (slow)')
 
+    parser.add_argument('-u', '--untagged',
+                        action='store_true',
+                        default=False,
+                        help='Save untagged formatted addresses (slow)')
+
     parser.add_argument('-l', '--limited-addresses',
                         action='store_true',
                         default=False,
@@ -842,7 +742,7 @@ if __name__ == '__main__':
     if args.address_file and not args.format_only and not args.limited_addresses:
         build_address_training_data(language_rtree, args.address_file, args.out_dir)
     if args.address_file and args.format_only:
-        build_address_format_training_data(language_rtree, args.address_file, args.out_dir)
+        build_address_format_training_data(language_rtree, args.address_file, args.out_dir, tag_components=not args.untagged)
     if args.address_file and args.limited_addresses:
         build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
     if args.venues_file: