[formatting] New formatter config including random component order changes and default/per-country admin component ordering
This commit is contained in:
285
resources/formatting/global.yaml
Normal file
285
resources/formatting/global.yaml
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
global:
|
||||||
|
# Add these components to templates that don't have them
|
||||||
|
admin_components:
|
||||||
|
subdivision:
|
||||||
|
after:
|
||||||
|
- road
|
||||||
|
before:
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
- postcode
|
||||||
|
- country
|
||||||
|
suburb:
|
||||||
|
after:
|
||||||
|
- road
|
||||||
|
- subdivision
|
||||||
|
before:
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
- postcode
|
||||||
|
- country
|
||||||
|
city_district:
|
||||||
|
after:
|
||||||
|
- road
|
||||||
|
- suburb
|
||||||
|
before:
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
- postcode
|
||||||
|
- country
|
||||||
|
# This is added to all the templates but only makes it in
|
||||||
|
island:
|
||||||
|
after:
|
||||||
|
- road
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
before:
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
- country
|
||||||
|
state_district:
|
||||||
|
after:
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
before:
|
||||||
|
- state
|
||||||
|
- country
|
||||||
|
state:
|
||||||
|
after:
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
- state_district
|
||||||
|
before:
|
||||||
|
- country
|
||||||
|
country:
|
||||||
|
after:
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- island
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
- postcode
|
||||||
|
|
||||||
|
insertions:
|
||||||
|
# For each component, insertions are mutually exclusive
|
||||||
|
# They don't have to sum to 1 (especially for components
|
||||||
|
# likely to be found in most addresses)
|
||||||
|
postcode:
|
||||||
|
postcode_before_city:
|
||||||
|
before: city
|
||||||
|
probability: 0.0001
|
||||||
|
|
||||||
|
postcode_after_city:
|
||||||
|
after: city
|
||||||
|
probability: 0.0001
|
||||||
|
|
||||||
|
postcode_before_city_district:
|
||||||
|
before: city_district
|
||||||
|
probability: 0.0001
|
||||||
|
|
||||||
|
postcode_before_suburb:
|
||||||
|
before: suburb
|
||||||
|
probability: 0.0001
|
||||||
|
|
||||||
|
# Rare ordering: postcode placed directly before the state district.
# The anchor must be state_district; anchoring on state would just
# duplicate the postcode_before_state rule that follows.
postcode_before_state_district:
    before: state_district
    probability: 0.0001
|
||||||
|
|
||||||
|
postcode_before_state:
|
||||||
|
before: state
|
||||||
|
probability: 0.0001
|
||||||
|
|
||||||
|
postcode_before_country:
|
||||||
|
before: country
|
||||||
|
probability: 0.05
|
||||||
|
|
||||||
|
postcode_after_country:
|
||||||
|
after: country
|
||||||
|
probability: 0.01
|
||||||
|
|
||||||
|
postcode_first:
|
||||||
|
first: true
|
||||||
|
probability: 0.001
|
||||||
|
|
||||||
|
postcode_last:
|
||||||
|
last: true
|
||||||
|
probability: 0.01
|
||||||
|
|
||||||
|
# PO Box should be the same in most countries
|
||||||
|
po_box:
|
||||||
|
po_box_before_city:
|
||||||
|
before: city
|
||||||
|
probability: 0.7
|
||||||
|
|
||||||
|
po_box_after_house:
|
||||||
|
after: house
|
||||||
|
probability: 0.2
|
||||||
|
|
||||||
|
po_box_first:
|
||||||
|
first: true
|
||||||
|
probability: 0.1
|
||||||
|
|
||||||
|
# Overrides for languages (better for e.g. covering all French-speaking countries)
|
||||||
|
languages:
|
||||||
|
en:
|
||||||
|
insertions:
|
||||||
|
level:
|
||||||
|
# e.g. 123 East 45th St, 6th Floor, NYC
|
||||||
|
level_after_road:
|
||||||
|
after: road
|
||||||
|
probability: 0.5
|
||||||
|
# e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
|
||||||
|
level_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.25
|
||||||
|
# e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
|
||||||
|
level_before_road:
|
||||||
|
before: road
|
||||||
|
probability: 0.25
|
||||||
|
|
||||||
|
unit:
|
||||||
|
# e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
|
||||||
|
unit_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.2
|
||||||
|
|
||||||
|
# e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
|
||||||
|
unit_before_house_number:
|
||||||
|
before: house_number
|
||||||
|
probability: 0.6
|
||||||
|
|
||||||
|
# e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (not as common in UK)
|
||||||
|
unit_after_road:
|
||||||
|
after: road
|
||||||
|
probability: 0.1
|
||||||
|
|
||||||
|
# e.g. Floor 5, Apt 6
|
||||||
|
unit_after_level:
|
||||||
|
after: level
|
||||||
|
probability: 0.09
|
||||||
|
|
||||||
|
# e.g. Apt. 6, 5/F (less common)
|
||||||
|
unit_before_level:
|
||||||
|
before: level
|
||||||
|
probability: 0.01
|
||||||
|
|
||||||
|
es:
|
||||||
|
insertions:
|
||||||
|
level:
|
||||||
|
# e.g. Calle Ruiz de Alarcón 23 piso 3
|
||||||
|
level_after_house_number:
|
||||||
|
after: house_number
|
||||||
|
probability: 0.8
|
||||||
|
# e.g. Piso 3, Museo del Prado, Calle Ruiz de Alarcón 23
|
||||||
|
level_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.1
|
||||||
|
# e.g. Museo del Prado, Bajos, Calle Ruiz de Alarcón 23
|
||||||
|
level_before_road:
|
||||||
|
before: road
|
||||||
|
probability: 0.1
|
||||||
|
|
||||||
|
unit:
|
||||||
|
unit_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.05
|
||||||
|
unit_before_house_number:
|
||||||
|
before: house_number
|
||||||
|
probability: 0.05
|
||||||
|
# e.g. Piso 3 Dpto 12 (most common)
|
||||||
|
unit_after_level:
|
||||||
|
after: level
|
||||||
|
probability: 0.8
|
||||||
|
# e.g. Apto 6, 2o piso (less common)
|
||||||
|
unit_before_level:
|
||||||
|
before: level
|
||||||
|
probability: 0.1
|
||||||
|
|
||||||
|
fr:
|
||||||
|
# libpostal issue #27
|
||||||
|
insertions:
|
||||||
|
city:
|
||||||
|
city_before_road:
|
||||||
|
before: road
|
||||||
|
probability: 0.001
|
||||||
|
|
||||||
|
|
||||||
|
countries:
|
||||||
|
# Hungary, e.g. 1075, Budapest Kazinczy utca 14
|
||||||
|
hu:
|
||||||
|
insertions:
|
||||||
|
postcode:
|
||||||
|
postcode_before_city:
|
||||||
|
probability: 0.5
|
||||||
|
|
||||||
|
# Malaysia (islands are bigger than states)
|
||||||
|
my:
|
||||||
|
admin_components:
|
||||||
|
island:
|
||||||
|
after:
|
||||||
|
- road
|
||||||
|
- suburb
|
||||||
|
- city_district
|
||||||
|
- city
|
||||||
|
- state_district
|
||||||
|
- state
|
||||||
|
before:
|
||||||
|
- country
|
||||||
|
|
||||||
|
us:
|
||||||
|
insertions:
|
||||||
|
level:
|
||||||
|
# e.g. 123 East 45th St, 6th Floor, NYC
|
||||||
|
level_after_road:
|
||||||
|
after: road
|
||||||
|
probability: 0.75
|
||||||
|
# e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
|
||||||
|
level_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.125
|
||||||
|
# e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
|
||||||
|
level_before_road:
|
||||||
|
before: road
|
||||||
|
probability: 0.125
|
||||||
|
|
||||||
|
unit:
|
||||||
|
# e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
|
||||||
|
unit_before_house:
|
||||||
|
before: house
|
||||||
|
probability: 0.05
|
||||||
|
|
||||||
|
# e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
|
||||||
|
unit_before_house_number:
|
||||||
|
before: house_number
|
||||||
|
probability: 0.05
|
||||||
|
|
||||||
|
# e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (uncommon in the UK, but the most likely unit position in the US)
|
||||||
|
unit_after_road:
|
||||||
|
after: road
|
||||||
|
probability: 0.8
|
||||||
|
|
||||||
|
# e.g. Floor 5, Apt 6
|
||||||
|
unit_after_level:
|
||||||
|
after: level
|
||||||
|
probability: 0.09
|
||||||
|
|
||||||
|
# e.g. Apt. 6, 5/F (less common)
|
||||||
|
unit_before_level:
|
||||||
|
before: level
|
||||||
|
probability: 0.01
|
||||||
|
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
import copy
|
||||||
import os
|
import os
|
||||||
import pystache
|
import pystache
|
||||||
import re
|
import re
|
||||||
@@ -7,6 +8,9 @@ import subprocess
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from geodata.address_formatting.aliases import Aliases
|
from geodata.address_formatting.aliases import Aliases
|
||||||
|
from geodata.configs.utils import nested_get, recursive_merge
|
||||||
|
from geodata.math.floats import isclose
|
||||||
|
from geodata.math.sampling import weighted_choice, cdf
|
||||||
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
|
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
|
||||||
from geodata.encoding import safe_decode
|
from geodata.encoding import safe_decode
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
@@ -14,6 +18,11 @@ from itertools import ifilter
|
|||||||
|
|
||||||
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
|
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
|
||||||
|
|
||||||
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
|
|
||||||
|
FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
|
'resources', 'formatting', 'global.yaml')
|
||||||
|
|
||||||
|
|
||||||
class AddressFormatter(object):
|
class AddressFormatter(object):
|
||||||
'''
|
'''
|
||||||
@@ -50,6 +59,7 @@ class AddressFormatter(object):
|
|||||||
UNIT = 'unit'
|
UNIT = 'unit'
|
||||||
INTERSECTION = 'intersection'
|
INTERSECTION = 'intersection'
|
||||||
ROAD = 'road'
|
ROAD = 'road'
|
||||||
|
SUBDIVISION = 'subdivision'
|
||||||
SUBURB = 'suburb'
|
SUBURB = 'suburb'
|
||||||
CITY_DISTRICT = 'city_district'
|
CITY_DISTRICT = 'city_district'
|
||||||
CITY = 'city'
|
CITY = 'city'
|
||||||
@@ -73,6 +83,7 @@ class AddressFormatter(object):
|
|||||||
INTERSECTION,
|
INTERSECTION,
|
||||||
ROAD,
|
ROAD,
|
||||||
SUBURB,
|
SUBURB,
|
||||||
|
SUBDIVISION,
|
||||||
CITY,
|
CITY,
|
||||||
CITY_DISTRICT,
|
CITY_DISTRICT,
|
||||||
ISLAND,
|
ISLAND,
|
||||||
@@ -111,58 +122,139 @@ class AddressFormatter(object):
|
|||||||
(ROAD, POSTCODE)
|
(ROAD, POSTCODE)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
FIRST, BEFORE, AFTER, LAST = range(4)
|
||||||
|
|
||||||
def __init__(self, scratch_dir='/tmp', splitter=None):
|
def __init__(self, scratch_dir='/tmp', splitter=None):
|
||||||
if splitter is not None:
|
if splitter is not None:
|
||||||
self.splitter = splitter
|
self.splitter = splitter
|
||||||
|
|
||||||
self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
|
self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
|
||||||
self.clone_repo()
|
self.clone_repo()
|
||||||
|
|
||||||
self.load_config()
|
self.load_config()
|
||||||
|
self.load_country_config()
|
||||||
|
|
||||||
|
self.setup_insertion_probabilities()
|
||||||
|
|
||||||
|
self.template_cache = {}
|
||||||
|
|
||||||
def clone_repo(self):
|
def clone_repo(self):
|
||||||
subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
|
subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
|
||||||
subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
|
subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
|
||||||
|
|
||||||
def load_country_config(self):
    """Load the per-country address templates from the cloned formatter repo.

    Reads ``conf/countries/worldwide.yaml`` and, for every top-level key,
    runs the entry's address template through ``add_postprocessing_tags``.
    Keys of the form ``<country>_<language>`` are split so the country and
    language can be passed along separately. The processed mapping is stored
    on ``self.country_formats`` under the original keys.
    """
    config_path = os.path.join(self.formatter_repo_path,
                               'conf', 'countries', 'worldwide.yaml')
    # Close the file deterministically instead of leaking the handle
    with open(config_path) as f:
        config = yaml.load(f)

    for key in list(config):
        country = key
        language = None
        if '_' in key:
            country, language = country.split('_', 1)

        # Look the entry up by its full key: using the split country code
        # would re-process the plain country entry and leave the
        # country_language entry untouched.
        value = config[key]

        if hasattr(value, 'items'):
            address_template = value.get('address_template')
            if not address_template and 'use_country' in value:
                # Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references
                if value['use_country'] in (country, False):
                    continue
                address_template = config[value['use_country']]['address_template']

            if address_template:
                value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language)

            post_format_replacements = value.get('postformat_replace')
            if post_format_replacements:
                # Convert $1-style group references to Python's \1 form
                value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')]
                                               for pattern, replacement in post_format_replacements]
        else:
            # The entry is a bare template string rather than a mapping
            config[key] = self.add_postprocessing_tags(value, country, language=language)

    self.country_formats = config
|
||||||
|
|
||||||
|
def load_config(self):
    """Load the formatter configuration from ``FORMATTER_CONFIG``.

    Sets three attributes:

    * ``self.config``: the ``global`` section of the YAML file.
    * ``self.language_configs``: for each key under ``languages``, the result
      of ``recursive_merge`` applied to a deep copy of the global section and
      that language's section.
    * ``self.country_configs``: the same, for each key under ``countries``.

    A missing or empty file or section is treated as an empty mapping.
    """
    # Close the file deterministically instead of leaking the handle
    with open(FORMATTER_CONFIG) as f:
        config = yaml.load(f) or {}

    self.config = config.get('global') or {}

    language_configs = config.get('languages') or {}
    self.language_configs = {}
    for language in language_configs:
        # Deep copy so merging overrides never mutates the shared global section
        config_copy = copy.deepcopy(self.config)
        self.language_configs[language] = recursive_merge(config_copy, language_configs[language])

    country_configs = config.get('countries') or {}
    self.country_configs = {}
    for country in country_configs:
        config_copy = copy.deepcopy(self.config)
        self.country_configs[country] = recursive_merge(config_copy, country_configs[country])
|
||||||
|
|
||||||
|
def get_property(self, keys, country, language=None, default=None):
    """Look up a config property, trying language, then country, then global.

    :param keys: a dotted string (``'a.b.c'``) or a sequence of keys
    :param country: country code used to index ``self.country_configs``
    :param language: optional language code used to index ``self.language_configs``
    :param default: value passed to ``nested_get`` as its default
    :return: the first truthy value found in the language config, then the
             country config; otherwise whatever the global config lookup yields

    Country and language codes are tried as given and then lower-cased.
    The template loader hands over upper-case country codes, while the
    override sections in the formatter config are keyed in lower case.
    """
    if isinstance(keys, six.string_types):
        keys = keys.split('.')
    keys = tuple(keys)

    scoped_lookups = (
        (self.language_configs, language),
        (self.country_configs, country),
    )

    for configs, scope in scoped_lookups:
        candidates = [scope]
        # Only strings can be case-normalized (YAML keys may be other types)
        if isinstance(scope, six.string_types) and scope.lower() != scope:
            candidates.append(scope.lower())

        for candidate in candidates:
            value = nested_get(configs, (candidate,) + keys, default=default)
            if value:
                return value

    return nested_get(self.config, keys, default=default)
|
||||||
|
|
||||||
|
def get_admin_components(self, country, language=None):
    """Return the configured admin component ordering rules.

    Fetches the ``admin_components`` property for the given country and
    language and returns a list of ``(component, after, before)`` tuples,
    where ``after`` and ``before`` default to empty tuples when absent.
    """
    configured = self.get_property('admin_components', country, language=language, default={})

    ordering = []
    for name, neighbors in six.iteritems(configured):
        ordering.append((name, neighbors.get('after', ()), neighbors.get('before', ())))
    return ordering
|
||||||
|
|
||||||
|
def insertion_probs(self, config):
    """Convert an ``insertions`` config section into sampling tables.

    :param config: mapping of component name to a mapping of named insertion
                   rules; each rule has a ``probability`` and exactly one of
                   ``before``, ``after``, ``last`` or ``first``
    :return: dict mapping each component to ``(values, cdf(probs))`` where
             ``values`` is a list of ``(order, other)`` tuples; ``order`` is
             one of ``self.BEFORE``/``AFTER``/``LAST``/``FIRST`` and ``other``
             is the anchor component (``None`` for first/last)
    :raises ValueError: if a rule has no position key, or if a component's
                        probabilities sum to more than 1
    """
    component_insertions = {}

    for component, insertions in six.iteritems(config):
        values = []
        probs = []

        for rule in six.itervalues(insertions):
            if 'before' in rule:
                val = (self.BEFORE, rule['before'])
            elif 'after' in rule:
                val = (self.AFTER, rule['after'])
            elif 'last' in rule:
                val = (self.LAST, None)
            elif 'first' in rule:
                val = (self.FIRST, None)
            else:
                raise ValueError('Insertions must contain one of {first, before, after, last}')

            values.append(val)
            probs.append(rule['probability'])

        total = sum(probs)
        if not isclose(total, 1.0):
            # A total above 1 would yield a negative "do nothing" weight and
            # an invalid distribution, so reject it explicitly.
            if total > 1.0:
                raise ValueError('Insertion probabilities for {} sum to {}, which is greater than 1'.format(component, total))

            # If the probabilities don't sum to 1, add a "do nothing" action
            probs.append(1.0 - total)
            values.append((None, None))

        component_insertions[component] = values, cdf(probs)

    return component_insertions
|
||||||
|
|
||||||
|
def setup_insertion_probabilities(self):
    """Precompute insertion sampling tables for every configuration scope.

    Builds ``self.global_insertions`` from the global config, and
    ``self.country_insertions`` / ``self.language_insertions`` keyed by the
    lower-cased country / language code for every config that has an
    ``insertions`` section.
    """
    build = self.insertion_probs

    self.global_insertions = build(self.config['insertions'])

    self.country_insertions = {
        code.lower(): build(country_config['insertions'])
        for code, country_config in six.iteritems(self.country_configs)
        if 'insertions' in country_config
    }

    self.language_insertions = {
        code.lower(): build(language_config['insertions'])
        for code, language_config in six.iteritems(self.language_configs)
        if 'insertions' in language_config
    }
|
||||||
|
|
||||||
def country_template(self, c):
|
def country_template(self, c):
|
||||||
return self.config.get(c, self.config['default'])
|
return self.country_formats.get(c, self.country_formats['default'])
|
||||||
|
|
||||||
postprocessing_tags = [
|
def is_reverse(self, template):
|
||||||
(SUBURB, (ROAD,), (CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT, STATE, POSTCODE, COUNTRY)),
|
|
||||||
(CITY_DISTRICT, (ROAD, SUBURB), (CITY, ISLAND, STATE_DISTRICT, STATE)),
|
|
||||||
(STATE_DISTRICT, (SUBURB, CITY_DISTRICT, CITY, ISLAND), (STATE,)),
|
|
||||||
(STATE, (SUBURB, CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT), (COUNTRY,)),
|
|
||||||
]
|
|
||||||
|
|
||||||
template_tag_replacements = [
|
|
||||||
('county', STATE_DISTRICT),
|
|
||||||
]
|
|
||||||
|
|
||||||
def is_reverse(self, key, template):
|
|
||||||
address_parts_match = self.template_address_parts_re.search(template)
|
address_parts_match = self.template_address_parts_re.search(template)
|
||||||
admin_parts_match = list(self.template_admin_parts_re.finditer(template))
|
admin_parts_match = list(self.template_admin_parts_re.finditer(template))
|
||||||
|
|
||||||
if not address_parts_match:
|
|
||||||
raise ValueError('Template for {} does not contain any address parts'.format(key))
|
|
||||||
elif not admin_parts_match:
|
|
||||||
raise ValueError('Template for {} does not contain any admin parts'.format(key))
|
|
||||||
|
|
||||||
# last instance of city/state/country occurs before the first instance of house_number/road
|
# last instance of city/state/country occurs before the first instance of house_number/road
|
||||||
return admin_parts_match[-1].start() < address_parts_match.start()
|
return admin_parts_match[-1].start() < address_parts_match.start()
|
||||||
|
|
||||||
@@ -170,25 +262,24 @@ class AddressFormatter(object):
|
|||||||
""" For constructing """
|
""" For constructing """
|
||||||
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
|
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
|
||||||
|
|
||||||
def insert_component(self, template, tag, before=(), after=(), separate=True, is_reverse=False):
|
def insert_component(self, template, tag, before=None, after=None, first=False, last=False, separate=True, is_reverse=False):
|
||||||
if not before and not after:
|
if not before and not after and not first and not last:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
template = template.rstrip()
|
||||||
|
|
||||||
tag_match = re.compile('\{{{key}\}}'.format(key=tag)).search(template)
|
tag_match = re.compile('\{{{key}\}}'.format(key=tag)).search(template)
|
||||||
|
|
||||||
if before:
|
if before:
|
||||||
before_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in before])).search(template)
|
before_match = re.compile('\{{{key}\}}'.format(key=before)).search(template)
|
||||||
if before_match and tag_match and before_match.start() > tag_match.start():
|
if before_match and tag_match and before_match.start() > tag_match.start():
|
||||||
return template
|
return template
|
||||||
|
|
||||||
if after:
|
if after:
|
||||||
after_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in after])).search(template)
|
after_match = re.compile('\{{{key}\}}'.format(key=after)).search(template)
|
||||||
if after_match and tag_match and tag_match.start() > after_match.start():
|
if after_match and tag_match and tag_match.start() > after_match.start():
|
||||||
return template
|
return template
|
||||||
|
|
||||||
before = set(before)
|
|
||||||
after = set(after)
|
|
||||||
|
|
||||||
key_added = False
|
key_added = False
|
||||||
skip_next_non_token = False
|
skip_next_non_token = False
|
||||||
new_components = []
|
new_components = []
|
||||||
@@ -201,7 +292,7 @@ class AddressFormatter(object):
|
|||||||
|
|
||||||
if hasattr(el, 'parsed'):
|
if hasattr(el, 'parsed'):
|
||||||
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
|
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
|
||||||
if set(keys) & before and not key_added:
|
if (before in set(keys) or first) and not key_added:
|
||||||
token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
|
token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
|
||||||
new_components.extend([tag_token, token])
|
new_components.extend([tag_token, token])
|
||||||
key_added = True
|
key_added = True
|
||||||
@@ -214,7 +305,7 @@ class AddressFormatter(object):
|
|||||||
new_components.pop()
|
new_components.pop()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if set(keys) & after and not key_added:
|
if (after in set(keys) or i == num_tokens - 1) and not key_added:
|
||||||
token = '\n'
|
token = '\n'
|
||||||
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
||||||
token = parsed._parse_tree[i + 1]
|
token = parsed._parse_tree[i + 1]
|
||||||
@@ -226,7 +317,7 @@ class AddressFormatter(object):
|
|||||||
skip_next_non_token = True
|
skip_next_non_token = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if el.key in before and not key_added:
|
if (el.key == before or first) and not key_added:
|
||||||
token = '\n'
|
token = '\n'
|
||||||
if new_components and '{' not in new_components[-1]:
|
if new_components and '{' not in new_components[-1]:
|
||||||
token = new_components[-1]
|
token = new_components[-1]
|
||||||
@@ -235,7 +326,7 @@ class AddressFormatter(object):
|
|||||||
|
|
||||||
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
|
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
|
||||||
|
|
||||||
if el.key in after and not key_added:
|
if (el.key == after or i == num_tokens - 1) and not key_added:
|
||||||
token = '\n'
|
token = '\n'
|
||||||
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
|
||||||
token = parsed._parse_tree[i + 1]
|
token = parsed._parse_tree[i + 1]
|
||||||
@@ -244,30 +335,44 @@ class AddressFormatter(object):
|
|||||||
elif not skip_next_non_token:
|
elif not skip_next_non_token:
|
||||||
new_components.append(el)
|
new_components.append(el)
|
||||||
|
|
||||||
|
if i == num_tokens - 1 and not key_added:
|
||||||
|
key_added = True
|
||||||
|
new_components.append(tag_token)
|
||||||
|
|
||||||
skip_next_non_token = False
|
skip_next_non_token = False
|
||||||
|
|
||||||
return ''.join(new_components)
|
return ''.join(new_components)
|
||||||
|
|
||||||
def add_postprocessing_tags(self, template):
|
def add_postprocessing_tags(self, template, country, language=None):
|
||||||
is_reverse = self.is_reverse(template)
|
is_reverse = self.is_reverse(template)
|
||||||
for key, pre_keys, post_keys in self.postprocessing_tags:
|
for key, pre_keys, post_keys in self.get_admin_components(country, language=language):
|
||||||
key_included = key in template
|
key_tag = six.u('{{{{{{{key}}}}}}}').format(key=key)
|
||||||
|
|
||||||
|
key_included = key_tag in template
|
||||||
new_components = []
|
new_components = []
|
||||||
if key_included:
|
if key_included:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for line in template.split('\n'):
|
pre_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in pre_keys]))
|
||||||
pre_key = re.compile('|'.join(pre_keys)).search(line)
|
post_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in post_keys]))
|
||||||
post_key = re.compile('|'.join(post_keys)).search(line)
|
|
||||||
|
for line in template.split(six.u('\n')):
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
|
pre_key = pre_keys and pre_key_regex.search(line)
|
||||||
|
post_key = post_keys and post_key_regex.search(line)
|
||||||
if post_key and not pre_key and not key_included:
|
if post_key and not pre_key and not key_included:
|
||||||
if not is_reverse:
|
if not is_reverse:
|
||||||
new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key))
|
new_components.append(key_tag)
|
||||||
key_included = True
|
key_included = True
|
||||||
|
|
||||||
new_components.append(line.rstrip('\n'))
|
new_components.append(line.rstrip('\n'))
|
||||||
if post_key and not pre_key and not key_included and is_reverse:
|
if post_key and not pre_key and not key_included and is_reverse:
|
||||||
new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key))
|
new_components.append(key_tag)
|
||||||
key_included = True
|
key_included = True
|
||||||
template = u'\n'.join(new_components)
|
if not post_keys and not key_included:
|
||||||
|
new_components.append(key_tag)
|
||||||
|
template = six.u('\n').join(new_components)
|
||||||
return template
|
return template
|
||||||
|
|
||||||
def render_template(self, template, components, tagged=False):
|
def render_template(self, template, components, tagged=False):
|
||||||
@@ -298,15 +403,6 @@ class AddressFormatter(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def apply_replacements(self, template, components):
|
|
||||||
if not template.get('replace'):
|
|
||||||
return
|
|
||||||
for key in components.keys():
|
|
||||||
value = components[key]
|
|
||||||
for regex, replacement in template['replace']:
|
|
||||||
value = re.sub(regex, replacement, value)
|
|
||||||
components[key] = value
|
|
||||||
|
|
||||||
def post_replacements(self, template, text):
|
def post_replacements(self, template, text):
|
||||||
components = []
|
components = []
|
||||||
seen = set()
|
seen = set()
|
||||||
@@ -322,10 +418,64 @@ class AddressFormatter(object):
|
|||||||
text = re.sub(regex, replacement, text)
|
text = re.sub(regex, replacement, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def revised_template(self, components, country, language=None):
    """Return the country's address template with sampled component insertions.

    For each component, an insertion table is looked up in order of
    specificity: country, ``<country>_<language>``, language, then global.
    One placement is sampled with ``weighted_choice`` and applied through
    ``insert_component``. Results are memoized in ``self.template_cache``.

    :param components: the address components being formatted
    :param country: country code; lower-cased for the insertion lookups
    :param language: optional language code
    :return: the revised template text, or ``None`` when no template with an
             ``address_template`` exists for the country/language
    """
    template = self.get_template(country, language=language)
    if not template or 'address_template' not in template:
        return None

    country = country.lower()
    template = template['address_template']

    # The cached value depends on the base template as well as on the chosen
    # insertions, so the key must identify the template too. Otherwise
    # language- or globally-scoped insertions cached for one country would be
    # handed back for another.
    base_key = (country, language)
    cache_keys = []

    for component in components:
        scope = country
        insertions = nested_get(self.country_insertions, (country, component), default=None)

        if insertions is None and language:
            country_language = '{}_{}'.format(country, language)
            insertions = nested_get(self.country_insertions, (country_language, component), default=None)
            scope = country_language

        if insertions is None and language:
            insertions = nested_get(self.language_insertions, (language, component), default=None)
            scope = language

        if insertions is None:
            insertions = nested_get(self.global_insertions, (component,), default=None)
            scope = None

        if insertions is None:
            # Nothing configured for this component in any scope
            continue

        values, probs = insertions
        order, other = weighted_choice(values, probs)

        cache_keys.append((scope, component, order, other))
        cache_key = base_key + tuple(sorted(cache_keys))

        if cache_key in self.template_cache:
            template = self.template_cache[cache_key]
            continue

        if order == self.BEFORE and other in components:
            template = self.insert_component(template, component, before=other)
        elif order == self.AFTER and other in components:
            template = self.insert_component(template, component, after=other)
        elif order == self.LAST:
            template = self.insert_component(template, component, last=True)
        elif order == self.FIRST:
            template = self.insert_component(template, component, first=True)
        else:
            # "Do nothing" was sampled, or the anchor component is absent
            continue

        self.template_cache[cache_key] = template

    return template
|
||||||
|
|
||||||
def tag_template_separators(self, template):
|
def tag_template_separators(self, template):
|
||||||
template = re.sub(r'},', '}} ,/{} '.format(self.separator_tag), template)
|
template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template)
|
||||||
template = re.sub(r'}-', '}} -/{} '.format(self.separator_tag), template)
|
|
||||||
template = re.sub(r' - ', ' -/{} '.format(self.separator_tag), template)
|
|
||||||
return template
|
return template
|
||||||
|
|
||||||
def strip_component(self, value, tagged=False):
|
def strip_component(self, value, tagged=False):
|
||||||
@@ -374,28 +524,47 @@ class AddressFormatter(object):
|
|||||||
else:
|
else:
|
||||||
end = num_tokens - j - 1
|
end = num_tokens - j - 1
|
||||||
|
|
||||||
return u' '.join(tokens[start:end])
|
return six.u(' ').join(tokens[start:end])
|
||||||
|
|
||||||
|
def get_template(self, country, language=None):
|
||||||
|
template = None
|
||||||
|
if language:
|
||||||
|
# For countries like China and Japan where the country format varies
|
||||||
|
# based on which language is being used
|
||||||
|
template = self.country_formats.get('{}_{}'.format(country.upper(), language.lower()), None)
|
||||||
|
|
||||||
|
if not template:
|
||||||
|
template = self.country_formats.get(country.upper())
|
||||||
|
|
||||||
def format_address(self, country, components,
|
|
||||||
minimal_only=True, tag_components=True, replace_aliases=True,
|
|
||||||
template_replacements=False):
|
|
||||||
template = self.config.get(country.upper())
|
|
||||||
if not template:
|
if not template:
|
||||||
return None
|
return None
|
||||||
template_text = template['address_template']
|
|
||||||
|
use_country = template.get('use_country')
|
||||||
|
if use_country and use_country.upper() in self.country_formats:
|
||||||
|
template = self.country_formats[use_country.upper()]
|
||||||
|
|
||||||
|
if 'address_template' not in template:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return template
|
||||||
|
|
||||||
|
def format_address(self, country, components, language=None,
|
||||||
|
minimal_only=True, tag_components=True, replace_aliases=True):
|
||||||
|
template = self.get_template(country, language=language)
|
||||||
|
if not template:
|
||||||
|
return None
|
||||||
|
|
||||||
|
template_text = self.revised_template(components, country, language=language)
|
||||||
if replace_aliases:
|
if replace_aliases:
|
||||||
self.replace_aliases(components)
|
self.aliases.replace(components)
|
||||||
|
|
||||||
if minimal_only and not self.minimal_components(components):
|
if minimal_only and not self.minimal_components(components):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if template_replacements:
|
|
||||||
self.apply_replacements(template, components)
|
|
||||||
|
|
||||||
if tag_components:
|
if tag_components:
|
||||||
template_text = self.tag_template_separators(template_text)
|
template_text = self.tag_template_separators(template_text)
|
||||||
components = {k: u' '.join([u'{}/{}'.format(t.replace(' ', ''), k.replace(' ', '_'))
|
components = {k: six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), k.replace(' ', '_'))
|
||||||
for t, c in tokenize(v)])
|
for t, c in tokenize(v)])
|
||||||
for k, v in components.iteritems()}
|
for k, v in components.iteritems()}
|
||||||
|
|
||||||
text = self.render_template(template_text, components, tagged=tag_components)
|
text = self.render_template(template_text, components, tagged=tag_components)
|
||||||
|
|||||||
Reference in New Issue
Block a user