[formatting] Adding conditional probabilities for template insertions (e.g. given that we have a floor number, increase the probability that a unit number follows it)
This commit is contained in:
@@ -7,14 +7,15 @@ import six
|
|||||||
import subprocess
|
import subprocess
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from collections import OrderedDict, defaultdict
|
||||||
|
from itertools import ifilter
|
||||||
|
|
||||||
from geodata.address_formatting.aliases import Aliases
|
from geodata.address_formatting.aliases import Aliases
|
||||||
from geodata.configs.utils import nested_get, recursive_merge
|
from geodata.configs.utils import nested_get, recursive_merge
|
||||||
from geodata.math.floats import isclose
|
from geodata.math.floats import isclose
|
||||||
from geodata.math.sampling import weighted_choice, cdf
|
from geodata.math.sampling import weighted_choice, cdf
|
||||||
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
|
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
|
||||||
from geodata.encoding import safe_decode
|
from geodata.encoding import safe_decode
|
||||||
from collections import OrderedDict
|
|
||||||
from itertools import ifilter
|
|
||||||
|
|
||||||
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
|
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
|
||||||
|
|
||||||
@@ -52,9 +53,11 @@ class AddressFormatter(object):
|
|||||||
HOUSE = 'house'
|
HOUSE = 'house'
|
||||||
HOUSE_NUMBER = 'house_number'
|
HOUSE_NUMBER = 'house_number'
|
||||||
PO_BOX = 'po_box'
|
PO_BOX = 'po_box'
|
||||||
|
ATTENTION = 'attention'
|
||||||
CARE_OF = 'care_of'
|
CARE_OF = 'care_of'
|
||||||
BLOCK = 'block'
|
|
||||||
BUILDING = 'building'
|
BUILDING = 'building'
|
||||||
|
ENTRANCE = 'entrance'
|
||||||
|
STAIRCASE = 'staircase'
|
||||||
LEVEL = 'level'
|
LEVEL = 'level'
|
||||||
UNIT = 'unit'
|
UNIT = 'unit'
|
||||||
INTERSECTION = 'intersection'
|
INTERSECTION = 'intersection'
|
||||||
@@ -69,21 +72,23 @@ class AddressFormatter(object):
|
|||||||
POSTCODE = 'postcode'
|
POSTCODE = 'postcode'
|
||||||
COUNTRY = 'country'
|
COUNTRY = 'country'
|
||||||
|
|
||||||
address_formatter_fields = set([
|
component_order = {k: i for i, k in enumerate([
|
||||||
CATEGORY,
|
CATEGORY,
|
||||||
NEAR,
|
NEAR,
|
||||||
HOUSE,
|
ATTENTION,
|
||||||
HOUSE_NUMBER,
|
|
||||||
PO_BOX,
|
|
||||||
CARE_OF,
|
CARE_OF,
|
||||||
BLOCK,
|
HOUSE,
|
||||||
|
PO_BOX,
|
||||||
|
HOUSE_NUMBER,
|
||||||
BUILDING,
|
BUILDING,
|
||||||
|
ENTRANCE,
|
||||||
|
STAIRCASE,
|
||||||
LEVEL,
|
LEVEL,
|
||||||
UNIT,
|
UNIT,
|
||||||
INTERSECTION,
|
|
||||||
ROAD,
|
ROAD,
|
||||||
SUBURB,
|
INTERSECTION,
|
||||||
SUBDIVISION,
|
SUBDIVISION,
|
||||||
|
SUBURB,
|
||||||
CITY,
|
CITY,
|
||||||
CITY_DISTRICT,
|
CITY_DISTRICT,
|
||||||
ISLAND,
|
ISLAND,
|
||||||
@@ -91,7 +96,9 @@ class AddressFormatter(object):
|
|||||||
STATE_DISTRICT,
|
STATE_DISTRICT,
|
||||||
POSTCODE,
|
POSTCODE,
|
||||||
COUNTRY,
|
COUNTRY,
|
||||||
])
|
])}
|
||||||
|
|
||||||
|
address_formatter_fields = set(component_order)
|
||||||
|
|
||||||
aliases = Aliases(
|
aliases = Aliases(
|
||||||
OrderedDict([
|
OrderedDict([
|
||||||
@@ -204,49 +211,73 @@ class AddressFormatter(object):
|
|||||||
admin_components = self.get_property('admin_components', country, language=language, default={})
|
admin_components = self.get_property('admin_components', country, language=language, default={})
|
||||||
return [(key, value.get('after', ()), value.get('before', ())) for key, value in six.iteritems(admin_components)]
|
return [(key, value.get('after', ()), value.get('before', ())) for key, value in six.iteritems(admin_components)]
|
||||||
|
|
||||||
|
def insertion_distribution(self, insertions):
|
||||||
|
values = []
|
||||||
|
probs = []
|
||||||
|
for k, v in six.iteritems(insertions):
|
||||||
|
if k == 'conditional':
|
||||||
|
continue
|
||||||
|
|
||||||
|
if 'before' in v:
|
||||||
|
val = (self.BEFORE, v['before'])
|
||||||
|
elif 'after' in v:
|
||||||
|
val = (self.AFTER, v['after'])
|
||||||
|
elif 'last' in v:
|
||||||
|
val = (self.LAST, None)
|
||||||
|
elif 'first' in v:
|
||||||
|
val = (self.FIRST, None)
|
||||||
|
else:
|
||||||
|
raise ValueError('Insertions must contain one of {first, before, after, last}')
|
||||||
|
|
||||||
|
prob = v['probability']
|
||||||
|
values.append(val)
|
||||||
|
probs.append(prob)
|
||||||
|
|
||||||
|
# If the probabilities don't sum to 1, add a "do nothing" action
|
||||||
|
if not isclose(sum(probs), 1.0):
|
||||||
|
probs.append(1.0 - sum(probs))
|
||||||
|
values.append((None, None))
|
||||||
|
|
||||||
|
return values, cdf(probs)
|
||||||
|
|
||||||
def insertion_probs(self, config):
|
def insertion_probs(self, config):
|
||||||
component_insertions = {}
|
component_insertions = {}
|
||||||
for component, insertions in six.iteritems(config):
|
for component, insertions in six.iteritems(config):
|
||||||
values = []
|
component_insertions[component] = self.insertion_distribution(insertions)
|
||||||
probs = []
|
|
||||||
for k, v in six.iteritems(insertions):
|
|
||||||
if 'before' in v:
|
|
||||||
val = (self.BEFORE, v['before'])
|
|
||||||
elif 'after' in v:
|
|
||||||
val = (self.AFTER, v['after'])
|
|
||||||
elif 'last' in v:
|
|
||||||
val = (self.LAST, None)
|
|
||||||
elif 'first' in v:
|
|
||||||
val = (self.FIRST, None)
|
|
||||||
else:
|
|
||||||
raise ValueError('Insertions must contain one of {first, before, after, last}')
|
|
||||||
|
|
||||||
prob = v['probability']
|
|
||||||
values.append(val)
|
|
||||||
probs.append(prob)
|
|
||||||
|
|
||||||
# If the probabilities don't sum to 1, add a "do nothing" action
|
|
||||||
if not isclose(sum(probs), 1.0):
|
|
||||||
probs.append(1.0 - sum(probs))
|
|
||||||
values.append((None, None))
|
|
||||||
|
|
||||||
component_insertions[component] = values, cdf(probs)
|
|
||||||
return component_insertions
|
return component_insertions
|
||||||
|
|
||||||
|
def conditional_insertion_probs(self, conditionals):
|
||||||
|
conditional_insertions = defaultdict(OrderedDict)
|
||||||
|
for component, value in six.iteritems(conditionals):
|
||||||
|
if 'conditional' in value:
|
||||||
|
conditionals = value['conditional']
|
||||||
|
|
||||||
|
for c in conditionals:
|
||||||
|
other = c['component']
|
||||||
|
conditional_insertions[component][other] = self.insertion_distribution(c['probabilities'])
|
||||||
|
return conditional_insertions
|
||||||
|
|
||||||
def setup_insertion_probabilities(self):
|
def setup_insertion_probabilities(self):
|
||||||
self.global_insertions = self.insertion_probs(self.config['insertions'])
|
config = self.config['insertions']
|
||||||
|
self.global_insertions = self.insertion_probs(config)
|
||||||
|
self.global_conditionals = self.conditional_insertion_probs(config)
|
||||||
|
|
||||||
self.country_insertions = {}
|
self.country_insertions = {}
|
||||||
|
self.country_conditionals = {}
|
||||||
|
|
||||||
for country, config in six.iteritems(self.country_configs):
|
for country, config in six.iteritems(self.country_configs):
|
||||||
if 'insertions' in config:
|
if 'insertions' in config:
|
||||||
self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
|
self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
|
||||||
|
self.country_conditionals[country.lower()] = self.conditional_insertion_probs(config['insertions'])
|
||||||
|
|
||||||
self.language_insertions = {}
|
self.language_insertions = {}
|
||||||
|
self.language_conditionals = {}
|
||||||
|
|
||||||
for language, config in six.iteritems(self.language_configs):
|
for language, config in six.iteritems(self.language_configs):
|
||||||
if 'insertions' in config:
|
if 'insertions' in config:
|
||||||
self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])
|
self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])
|
||||||
|
self.language_conditionals[language.lower()] = self.conditional_insertion_probs(config['insertions'])
|
||||||
|
|
||||||
def country_template(self, c):
|
def country_template(self, c):
|
||||||
return self.country_formats.get(c, self.country_formats['default'])
|
return self.country_formats.get(c, self.country_formats['default'])
|
||||||
@@ -262,6 +293,9 @@ class AddressFormatter(object):
|
|||||||
""" For constructing """
|
""" For constructing """
|
||||||
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
|
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
|
||||||
|
|
||||||
|
def tag_token(self, key):
|
||||||
|
return '{{{{{{{key}}}}}}}'.format(key=key)
|
||||||
|
|
||||||
def insert_component(self, template, tag, before=None, after=None, first=False, last=False, separate=True, is_reverse=False):
|
def insert_component(self, template, tag, before=None, after=None, first=False, last=False, separate=True, is_reverse=False):
|
||||||
if not before and not after and not first and not last:
|
if not before and not after and not first and not last:
|
||||||
return
|
return
|
||||||
@@ -284,7 +318,7 @@ class AddressFormatter(object):
|
|||||||
skip_next_non_token = False
|
skip_next_non_token = False
|
||||||
new_components = []
|
new_components = []
|
||||||
|
|
||||||
tag_token = '{{{{{{{key}}}}}}}'.format(key=tag)
|
tag_token = self.tag_token(tag)
|
||||||
|
|
||||||
parsed = pystache.parse(safe_decode(template))
|
parsed = pystache.parse(safe_decode(template))
|
||||||
num_tokens = len(parsed._parse_tree)
|
num_tokens = len(parsed._parse_tree)
|
||||||
@@ -429,26 +463,52 @@ class AddressFormatter(object):
|
|||||||
|
|
||||||
cache_keys = []
|
cache_keys = []
|
||||||
|
|
||||||
for component in components:
|
for component in sorted(components, key=self.component_order.get):
|
||||||
scope = country
|
scope = country
|
||||||
insertions = nested_get(self.country_insertions, (country, component), default=None)
|
insertions = nested_get(self.country_insertions, (country, component), default=None)
|
||||||
|
conditionals = nested_get(self.country_conditionals, (country, component), default=None)
|
||||||
|
|
||||||
if insertions is None and language:
|
if insertions is None and language:
|
||||||
country_language = '{}_{}'.format(country, language)
|
country_language = '{}_{}'.format(country, language)
|
||||||
insertions = nested_get(self.country_insertions, (country_language, component), default=None)
|
insertions = nested_get(self.country_insertions, (country_language, component), default=None)
|
||||||
scope = country_language
|
scope = country_language
|
||||||
|
|
||||||
|
if conditionals is None and language:
|
||||||
|
conditionals = nested_get(self.country_conditionals, (country_language, component), default=None)
|
||||||
|
|
||||||
if insertions is None and language:
|
if insertions is None and language:
|
||||||
insertions = nested_get(self.language_insertions, (language, component), default=None)
|
insertions = nested_get(self.language_insertions, (language, component), default=None)
|
||||||
scope = language
|
scope = language
|
||||||
|
|
||||||
|
if conditionals is None and language:
|
||||||
|
conditionals = nested_get(self.language_conditionals, (language, component), default=None)
|
||||||
|
|
||||||
if insertions is None:
|
if insertions is None:
|
||||||
insertions = nested_get(self.global_insertions, (component,), default=None)
|
insertions = nested_get(self.global_insertions, (component,), default=None)
|
||||||
scope = None
|
scope = None
|
||||||
|
|
||||||
|
if conditionals is None:
|
||||||
|
conditionals = nested_get(self.global_conditionals, (component,), default=None)
|
||||||
|
|
||||||
if insertions is not None:
|
if insertions is not None:
|
||||||
values, probs = insertions
|
conditional_insertions = None
|
||||||
order, other = weighted_choice(values, probs)
|
if conditionals is not None:
|
||||||
|
for k, v in six.iteritems(conditionals):
|
||||||
|
if k in components:
|
||||||
|
conditional_insertions = v
|
||||||
|
break
|
||||||
|
|
||||||
|
order, other = None, None
|
||||||
|
|
||||||
|
# Check the conditional probabilities first
|
||||||
|
if conditional_insertions is not None:
|
||||||
|
values, probs = conditional_insertions
|
||||||
|
order, other = weighted_choice(values, probs)
|
||||||
|
|
||||||
|
# If there are no conditional probabilities or the "default" value was chosen, sample from the marginals
|
||||||
|
if other is None:
|
||||||
|
values, probs = insertions
|
||||||
|
order, other = weighted_choice(values, probs)
|
||||||
|
|
||||||
insertion_id = (scope, component, order, other)
|
insertion_id = (scope, component, order, other)
|
||||||
cache_keys.append(insertion_id)
|
cache_keys.append(insertion_id)
|
||||||
@@ -459,9 +519,9 @@ class AddressFormatter(object):
|
|||||||
template = self.template_cache[cache_key]
|
template = self.template_cache[cache_key]
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if order == self.BEFORE and other in components:
|
if order == self.BEFORE and self.tag_token(other) in template:
|
||||||
template = self.insert_component(template, component, before=other)
|
template = self.insert_component(template, component, before=other)
|
||||||
elif order == self.AFTER and other in components:
|
elif order == self.AFTER and self.tag_token(other) in template:
|
||||||
template = self.insert_component(template, component, after=other)
|
template = self.insert_component(template, component, after=other)
|
||||||
elif order == self.LAST:
|
elif order == self.LAST:
|
||||||
template = self.insert_component(template, component, last=True)
|
template = self.insert_component(template, component, last=True)
|
||||||
@@ -548,7 +608,7 @@ class AddressFormatter(object):
|
|||||||
|
|
||||||
return template
|
return template
|
||||||
|
|
||||||
def format_address(self, country, components, language=None,
|
def format_address(self, components, country, language,
|
||||||
minimal_only=True, tag_components=True, replace_aliases=True):
|
minimal_only=True, tag_components=True, replace_aliases=True):
|
||||||
template = self.get_template(country, language=language)
|
template = self.get_template(country, language=language)
|
||||||
if not template:
|
if not template:
|
||||||
|
|||||||
Reference in New Issue
Block a user