From 294101ad80e9f701e31ce5af1bc8505d81e6da16 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 3 Sep 2015 17:46:57 -0400
Subject: [PATCH] [osm] Treating components that are all punctuation as blank in address parsing (e.g. a single comma)

---
 scripts/geodata/osm/osm_address_training_data.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 4fdacecc..f80552b4 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -244,6 +244,13 @@ class AddressFormatter(object):
             text = re.sub(regex, replacement, text)
         return text
 
+    def tokenize_component(self, v):
+        """Return the token strings of v, or [] when no token's type sorts before PERIOD (all punctuation)."""
+        tokens = list(tokenize(v))
+        if any(c.value < token_types.PERIOD.value for c, t in tokens):
+            return [t for c, t in tokens]
+        return []
+
     def format_address(self, country, components, minimal_only=True, tag_components=True):
         template = self.config.get(country.upper())
         if not template:
@@ -263,10 +270,10 @@ class AddressFormatter(object):
 
         if tag_components:
             components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
-                                        for c, t in tokenize(v)])
+                                        for t in self.tokenize_component(v)])
                           for k, v in components.iteritems()}
         else:
-            components = {k: u' '.join([t for c, t in tokenize(v)])
+            components = {k: u' '.join(self.tokenize_component(v))
                           for k, v in components.iteritems()}
 
         text = self.render_template(template_text, **components)