diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 0aa64605..b87eb037 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import csv +import ftfy import itertools import os import random @@ -217,6 +218,9 @@ class OpenAddressesFormatter(object): return self.unit_type_regexes[language].sub(six.u(''), value) return value + def fix_component_encodings(self, components): + return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)} + def formatted_addresses(self, country_dir, path, configs, tag_components=True): abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs)) separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0) @@ -344,6 +348,8 @@ class OpenAddressesFormatter(object): continue candidate_languages = candidate_languages.items() + components = self.fix_component_encodings(components) + if language is None: language = AddressComponents.address_language(components, candidate_languages) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index e449e83a..0fb7536d 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -5,6 +5,7 @@ import random import re import six import sys +import ftfy import yaml from collections import defaultdict, OrderedDict, Counter @@ -196,6 +197,9 @@ class OSMAddressFormatter(object): sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields} return sub_building_components + def fix_component_encodings(self, tags): + return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(tags)} + def normalized_street_name(self, address_components, country=None, language=None): street = address_components.get(AddressFormatter.ROAD) if street and ',' in street: @@ -667,6 +671,8 @@ class OSMAddressFormatter(object): component_order = AddressFormatter.component_order[component_name] sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY] + revised_tags = self.fix_component_encodings(revised_tags) + for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'): if more_than_one_official_language: name = tags.get(name_tag) @@ -984,6 +990,8 @@ class OSMAddressFormatter(object): venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0)) add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob) + revised_tags = self.fix_component_encodings(revised_tags) + address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language, num_floors=num_floors, num_basements=num_basements, zone=zone, add_sub_building_components=add_sub_building_components,