[osm] adding the excellent ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix Mojibake, etc. in address components

This commit is contained in:
Al
2016-12-26 21:04:16 -05:00
parent 7ec368542b
commit 8abbb273b2
2 changed files with 14 additions and 0 deletions

View File

@@ -5,6 +5,7 @@ import random
import re
import six
import sys
import ftfy
import yaml
from collections import defaultdict, OrderedDict, Counter
@@ -196,6 +197,9 @@ class OSMAddressFormatter(object):
sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields}
return sub_building_components
def fix_component_encodings(self, tags):
    """Return a new dict of ``tags`` with each value's encoding repaired.

    Every value is first passed through ``safe_decode`` and the result is
    handed to ``ftfy.fix_encoding``; keys are carried over unchanged. A
    fresh plain dict is built, so the mapping passed in is not modified.

    NOTE(review): ``safe_decode`` is defined outside this view -- presumably
    it coerces each value to unicode text before ftfy sees it; confirm.
    """
    repaired = {}
    for key, value in six.iteritems(tags):
        decoded = safe_decode(value)
        repaired[key] = ftfy.fix_encoding(decoded)
    return repaired
def normalized_street_name(self, address_components, country=None, language=None):
street = address_components.get(AddressFormatter.ROAD)
if street and ',' in street:
@@ -667,6 +671,8 @@ class OSMAddressFormatter(object):
component_order = AddressFormatter.component_order[component_name]
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
revised_tags = self.fix_component_encodings(revised_tags)
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
if more_than_one_official_language:
name = tags.get(name_tag)
@@ -984,6 +990,8 @@ class OSMAddressFormatter(object):
venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0))
add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob)
revised_tags = self.fix_component_encodings(revised_tags)
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
num_floors=num_floors, num_basements=num_basements,
zone=zone, add_sub_building_components=add_sub_building_components,