[osm] Add the excellent ftfy library (https://github.com/LuminosoInsight/python-ftfy) to fix mojibake and similar encoding problems in address components
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import ftfy
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
@@ -217,6 +218,9 @@ class OpenAddressesFormatter(object):
|
||||
return self.unit_type_regexes[language].sub(six.u(''), value)
|
||||
return value
|
||||
|
||||
def fix_component_encodings(self, components):
    """Build a new dict with the same keys as ``components``.

    Each value is first passed through ``safe_decode`` and the result is
    then handed to ``ftfy.fix_encoding``. The input mapping is not modified.
    """
    fixed = {}
    for key, value in six.iteritems(components):
        decoded = safe_decode(value)
        fixed[key] = ftfy.fix_encoding(decoded)
    return fixed
|
||||
|
||||
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
|
||||
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
|
||||
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
|
||||
@@ -344,6 +348,8 @@ class OpenAddressesFormatter(object):
|
||||
continue
|
||||
candidate_languages = candidate_languages.items()
|
||||
|
||||
components = self.fix_component_encodings(components)
|
||||
|
||||
if language is None:
|
||||
language = AddressComponents.address_language(components, candidate_languages)
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import random
|
||||
import re
|
||||
import six
|
||||
import sys
|
||||
import ftfy
|
||||
import yaml
|
||||
|
||||
from collections import defaultdict, OrderedDict, Counter
|
||||
@@ -196,6 +197,9 @@ class OSMAddressFormatter(object):
|
||||
sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields}
|
||||
return sub_building_components
|
||||
|
||||
def fix_component_encodings(self, tags):
    """Return a fresh dict of ``tags`` with repaired values.

    Keys are kept unchanged; every value is run through ``safe_decode``
    and then ``ftfy.fix_encoding``. The ``tags`` mapping itself is left
    untouched.
    """
    repaired = {}
    for tag_name, raw_value in six.iteritems(tags):
        text = safe_decode(raw_value)
        repaired[tag_name] = ftfy.fix_encoding(text)
    return repaired
|
||||
|
||||
def normalized_street_name(self, address_components, country=None, language=None):
|
||||
street = address_components.get(AddressFormatter.ROAD)
|
||||
if street and ',' in street:
|
||||
@@ -667,6 +671,8 @@ class OSMAddressFormatter(object):
|
||||
component_order = AddressFormatter.component_order[component_name]
|
||||
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
|
||||
|
||||
revised_tags = self.fix_component_encodings(revised_tags)
|
||||
|
||||
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
|
||||
if more_than_one_official_language:
|
||||
name = tags.get(name_tag)
|
||||
@@ -984,6 +990,8 @@ class OSMAddressFormatter(object):
|
||||
venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0))
|
||||
add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob)
|
||||
|
||||
revised_tags = self.fix_component_encodings(revised_tags)
|
||||
|
||||
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
|
||||
num_floors=num_floors, num_basements=num_basements,
|
||||
zone=zone, add_sub_building_components=add_sub_building_components,
|
||||
|
||||
Reference in New Issue
Block a user