[osm] adding the excellent ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix Mojibake, etc. in address components

This commit is contained in:
Al
2016-12-26 21:04:16 -05:00
parent 7ec368542b
commit 8abbb273b2
2 changed files with 14 additions and 0 deletions

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import csv import csv
import ftfy
import itertools import itertools
import os import os
import random import random
@@ -217,6 +218,9 @@ class OpenAddressesFormatter(object):
return self.unit_type_regexes[language].sub(six.u(''), value) return self.unit_type_regexes[language].sub(six.u(''), value)
return value return value
def fix_component_encodings(self, components):
    """Return a new dict of address components with repaired text encodings.

    Every value is run through ``safe_decode`` and then
    ``ftfy.fix_encoding`` (added to fix mojibake in address components).
    Keys are carried over untouched and the input mapping is not modified.

    :param components: mapping of component name to component value
    :return: a fresh dict with the same keys and encoding-fixed values
    """
    fixed = {}
    for key, value in six.iteritems(components):
        # NOTE(review): safe_decode presumably yields unicode text, which is
        # what ftfy operates on -- confirm against its definition.
        decoded = safe_decode(value)
        fixed[key] = ftfy.fix_encoding(decoded)
    return fixed
def formatted_addresses(self, country_dir, path, configs, tag_components=True): def formatted_addresses(self, country_dir, path, configs, tag_components=True):
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs)) abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0) separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
@@ -344,6 +348,8 @@ class OpenAddressesFormatter(object):
continue continue
candidate_languages = candidate_languages.items() candidate_languages = candidate_languages.items()
components = self.fix_component_encodings(components)
if language is None: if language is None:
language = AddressComponents.address_language(components, candidate_languages) language = AddressComponents.address_language(components, candidate_languages)

View File

@@ -5,6 +5,7 @@ import random
import re import re
import six import six
import sys import sys
import ftfy
import yaml import yaml
from collections import defaultdict, OrderedDict, Counter from collections import defaultdict, OrderedDict, Counter
@@ -196,6 +197,9 @@ class OSMAddressFormatter(object):
sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields} sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields}
return sub_building_components return sub_building_components
def fix_component_encodings(self, tags):
    """Build a copy of ``tags`` whose values have had their encodings fixed.

    Each value is passed to ``safe_decode`` and the result to
    ``ftfy.fix_encoding`` (added to fix mojibake in address components).
    Tag names are preserved as-is and the original mapping is left unchanged.

    :param tags: mapping of tag name to tag value
    :return: a new dict with identical keys and encoding-fixed values
    """
    def repair(raw_value):
        # NOTE(review): safe_decode presumably returns unicode text suitable
        # for ftfy -- confirm against its definition.
        return ftfy.fix_encoding(safe_decode(raw_value))

    return dict((name, repair(raw_value)) for name, raw_value in six.iteritems(tags))
def normalized_street_name(self, address_components, country=None, language=None): def normalized_street_name(self, address_components, country=None, language=None):
street = address_components.get(AddressFormatter.ROAD) street = address_components.get(AddressFormatter.ROAD)
if street and ',' in street: if street and ',' in street:
@@ -667,6 +671,8 @@ class OSMAddressFormatter(object):
component_order = AddressFormatter.component_order[component_name] component_order = AddressFormatter.component_order[component_name]
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY] sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
revised_tags = self.fix_component_encodings(revised_tags)
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'): for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
if more_than_one_official_language: if more_than_one_official_language:
name = tags.get(name_tag) name = tags.get(name_tag)
@@ -984,6 +990,8 @@ class OSMAddressFormatter(object):
venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0)) venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0))
add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob) add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob)
revised_tags = self.fix_component_encodings(revised_tags)
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language, address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
num_floors=num_floors, num_basements=num_basements, num_floors=num_floors, num_basements=num_basements,
zone=zone, add_sub_building_components=add_sub_building_components, zone=zone, add_sub_building_components=add_sub_building_components,