[osm] adding the excellent ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix Mojibake, etc. in address components
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
|
import ftfy
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
@@ -217,6 +218,9 @@ class OpenAddressesFormatter(object):
|
|||||||
return self.unit_type_regexes[language].sub(six.u(''), value)
|
return self.unit_type_regexes[language].sub(six.u(''), value)
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
def fix_component_encodings(self, components):
    """Return a copy of ``components`` with every value's encoding repaired.

    Each value is first passed through ``safe_decode`` and then through
    ``ftfy.fix_encoding`` (used here to fix mojibake in address
    components). Keys are carried over unchanged and the input mapping
    is not modified.
    """
    fixed = {}
    for key, value in six.iteritems(components):
        decoded = safe_decode(value)
        fixed[key] = ftfy.fix_encoding(decoded)
    return fixed
|
||||||
|
|
||||||
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
|
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
|
||||||
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
|
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
|
||||||
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
|
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
|
||||||
@@ -344,6 +348,8 @@ class OpenAddressesFormatter(object):
|
|||||||
continue
|
continue
|
||||||
candidate_languages = candidate_languages.items()
|
candidate_languages = candidate_languages.items()
|
||||||
|
|
||||||
|
components = self.fix_component_encodings(components)
|
||||||
|
|
||||||
if language is None:
|
if language is None:
|
||||||
language = AddressComponents.address_language(components, candidate_languages)
|
language = AddressComponents.address_language(components, candidate_languages)
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import random
|
|||||||
import re
|
import re
|
||||||
import six
|
import six
|
||||||
import sys
|
import sys
|
||||||
|
import ftfy
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from collections import defaultdict, OrderedDict, Counter
|
from collections import defaultdict, OrderedDict, Counter
|
||||||
@@ -196,6 +197,9 @@ class OSMAddressFormatter(object):
|
|||||||
sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields}
|
sub_building_components = {k: v for k, v in six.iteritems(sub_building_components) if k in AddressFormatter.address_formatter_fields}
|
||||||
return sub_building_components
|
return sub_building_components
|
||||||
|
|
||||||
|
def fix_component_encodings(self, tags):
    """Build a new dict of ``tags`` whose values have had their encoding fixed.

    Every value goes through ``safe_decode`` followed by
    ``ftfy.fix_encoding`` (used here to fix mojibake in address
    components). Keys are left as they are, and the ``tags`` mapping
    passed in is not mutated.
    """
    repaired_tags = {}
    for tag_name, raw_value in six.iteritems(tags):
        text = safe_decode(raw_value)
        repaired_tags[tag_name] = ftfy.fix_encoding(text)
    return repaired_tags
|
||||||
|
|
||||||
def normalized_street_name(self, address_components, country=None, language=None):
|
def normalized_street_name(self, address_components, country=None, language=None):
|
||||||
street = address_components.get(AddressFormatter.ROAD)
|
street = address_components.get(AddressFormatter.ROAD)
|
||||||
if street and ',' in street:
|
if street and ',' in street:
|
||||||
@@ -667,6 +671,8 @@ class OSMAddressFormatter(object):
|
|||||||
component_order = AddressFormatter.component_order[component_name]
|
component_order = AddressFormatter.component_order[component_name]
|
||||||
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
|
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
|
||||||
|
|
||||||
|
revised_tags = self.fix_component_encodings(revised_tags)
|
||||||
|
|
||||||
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
|
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
|
||||||
if more_than_one_official_language:
|
if more_than_one_official_language:
|
||||||
name = tags.get(name_tag)
|
name = tags.get(name_tag)
|
||||||
@@ -984,6 +990,8 @@ class OSMAddressFormatter(object):
|
|||||||
venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0))
|
venue_sub_building_prob = float(nested_get(self.config, ('venues', 'sub_building_probability'), default=0.0))
|
||||||
add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob)
|
add_sub_building_components = AddressFormatter.HOUSE_NUMBER in revised_tags and (AddressFormatter.HOUSE not in revised_tags or random.random() < venue_sub_building_prob)
|
||||||
|
|
||||||
|
revised_tags = self.fix_component_encodings(revised_tags)
|
||||||
|
|
||||||
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
|
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
|
||||||
num_floors=num_floors, num_basements=num_basements,
|
num_floors=num_floors, num_basements=num_basements,
|
||||||
zone=zone, add_sub_building_components=add_sub_building_components,
|
zone=zone, add_sub_building_components=add_sub_building_components,
|
||||||
|
|||||||
Reference in New Issue
Block a user