"""Strip generic administrative prefixes/suffixes from place names.

Example: "London Borough of Camden" -> "Camden",
"Washington Township" -> "Washington".
"""
import re

# Literals are explicitly unicode (u'...') so patterns and replacements are
# text on both Python 2 and Python 3.

# Leading phrases to strip. Each entry ends with a space so only whole-word
# prefixes match ("City of X", not "Cityofx").
name_prefixes = [u'{} '.format(s) for s in (
    u'city of',
    u'township of',
    u'municipality of',
    u'borough of',
    u'london borough of',
    u'town of',
)]

# Trailing phrases to strip. Each entry starts with a space so only
# whole-word suffixes match ("X Township").
name_suffixes = [u' {}'.format(s) for s in (
    u'township',
    u'municipality',
)]

# The alternation is wrapped in a non-capturing group so the anchor applies
# to every alternative. Without the group, "^a|b|c" anchors only "a" and
# "b|c$" anchors only "c", so the other phrases would match anywhere in the
# name.
name_prefix_regex = re.compile(
    u'^(?:{})'.format(u'|'.join(name_prefixes)), re.I | re.UNICODE)
name_suffix_regex = re.compile(
    u'(?:{})$'.format(u'|'.join(name_suffixes)), re.I | re.UNICODE)


def _strip_affix(affix_regex, name):
    """Decode *name* with safe_decode and delete what *affix_regex* matches."""
    # Function-scope import: the regexes above stay importable without the
    # rest of the geodata package on the path.
    from geodata.encoding import safe_decode
    return affix_regex.sub(u'', safe_decode(name))


def replace_name_prefixes(name):
    """Return *name* with a leading phrase from ``name_prefixes`` removed.

    Matching is case-insensitive and anchored at the start of the name.
    Names that do not start with a listed prefix are returned unchanged
    (after decoding with safe_decode).
    """
    return _strip_affix(name_prefix_regex, name)


def replace_name_suffixes(name):
    """Return *name* with a trailing phrase from ``name_suffixes`` removed.

    Matching is case-insensitive and anchored at the end of the name.
    Names that do not end with a listed suffix are returned unchanged
    (after decoding with safe_decode).
    """
    return _strip_affix(name_suffix_regex, name)
'tsv_no_quote') @@ -709,6 +719,18 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if component not in address_components and random.random() < 0.5: address_components[component] = neighborhoods[0] + ''' + Name normalization + ------------------ + + Probabilistically strip standard prefixes/suffixes e.g. "London Borough of" + ''' + for component in REPLACE_COMPONENTS: + name = address_components[component] + replacement = replace_name_prefixes(replace_name_suffixes()) + if replacement != name and random.random() < 0.6: + address_components[component] = replacement + # Version with all components formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)