From ed0b49884e4b570f073df5acf0359f42e5212e32 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 Aug 2016 00:38:43 -0400 Subject: [PATCH] [openaddresses] Changes to OA config utilizing some of the new cleanup options. Adding language to brussels-fr and brussels-nl, adding New York and New Jersey statewide with the understanding that OSM components will be added in NJ and postcodes will be stripped of letters in NY --- resources/parser/data_sets/openaddresses.yaml | 39 +++++++++++++------ scripts/geodata/openaddresses/formatter.py | 8 ++-- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/resources/parser/data_sets/openaddresses.yaml b/resources/parser/data_sets/openaddresses.yaml index a1b0c7c1..10221283 100644 --- a/resources/parser/data_sets/openaddresses.yaml +++ b/resources/parser/data_sets/openaddresses.yaml @@ -8,21 +8,25 @@ global: separate_street_probability: 0.2 abbreviate_unit_probability: 0.3 separate_unit_probability: 0.2 - field_map: + + fields: - field_name: NUMBER component: house_number - field_name: STREET component: road + - field_name: UNIT + component: unit - field_name: CITY component: city - - field_name: REGION - component: state - field_name: POSTCODE component: postcode + # Units have strong restrictions, have to be a number or hyphenated number + non_numeric_units: false + countries: au: - field_map: + fields: - field_name: NUMBER component: house_number - field_name: STREET @@ -38,24 +42,20 @@ countries: files: - filename: countrywide.csv + non_numeric_units: true nz: - field_map: - - field_name: NUMBER - component: house_number - - field_name: STREET - component: road - - field_name: POSTCODE - component: postcode files: - filename: countrywide.csv + - filename: city_of_palmerston_north.csv be: subdirs: wa: files: - filename: brussels-fr.csv + language: fr - filename: brussels-nl.csv - + language: nl us: cldr_country_probability: 0.05 subdirs: @@ -115,3 +115,18 @@ countries: - filename: ventura.csv - filename: yolo.csv - filename: yuba.csv + + nj: + add: + state: NJ + files: + - filename: statewide.csv + add_osm_boundaries: true + + ny: + add: + state: NY + files: + - filename: statewide.csv + strip_alpha_from_postcode: true + diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 7cbf2d36..8f0f5502 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -111,17 +111,17 @@ class OpenAddressesFormatter(object): add_components = self.get_property('add', *configs) - field_map = self.get_property('field_map', *configs) - if not field_map: + fields = self.get_property('fields', *configs) + if not fields: return - field_map = {f['field_name']: f['component'] for f in field_map} + fields = {f['field_name']: f['component'] for f in fields} f = open(path) reader = unicode_csv_reader(f) headers = reader.next() - header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map} + header_indices = {i: fields[k] for i, k in enumerate(headers) if k in fields} latitude_index = headers.index('LAT') longitude_index = headers.index('LON')