[openaddresses] In some OpenAddresses data sets, the house number field is just a copy of the street name, so discard non-numeric house numbers to be safe

This commit is contained in:
Al
2016-07-31 01:12:04 -04:00
parent f8e9d39e12
commit cec4914233

View File

@@ -10,6 +10,7 @@ from geodata.address_formatting.formatter import AddressFormatter
from geodata.addresses.components import AddressComponents
from geodata.countries.names import country_names
from geodata.math.sampling import cdf, weighted_choice
from geodata.text.utils import is_numeric
from geodata.csv_utils import tsv_string, unicode_csv_reader
@@ -125,6 +126,10 @@ class OpenAddressesFormatter(object):
separate_prob=separate_street_prob)
components[AddressFormatter.ROAD] = street
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number and not is_numeric(house_number):
components.pop(AddressFormatter.HOUSE_NUMBER)
unit = components.get(AddressFormatter.UNIT, None)
if unit is not None:
unit = abbreviate(unit_types_gazetteer, unit, language,