From f7c8a6309303f0518cc185f78d6d2455a60895a3 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 1 Mar 2017 15:51:56 -0500 Subject: [PATCH] [addresses] making most of the methods on AddressComponents classmethods if possible so they can be accessed easily for sources not using OSM polygon lookup, etc. --- scripts/geodata/addresses/components.py | 213 ++++++++++++++---------- 1 file changed, 123 insertions(+), 90 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 1458efd3..7987217b 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -171,13 +171,12 @@ class AddressComponents(object): AddressFormatter.UNIT: Unit, } + config = yaml.load(open(PARSER_DEFAULT_CONFIG)) + # Non-admin component dropout + address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(config['dropout'])} + def __init__(self, osm_admin_rtree, neighborhoods_rtree, places_index): - self.config = yaml.load(open(PARSER_DEFAULT_CONFIG)) - self.setup_component_dependencies() - # Non-admin component dropout - self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])} - self.osm_admin_rtree = osm_admin_rtree self.neighborhoods_rtree = neighborhoods_rtree self.places_index = places_index @@ -266,26 +265,30 @@ class AddressComponents(object): def osm_reverse_geocoded_components(self, latitude, longitude): return self.osm_admin_rtree.point_in_poly(latitude, longitude, return_all=True) - def osm_country_and_languages(self, osm_components): + @classmethod + def osm_country_and_languages(cls, osm_components): return OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components) - def osm_component_is_village(self, component): + @classmethod + def osm_component_is_village(cls, component): return component.get('place', '').lower() in ('locality', 'village', 'hamlet') - def categorize_osm_component(self, country, props, containing_components): + @classmethod + def categorize_osm_component(cls, country, props, containing_components): containing_ids = [(c['type'], c['id']) for c in containing_components if 'type' in c and 'id' in c] return osm_address_components.component_from_properties(country, props, containing=containing_ids) - def categorized_osm_components(self, country, osm_components): + @classmethod + def categorized_osm_components(cls, country, osm_components): components = [] for i, props in enumerate(osm_components): name = props.get('name') if not name: continue - component = self.categorize_osm_component(country, props, osm_components) + component = cls.categorize_osm_component(country, props, osm_components) if component is not None: components.append((props, component)) @@ -333,7 +336,8 @@ class AddressComponents(object): return language - def pick_random_name_key(self, props, component, suffix=''): + @classmethod + def pick_random_name_key(cls, props, component, suffix=''): ''' Random name ----------- @@ -345,7 +349,8 @@ class AddressComponents(object): key = ''.join((raw_key, suffix)) if ':' not in raw_key else raw_key return key, raw_key - def all_names(self, props, languages, component=None, keys=ALL_OSM_NAME_KEYS): + @classmethod + def all_names(cls, props, languages, component=None, keys=ALL_OSM_NAME_KEYS): # Preserve uniqueness and order valid_names, _ = boundary_names.name_key_dist(props, component) names = OrderedDict() @@ -362,7 +367,8 @@ class AddressComponents(object): names[v] = None return names.keys() - def place_names_and_components(self, name, osm_components, country=None, languages=None): + @classmethod + def place_names_and_components(cls, name, osm_components, country=None, languages=None): names = set() components = defaultdict(set) @@ -373,7 +379,7 @@ class AddressComponents(object): component = osm_address_components.component_from_properties(country, props, containing=containing_ids) - component_names = set([n.lower() for n in self.all_names(props, languages or [] )]) + component_names = set([n.lower() for n in cls.all_names(props, languages or [] )]) valid_component_names = set() for n in component_names: @@ -408,7 +414,8 @@ class AddressComponents(object): return names, components - def strip_components(self, name, osm_components, country, languages): + @classmethod + def strip_components(cls, name, osm_components, country, languages): if not name or not osm_components: return name @@ -417,7 +424,7 @@ class AddressComponents(object): tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE, token_options=TOKEN_OPTIONS_DROP_PERIODS) - names, components = self.place_names_and_components(name, osm_components, country=country, languages=languages) + names, components = cls.place_names_and_components(name, osm_components, country=country, languages=languages) phrase_filter = PhraseFilter([(n, '') for n in names]) @@ -439,7 +446,8 @@ class AddressComponents(object): parens_regex = re.compile('\(.*?\)') - def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False): + @classmethod + def normalized_place_name(cls, name, tag, osm_components, country=None, languages=None, phrase_from_component=False): ''' Multiple place names -------------------- @@ -455,7 +463,7 @@ class AddressComponents(object): tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE, token_options=TOKEN_OPTIONS_DROP_PERIODS) - names, components = self.place_names_and_components(name, osm_components, country=country, languages=languages) + names, components = cls.place_names_and_components(name, osm_components, country=country, languages=languages) phrase_filter = PhraseFilter([(n, '') for n in names]) @@ -501,8 +509,8 @@ class AddressComponents(object): else: total_tokens += 1 - if self.parens_regex.search(name): - name = self.parens_regex.sub(six.u(''), name).strip() + if cls.parens_regex.search(name): + name = cls.parens_regex.sub(six.u(''), name).strip() # If the name contains a comma, stop and only use the phrase before the comma if ',' in name: @@ -510,13 +518,14 @@ class AddressComponents(object): return name - def normalize_place_names(self, address_components, osm_components, country=None, languages=None, phrase_from_component=False): + @classmethod + def normalize_place_names(cls, address_components, osm_components, country=None, languages=None, phrase_from_component=False): for key in list(address_components): name = address_components[key] - if key in self.BOUNDARY_COMPONENTS: - name = self.normalized_place_name(name, key, osm_components, - country=country, languages=languages, - phrase_from_component=phrase_from_component) + if key in cls.BOUNDARY_COMPONENTS: + name = cls.normalized_place_name(name, key, osm_components, + country=country, languages=languages, + phrase_from_component=phrase_from_component) if name is not None: address_components[key] = name @@ -529,7 +538,8 @@ class AddressComponents(object): self.formatter.aliases.replace(address_components) return address_components - def combine_fields(self, address_components, language, country=None, generated=None): + @classmethod + def combine_fields(cls, address_components, language, country=None, generated=None): combo_config = address_config.get_property('components.combinations', language, country=country, default={}) combos = [] @@ -582,7 +592,8 @@ class AddressComponents(object): address_components[new_label] = new_value return set(components) - def generated_type(self, component, existing_components, language, country=None): + @classmethod + def generated_type(cls, component, existing_components, language, country=None): component_config = address_config.get_property('components.{}'.format(component), language, country=country) if not component_config: return None @@ -600,7 +611,7 @@ class AddressComponents(object): values = [] probs = [] - for num_type in (self.NULL_PHRASE, self.ALPHANUMERIC_PHRASE, self.STANDALONE_PHRASE): + for num_type in (cls.NULL_PHRASE, cls.ALPHANUMERIC_PHRASE, cls.STANDALONE_PHRASE): key = '{}_probability'.format(num_type) prob = prob_dist.get(key) if prob is not None: @@ -617,12 +628,13 @@ class AddressComponents(object): probs = cdf(probs) num_type = weighted_choice(values, probs) - if num_type == self.NULL_PHRASE: + if num_type == cls.NULL_PHRASE: return None else: return num_type - def get_component_phrase(self, cls, component, language, country=None): + @classmethod + def get_component_phrase(cls, component, language, country=None): component = safe_decode(component) if not is_numeric(component) and not (component.isalpha() and len(component) == 1): return None @@ -633,15 +645,17 @@ class AddressComponents(object): else: return None - def normalize_sub_building_components(self, address_components, language, country=None): - for component, cls in six.iteritems(self.sub_building_component_class_map): + @classmethod + def normalize_sub_building_components(cls, address_components, language, country=None): + for component, cls in six.iteritems(cls.sub_building_component_class_map): if component in address_components: val = address_components[component] - new_val = self.get_component_phrase(cls, val, language, country) + new_val = cls.get_component_phrase(cls, val, language, country) if new_val is not None: address_components[component] = new_val - def cldr_country_name(self, country_code, language): + @classmethod + def cldr_country_name(cls, country_code, language): ''' Country names ------------- @@ -666,7 +680,7 @@ class AddressComponents(object): 3. This is implicit, but with probability (1-b)(1-a), keep the country code ''' - cldr_config = nested_get(self.config, ('country', 'cldr')) + cldr_config = nested_get(cls.config, ('country', 'cldr')) alpha_2_iso_code_prob = float(cldr_config['iso_alpha_2_code_probability']) localized_name_prob = float(cldr_config['localized_name_probability']) @@ -1014,26 +1028,28 @@ class AddressComponents(object): value = cls.strip_english_unit_number_suffix(value) return value - def abbreviated_state(self, state, country, language): - abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability'))) + @classmethod + def abbreviated_state(cls, state, country, language): + abbreviate_state_prob = float(nested_get(cls.config, ('state', 'abbreviated_probability'))) if random.random() < abbreviate_state_prob: state = state_abbreviations.get_abbreviation(country, language, state, default=state) return state - def abbreviate_admin_components(self, address_components, country, language, hyphenation=True): - abbreviate_toponym_prob = float(nested_get(self.config, ('boundaries', 'abbreviate_toponym_probability'))) + @classmethod + def abbreviate_admin_components(cls, address_components, country, language, hyphenation=True): + abbreviate_toponym_prob = float(nested_get(cls.config, ('boundaries', 'abbreviate_toponym_probability'))) for component, val in six.iteritems(address_components): if component not in AddressFormatter.BOUNDARY_COMPONENTS: continue if component == AddressFormatter.STATE: - val = self.abbreviated_state(val, country, language) + val = cls.abbreviated_state(val, country, language) else: val = abbreviate(toponym_abbreviations_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob) if hyphenation: - val = self.name_hyphens(val) + val = cls.name_hyphens(val) address_components[component] = val def add_city_and_equivalent_points(self, grouped_components, containing_components, country, latitude, longitude): @@ -1322,29 +1338,31 @@ class AddressComponents(object): if country == Countries.JAPAN and (language_suffix.endswith(JAPANESE_ROMAJI) or non_local_language == ENGLISH): self.format_japanese_neighborhood_romaji(address_components) - def generate_sub_building_component(self, component, address_components, language, country=None, **kw): + @classmethod + def generate_sub_building_component(cls, component, address_components, language, country=None, **kw): existing = address_components.get(component, None) if existing is None: - generated_type = self.generated_type(component, address_components, language, country=country) + generated_type = cls.generated_type(component, address_components, language, country=country) return generated_type return None - def add_sub_building_phrase(self, component, phrase_type, address_components, generated, language, country, **kw): - if not generated and not phrase_type != self.STANDALONE_PHRASE: + @classmethod + def add_sub_building_phrase(cls, component, phrase_type, address_components, generated, language, country, **kw): + if not generated and not phrase_type != cls.STANDALONE_PHRASE: return - component_class = self.sub_building_component_class_map[component] + component_class = cls.sub_building_component_class_map[component] - if generated or phrase_type == self.STANDALONE_PHRASE: + if generated or phrase_type == cls.STANDALONE_PHRASE: phrase = component_class.phrase(generated, language, country=country, **kw) if phrase: address_components[component] = phrase elif component in address_components: existing = address_components[component] - phrase = self.get_component_phrase(component_class, existing, language, country=country) + phrase = cls.get_component_phrase(component_class, existing, language, country=country) if phrase and phrase != existing: address_components[component] = phrase elif not phrase: @@ -1432,7 +1450,8 @@ class AddressComponents(object): if replacement != name and not replacement.isdigit(): address_components[component] = replacement - def replace_names(self, address_components): + @classmethod + def replace_names(cls, address_components): ''' Name replacements ----------------- @@ -1441,14 +1460,15 @@ class AddressComponents(object): ''' for component, value in address_components.iteritems(): - replacement = nested_get(self.config, ('value_replacements', component, value), default=None) + replacement = nested_get(cls.config, ('value_replacements', component, value), default=None) if replacement is not None: new_value = repl['replacement'] prob = repl['probability'] if random.random() < prob: address_components[component] = new_value - def remove_numeric_boundary_names(self, address_components): + @classmethod + def remove_numeric_boundary_names(cls, address_components): ''' Numeric boundary name cleanup ----------------------------- @@ -1461,13 +1481,14 @@ class AddressComponents(object): not be simply listed as "1" and people expected to understand. ''' for component in list(address_components): - if component not in self.BOUNDARY_COMPONENTS or component == AddressFormatter.POSTCODE: + if component not in cls.BOUNDARY_COMPONENTS or component == AddressFormatter.POSTCODE: continue value = address_components[component] if value.isdigit(): address_components.pop(component) - def cleanup_boundary_names(self, address_components): + @classmethod + def cleanup_boundary_names(cls, address_components): ''' Boundary name cleanup --------------------- @@ -1475,12 +1496,13 @@ class AddressComponents(object): Cleanup things like addr:city=Rockport, ''' for component in list(address_components): - if component not in self.BOUNDARY_COMPONENTS: + if component not in cls.BOUNDARY_COMPONENTS: continue address_components[component] = address_components[component].strip(six.u(', ')) - def prune_duplicate_names(self, address_components): + @classmethod + def prune_duplicate_names(cls, address_components): ''' Name deduping ------------- @@ -1522,7 +1544,8 @@ class AddressComponents(object): name = name.split(six.u(','), 1)[0].strip() return name - def cleanup_house_number(self, address_components): + @classmethod + def cleanup_house_number(cls, address_components): ''' House number cleanup -------------------- @@ -1567,12 +1590,14 @@ class AddressComponents(object): invalid_street_regex = re.compile('^\s*(?:none|null|not applicable|n\s*/\s*a)\s*$', re.I) - def street_name_is_valid(self, street): - return street is not None and not (self.invalid_street_regex.match(street) or not any((c.isalnum() for c in street))) + @classmethod + def street_name_is_valid(cls, street): + return street is not None and not (cls.invalid_street_regex.match(street) or not any((c.isalnum() for c in street))) - def cleanup_street(self, address_components): + @classmethod + def cleanup_street(cls, address_components): street = address_components.get(AddressFormatter.ROAD) - if street is not None and not self.street_name_is_valid(street): + if street is not None and not cls.street_name_is_valid(street): address_components.pop(AddressFormatter.ROAD) newline_regex = re.compile('[\n]+') @@ -1593,7 +1618,8 @@ class AddressComponents(object): name = cls.newline_regex.sub(six.u(' '), name) return cls.name_regex.match(name).group(1) - def name_hyphens(self, name, hyphenate_multiword_probability=None, remove_hyphen_probability=None): + @classmethod + def name_hyphens(cls, name, hyphenate_multiword_probability=None, remove_hyphen_probability=None): ''' Hyphenated names ---------------- @@ -1602,18 +1628,18 @@ class AddressComponents(object): replace spaces with hyphens. ''' if hyphenate_multiword_probability is None: - hyphenate_multiword_probability = float(nested_get(self.config, ('places', 'hyphenate_multiword_probability'))) + hyphenate_multiword_probability = float(nested_get(cls.config, ('places', 'hyphenate_multiword_probability'))) if remove_hyphen_probability is None: - remove_hyphen_probability = float(nested_get(self.config, ('places', 'remove_hyphen_probability'))) + remove_hyphen_probability = float(nested_get(cls.config, ('places', 'remove_hyphen_probability'))) # Clean string of trailing space/hyphens, the above regex will match any string - name = self.strip_whitespace_and_hyphens(name) + name = cls.strip_whitespace_and_hyphens(name) - if self.hyphen_regex.search(name) and random.random() < remove_hyphen_probability: - return self.dehyphenate_multiword_name(name) - elif self.whitespace_regex.search(name) and random.random() < hyphenate_multiword_probability: - return self.hyphenate_multiword_name(name) + if cls.hyphen_regex.search(name) and random.random() < remove_hyphen_probability: + return cls.dehyphenate_multiword_name(name) + elif cls.whitespace_regex.search(name) and random.random() < hyphenate_multiword_probability: + return cls.hyphenate_multiword_name(name) return name @classmethod @@ -1647,30 +1673,34 @@ class AddressComponents(object): return names - def country_specific_cleanup(self, address_components, country): - if country in self.central_european_city_district_regexes: - self.format_central_european_city_district(country, address_components) + @classmethod + def country_specific_cleanup(cls, address_components, country): + if country in cls.central_european_city_district_regexes: + cls.format_central_european_city_district(country, address_components) - if country == self.IRELAND: - self.format_dublin_postal_district(address_components) - elif country == self.JAMAICA: - self.format_kingston_postcode(address_components) + if country == Countries.IRELAND: + cls.format_dublin_postal_district(address_components) + elif country == Countries.JAMAICA: + cls.format_kingston_postcode(address_components) - def add_house_number_phrase(self, address_components, language, country=None): + @classmethod + def add_house_number_phrase(cls, address_components, language, country=None): house_number = address_components.get(AddressFormatter.HOUSE_NUMBER, None) - if not is_numeric(house_number) and (not house_number or house_number.lower() not in self.latin_alphabet_lower): + if not is_numeric(house_number) and (not house_number or house_number.lower() not in cls.latin_alphabet_lower): return phrase = HouseNumber.phrase(house_number, language, country=country) if phrase and phrase != house_number: address_components[AddressFormatter.HOUSE_NUMBER] = phrase - def add_metro_station_phrase(self, address_components, language, country=None): + @classmethod + def add_metro_station_phrase(cls, address_components, language, country=None): metro_station = address_components.get(AddressFormatter.METRO_STATION, None) phrase = MetroStation.phrase(metro_station, language, country=country) if phrase and phrase != metro_station: address_components[AddressFormatter.METRO_STATION] = phrase - def add_postcode_phrase(self, address_components, language, country=None): + @classmethod + def add_postcode_phrase(cls, address_components, language, country=None): postcode = address_components.get(AddressFormatter.POSTCODE, None) if postcode: phrase = PostCode.phrase(postcode, language, country=country) @@ -1714,8 +1744,9 @@ class AddressComponents(object): address_components.pop(c) component_bitset ^= ComponentDependencies.component_bit_values[c] - def po_box_address(self, address_components, language, country=None): - po_box_config = self.config['po_box'] + @classmethod + def po_box_address(cls, address_components, language, country=None): + po_box_config = cls.config['po_box'] po_box_probability = float(po_box_config['probability']) if random.random() < po_box_probability: address_components = address_components.copy() @@ -1730,22 +1761,23 @@ class AddressComponents(object): drop_address_probability = po_box_config['drop_address_probability'] if random.random() < drop_address_probability: - address_components = self.drop_address(address_components) + address_components = cls.drop_address(address_components) drop_places_probability = po_box_config['drop_places_probability'] if random.random() < drop_places_probability: - address_components = self.drop_places(address_components) - address_components = self.drop_localities(address_components) + address_components = cls.drop_places(address_components) + address_components = cls.drop_localities(address_components) drop_postcode_probability = po_box_config['drop_postcode_probability'] if random.random() < drop_postcode_probability: - address_components = self.drop_postcode(address_components) + address_components = cls.drop_postcode(address_components) return address_components else: return None - def dropout_places(self, address_components, osm_components, country, language, population=None, population_from_city=False): + @classmethod + def dropout_places(cls, address_components, osm_components, country, language, population=None, population_from_city=False): # Population of the city helps us determine if the city can be used # on its own like "Seattle" or "New York" vs. smaller cities like # have to be qualified with a state, country, etc. @@ -1753,11 +1785,11 @@ class AddressComponents(object): if population is None and population_from_city: population = 0 - tagged = self.categorized_osm_components(country, osm_components) + tagged = cls.categorized_osm_components(country, osm_components) for props, component in (tagged or []): if component == AddressFormatter.CITY: - if self.unambiguous_wikipedia(props, language): + if cls.unambiguous_wikipedia(props, language): unambiguous_city = True if 'population' in props: @@ -1770,8 +1802,9 @@ class AddressComponents(object): address_components = place_config.dropout_components(address_components, osm_components, country=country, population=population, unambiguous_city=unambiguous_city) return address_components - def dropout_address_level_component(self, address_components, component): - probability = self.address_level_dropout_probabilities.get(component, None) + @classmethod + def dropout_address_level_component(cls, address_components, component): + probability = cls.address_level_dropout_probabilities.get(component, None) if probability is not None and random.random() < probability: address_components.pop(component) return True