[addresses] using the name key disttribution in AddressComponents.all_names. Returning names and valid components from the new function instead of the full gazetteer (can be build later)

This commit is contained in:
Al
2016-12-18 17:22:13 -05:00
parent 954b6548bf
commit 2727572822

View File

@@ -307,26 +307,33 @@ class AddressComponents(object):
key = ''.join((raw_key, suffix)) if ':' not in raw_key else raw_key
return key, raw_key
def all_names(self, props, languages, keys=ALL_OSM_NAME_KEYS):
def all_names(self, props, languages, component=None, keys=ALL_OSM_NAME_KEYS):
# Preserve uniqueness and order
names = OrderedDict()
valid_names, _ = boundary_names.name_key_dist(props, component)
names = OrderedDict.fromkeys(valid_names)
valid_names = set(valid_names)
for k, v in six.iteritems(props):
if k in keys:
if ':' in k:
if k == 'name:simple' and 'en' in languages:
names[v] = None
elif ':' in k:
k, qual = k.split(':', 1)
if k in self.ALL_OSM_NAME_KEYS and qual.split('_', 1)[0] in languages:
if k in valid_names and qual.split('_', 1)[0] in languages:
names[v] = None
return names.keys()
def place_phrase_gazetteer(self, name, osm_components):
def place_names_and_components(self, name, osm_components, country=None, languages=None):
names = set()
components = defaultdict(set)
name_norm = six.u('').join([t for t, c in normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
token_options=TOKEN_OPTIONS_DROP_PERIODS, whitespace=True)])
for i, props in enumerate(osm_components):
component_names = set([n.lower() for n in self.all_names(props, languages or [])])
containing_ids = [(c['type'], c['id']) for c in osm_components[i + 1:] if 'type' in c and 'id' in c]
component = osm_address_components.component_from_properties(country, props, containing=containing_ids)
component_names = set([n.lower() for n in self.all_names(props, languages or [] )])
valid_component_names = set()
for n in component_names:
@@ -338,10 +345,6 @@ class AddressComponents(object):
valid_component_names.add(norm)
containing_ids = [(c['type'], c['id']) for c in osm_components[i + 1:] if 'type' in c and 'id' in c]
component = osm_address_components.component_from_properties(country, props, containing=containing_ids)
names |= valid_component_names
is_state = False
@@ -360,8 +363,7 @@ class AddressComponents(object):
if abbreviations:
names.update([a.lower() for a in abbreviations])
phrase_filter = PhraseFilter([(n, '') for n in names])
return phrase_filter
return names, components
def normalized_place_name(self, name, tag, osm_components, country=None, languages=None):
'''
@@ -375,9 +377,9 @@ class AddressComponents(object):
tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
token_options=TOKEN_OPTIONS_DROP_PERIODS)
phrase_filter = self.place_phrase_gazetteer(name, osm_component_is_village)
names, components = self.place_names_and_components(name, osm_components, country=country, languages=languages)
components = defaultdict(set)
phrase_filter = PhraseFilter([(n, '') for n in names])
phrases = list(phrase_filter.filter(tokens_lower))