[addresses] using the name key disttribution in AddressComponents.all_names. Returning names and valid components from the new function instead of the full gazetteer (can be build later)

2016-12-18 17:22:13 -05:00
parent 954b6548bf
commit 2727572822
1 changed files with 19 additions and 17 deletions
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -307,26 +307,33 @@ class AddressComponents(object):
        key = ''.join((raw_key, suffix)) if ':' not in raw_key else raw_key
        return key, raw_key

-    def all_names(self, props, languages, keys=ALL_OSM_NAME_KEYS):
+    def all_names(self, props, languages, component=None, keys=ALL_OSM_NAME_KEYS):
        # Preserve uniqueness and order
-        names = OrderedDict()
+        valid_names, _ = boundary_names.name_key_dist(props, component)
+        names = OrderedDict.fromkeys(valid_names)
+        valid_names = set(valid_names)
+
        for k, v in six.iteritems(props):
-            if k in keys:
+            if ':' in k:
+                if k == 'name:simple' and 'en' in languages:
                    names[v] = None
-            elif ':' in k:
                k, qual = k.split(':', 1)
-                if k in self.ALL_OSM_NAME_KEYS and qual.split('_', 1)[0] in languages:
+                if k in valid_names and qual.split('_', 1)[0] in languages:
                    names[v] = None
        return names.keys()

-    def place_phrase_gazetteer(self, name, osm_components):
+    def place_names_and_components(self, name, osm_components, country=None, languages=None):
        names = set()
+        components = defaultdict(set)

        name_norm = six.u('').join([t for t, c in normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
                                                                    token_options=TOKEN_OPTIONS_DROP_PERIODS, whitespace=True)])
-
        for i, props in enumerate(osm_components):
-            component_names = set([n.lower() for n in self.all_names(props, languages or [])])
+            containing_ids = [(c['type'], c['id']) for c in osm_components[i + 1:] if 'type' in c and 'id' in c]
+
+            component = osm_address_components.component_from_properties(country, props, containing=containing_ids)
+
+            component_names = set([n.lower() for n in self.all_names(props, languages or [] )])

            valid_component_names = set()
            for n in component_names:
@@ -338,10 +345,6 @@ class AddressComponents(object):

                valid_component_names.add(norm)

-            containing_ids = [(c['type'], c['id']) for c in osm_components[i + 1:] if 'type' in c and 'id' in c]
-
-            component = osm_address_components.component_from_properties(country, props, containing=containing_ids)
-
            names |= valid_component_names

            is_state = False
@@ -360,8 +363,7 @@ class AddressComponents(object):
                        if abbreviations:
                            names.update([a.lower() for a in abbreviations])

-        phrase_filter = PhraseFilter([(n, '') for n in names])
-        return phrase_filter
+        return names, components

    def normalized_place_name(self, name, tag, osm_components, country=None, languages=None):
        '''
@@ -375,9 +377,9 @@ class AddressComponents(object):
        tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
                                         token_options=TOKEN_OPTIONS_DROP_PERIODS)

-        phrase_filter = self.place_phrase_gazetteer(name, osm_component_is_village)
+        names, components = self.place_names_and_components(name, osm_components, country=country, languages=languages)

-        components = defaultdict(set)
+        phrase_filter = PhraseFilter([(n, '') for n in names])

        phrases = list(phrase_filter.filter(tokens_lower))