[languages] using country_and_languages method in OSM, neighborhoods and OpenAddresses

2016-10-05 02:49:55 -04:00
parent 98a8d898a1
commit faf418decb
7 changed files with 95 additions and 88 deletions
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -7,7 +7,7 @@ import six
 import sys
 import yaml

-from collections import OrderedDict
+from collections import defaultdict, OrderedDict, Counter
 from six import itertools

 this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -109,8 +109,8 @@ class OSMAddressFormatter(object):
            ('is_in:region', AddressFormatter.STATE),
            # Used in Tunisia
            ('addr:governorate', AddressFormatter.STATE),
-            ('addr:postal_code', AddressFormatter.POSTCODE),
            ('addr:postcode', AddressFormatter.POSTCODE),
+            ('addr:postal_code', AddressFormatter.POSTCODE),
            ('addr:zipcode', AddressFormatter.POSTCODE),
            ('postal_code', AddressFormatter.POSTCODE),
            ('addr:country', AddressFormatter.COUNTRY),
@@ -138,6 +138,8 @@ class OSMAddressFormatter(object):
            'commercial': AddressComponents.zones.COMMERCIAL,
            'industrial': AddressComponents.zones.INDUSTRIAL,
            'residential': AddressComponents.zones.RESIDENTIAL,
+            'university': AddressComponents.zones.UNIVERSITY,
+            'college': AddressComponents.zones.UNIVERSITY,
        },
        'amenity': {
            'university': AddressComponents.zones.UNIVERSITY,
@@ -147,10 +149,10 @@ class OSMAddressFormatter(object):

    boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)}

-    def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None):
+    def __init__(self, components, country_rtree, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None):
        # Instance of AddressComponents, contains structures for reverse geocoding, etc.
        self.components = components
-        self.language_rtree = components.language_rtree
+        self.country_rtree = country_rtree

        self.subdivisions_rtree = subdivisions_rtree
        self.buildings_rtree = buildings_rtree
@@ -168,7 +170,7 @@ class OSMAddressFormatter(object):
        if len(candidate_languages) > 1:
            street = tags.get('addr:street', None)

-            namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags]
+            namespaced = [l for l, d in candidate_languages if 'addr:street:{}'.format(l) in tags]

            if namespaced and random.random() < pick_namespaced_language_prob:
                language = random.choice(namespaced)
@@ -344,7 +346,6 @@ class OSMAddressFormatter(object):
            return True
        return False

-
    def add_metro_station(self, address_components, latitude, longitude, language=None, default_language=None):
        '''
        Metro stations
@@ -476,19 +477,10 @@ class OSMAddressFormatter(object):
            return (), None

        osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
-        country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
-        if country and candidate_languages:
-            local_languages = [(l['lang'], bool(int(l['default']))) for l in candidate_languages]
-        else:
-            for c in reversed(osm_components):
-                country = c.get('ISO3166-1:alpha2')
-                if country:
-                    country = country.lower()
-                    break
-            else:
-                return (), None

-            local_languages = [(lang, bool(int(default))) for lang, default in get_country_languages(country).iteritems()]
+        country, candidate_languages = OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components)
+
+        local_languages = candidate_languages

        all_local_languages = set([l for l, d in local_languages])
        random_languages = set(INTERNET_LANGUAGE_DISTRIBUTION)
@@ -551,12 +543,17 @@ class OSMAddressFormatter(object):
        # Calculate how many records to produce for this place given its population
        population_divisor = 10000  # Add one record for every 10k in population
        min_references = 5  # Every place gets at least 5 reference to account for variations
+        if component_name == AddressFormatter.CITY:
+            # Cities get a few extra references over e.g. a state_district with the same name
+            # so that if the population is unknown, hopefully the city will have more references
+            # and the parser will prefer that meaning
+            min_references += 2
        max_references = 1000  # Cap the number of references e.g. for India and China country nodes
        num_references = min(population / population_divisor + min_references, max_references)

        cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))

-        for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name'):
+        for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
            if more_than_one_official_language:
                name = tags.get(name_tag)
                language_suffix = ''
@@ -757,7 +754,7 @@ class OSMAddressFormatter(object):
        except Exception:
            return None, None, None

-        country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
+        country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
        if not (country and candidate_languages):
            return None, None, None

@@ -880,7 +877,7 @@ class OSMAddressFormatter(object):
        except Exception:
            return None, None, None

-        country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
+        country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
        if not (country and candidate_languages):
            return None, None, None

@@ -986,8 +983,10 @@ class OSMAddressFormatter(object):
        for node_id, tags, deps in parse_osm(infile):
            tags['type'], tags['id'] = node_id.split(':')
            place_tags, country = self.node_place_tags(tags)
+
            for address_components, language, is_default in place_tags:
                addresses = self.formatted_places(address_components, country, language)
+
                if language is None:
                    language = UNKNOWN_LANGUAGE

@@ -1083,11 +1082,11 @@ class OSMAddressFormatter(object):
            except Exception:
                continue

-            country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
+            country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
            if not (country and candidate_languages):
                continue

-            more_than_one_official_language = sum((1 for l in candidate_languages if int(l['default']))) > 1
+            more_than_one_official_language = sum((1 for l, d in candidate_languages if d)) > 1

            base_name_tag = None
            for t in all_base_name_tags:
@@ -1103,7 +1102,7 @@ class OSMAddressFormatter(object):
                names = defaultdict(list)

                if len(candidate_languages) == 1:
-                    default_language = candidate_languages[0]['lang']
+                    default_language = candidate_languages[0][0]
                elif not more_than_one_official_language:
                    default_language = None
                    name = way['name']
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -12,25 +12,25 @@ plenty of disk space. The following commands can be used in parallel to create
 all the training sets:

 Ways:
-python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

 Venues:
-python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
+python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

 Limited formatted addresses:
-python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --language-rtree-dir=$(LANG_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  -o $(OUT_DIR)
+python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  -o $(OUT_DIR)

 Formatted addresses (tagged):
-python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)

 Formatted addresses (untagged):
-python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm  -f -u --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
+python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm  -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)

 Intersections (after running intersections.py to create the JSON file):
-python osm_address_training_data -x $(OSM_DIR)/intersections.json -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
+python osm_address_training_data.py -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR)  --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)

 Toponyms:
-python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --language-rtree-dir=$(LANG_RTREE_DIR) -o $(OUT_DIR)
+python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR)
 '''

 import argparse
@@ -91,7 +91,7 @@ def normalize_osm_name_tag(tag, script=False):
    return norm.split('_', 1)[0]


-def get_language_names(language_rtree, key, value, tag_prefix='name'):
+def get_language_names(country_rtree, key, value, tag_prefix='name'):
    if not ('lat' in value and 'lon' in value):
        return None, None

@@ -104,7 +104,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
    except Exception:
        return None, None

-    country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude)
+    country, candidate_languages = country_rtree.country_and_languages(latitude, longitude)
    if not (country and candidate_languages):
        return None, None

@@ -177,7 +177,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
    return country, name_language


-def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets=True):
+def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True):
    '''
    Creates a training set for language classification using most OSM ways
    (streets) under a fairly lengthy osmfilter definition which attempts to
@@ -193,7 +193,7 @@ def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS):
-        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
+        country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

@@ -242,7 +242,7 @@ POSTAL_KEYS = (
 )


-def build_toponym_training_data(language_rtree, infile, out_dir):
+def build_toponym_training_data(country_rtree, infile, out_dir):
    '''
    Data set of toponyms by language and country which should assist
    in language classification. OSM tends to use the native language
@@ -268,7 +268,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
        except Exception:
            continue

-        country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude)
+        country, candidate_languages = country_rtree.country_and_languages(latitude, longitude)
        if not (country and candidate_languages):
            continue

@@ -340,7 +340,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
    f.close()


-def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
+def build_address_training_data(country_rtree, infile, out_dir, format=False):
    '''
    Creates training set similar to the ways data but using addr:street tags instead.
    These may be slightly closer to what we'd see in real live addresses, containing
@@ -354,7 +354,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile):
-        country, street_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
+        country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street')
        if not street_language:
            continue

@@ -374,14 +374,14 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
 VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'


-def build_venue_training_data(language_rtree, infile, out_dir):
+def build_venue_training_data(country_rtree, infile, out_dir):
    i = 0

    f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile):
-        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
+        country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

@@ -455,9 +455,9 @@ if __name__ == '__main__':
    parser.add_argument('-x', '--intersections-file',
                        help='Path to planet-ways-latlons.osm')

-    parser.add_argument('--language-rtree-dir',
+    parser.add_argument('--country-rtree-dir',
                        required=True,
-                        help='Language RTree directory')
+                        help='Country RTree directory')

    parser.add_argument('--rtree-dir',
                        default=None,
@@ -493,7 +493,8 @@ if __name__ == '__main__':

    args = parser.parse_args()

-    language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir)
+    country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)
+
    osm_rtree = None
    if args.rtree_dir:
        osm_rtree = OSMReverseGeocoder.load(args.rtree_dir)
@@ -525,11 +526,11 @@ if __name__ == '__main__':

    # Can parallelize
    if args.streets_file:
-        build_ways_training_data(language_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
+        build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
    if args.borders_file:
-        build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
+        build_toponym_training_data(country_rtree, args.borders_file, args.out_dir)
    if args.venues_file:
-        build_venue_training_data(language_rtree, args.venues_file, args.out_dir)
+        build_venue_training_data(country_rtree, args.venues_file, args.out_dir)

    if args.address_file or args.intersections_file:
        if osm_rtree is None:
@@ -542,20 +543,20 @@ if __name__ == '__main__':
            parser.error('--geonames-db required for formatted addresses')

    if args.address_file and args.format:
-        components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index)
+        components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
    if args.address_file and args.limited_addresses:
-        components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
+        components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
        osm_formatter.build_limited_training_data(args.address_file, args.out_dir)

    if args.place_nodes_file and args.format:
-        components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index)
+        components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)

    if args.intersections_file and args.format:
-        components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
-        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index)
+        components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)