[osm/formatting] Adding in cities from Quattroshapes/GeoNames in the case of non-local languages or in general with a small random probability
This commit is contained in:
@@ -349,7 +349,7 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude):
|
|||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, infile, out_dir, tag_components=True):
|
def build_address_format_training_data(infile, out_dir, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, tag_components=True):
|
||||||
'''
|
'''
|
||||||
Creates formatted address training data for supervised sequence labeling (or potentially
|
Creates formatted address training data for supervised sequence labeling (or potentially
|
||||||
for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
|
for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
|
||||||
@@ -585,6 +585,28 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
|||||||
val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
|
val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
|
||||||
address_components[component] = val
|
address_components[component] = val
|
||||||
|
|
||||||
|
'''
|
||||||
|
Quattroshapes/GeoNames cities
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
Quattroshapes isn't great for everything, but it has decent city boundaries
|
||||||
|
in places where OSM sometimes does not (or at least in places where we aren't
|
||||||
|
currently able to create valid polygons). While Quattroshapes itself doesn't
|
||||||
|
reliably use local names, which we'll want for consistency, its GeoNames IDs let us look up the city's name in the address's language
|
||||||
|
'''
|
||||||
|
|
||||||
|
if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2):
|
||||||
|
quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
|
||||||
|
for result in quattroshapes_cities:
|
||||||
|
if result.get(quattroshapes.LEVEL) == AddressFormatter.CITY and quattroshapes.GEONAMES_ID in result:
|
||||||
|
geonames_id = int(result[quattroshapes.GEONAMES_ID])
|
||||||
|
names = geonames_db.get_alternate_names(geonames_id)
|
||||||
|
if not names or language not in names:
|
||||||
|
continue
|
||||||
|
|
||||||
|
city = names[language][0]
|
||||||
|
address_components[AddressFormatter.CITY] = city
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Neighborhoods
|
Neighborhoods
|
||||||
-------------
|
-------------
|
||||||
@@ -970,6 +992,10 @@ if __name__ == '__main__':
|
|||||||
default=None,
|
default=None,
|
||||||
help='Quattroshapes reverse geocoder RTree directory')
|
help='Quattroshapes reverse geocoder RTree directory')
|
||||||
|
|
||||||
|
parser.add_argument('-d', '--geonames-db',
|
||||||
|
required=True,
|
||||||
|
help='GeoNames db file')
|
||||||
|
|
||||||
parser.add_argument('-n', '--neighborhoods-rtree-dir',
|
parser.add_argument('-n', '--neighborhoods-rtree-dir',
|
||||||
default=None,
|
default=None,
|
||||||
help='Neighborhoods reverse geocoder RTree directory')
|
help='Neighborhoods reverse geocoder RTree directory')
|
||||||
@@ -992,6 +1018,15 @@ if __name__ == '__main__':
|
|||||||
if args.neighborhoods_rtree_dir:
|
if args.neighborhoods_rtree_dir:
|
||||||
neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir)
|
neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir)
|
||||||
|
|
||||||
|
quattroshapes_rtree = None
|
||||||
|
if args.quattroshapes_rtree_dir:
|
||||||
|
quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)
|
||||||
|
|
||||||
|
geonames = None
|
||||||
|
|
||||||
|
if args.geonames_db:
|
||||||
|
geonames = GeoNamesDB(args.geonames_db)
|
||||||
|
|
||||||
street_types_gazetteer.configure()
|
street_types_gazetteer.configure()
|
||||||
|
|
||||||
# Can parallelize
|
# Can parallelize
|
||||||
@@ -1002,11 +1037,18 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
if args.address_file and not args.format_only and not args.limited_addresses:
|
if args.address_file and not args.format_only and not args.limited_addresses:
|
||||||
build_address_training_data(language_rtree, args.address_file, args.out_dir)
|
build_address_training_data(language_rtree, args.address_file, args.out_dir)
|
||||||
elif args.address_file and not args.limited_addresses and osm_rtree is None:
|
elif args.address_file and not args.limited_addresses:
|
||||||
parser.error('--rtree-dir required for formatted addresses')
|
if osm_rtree is None:
|
||||||
|
parser.error('--rtree-dir required for formatted addresses')
|
||||||
|
elif neighborhoods_rtree is None:
|
||||||
|
parser.error('--neighborhoods-rtree-dir required for formatted addresses')
|
||||||
|
elif quattroshapes_rtree is None:
|
||||||
|
parser.error('--quattroshapes-rtree-dir required for formatted addresses')
|
||||||
|
elif geonames is None:
|
||||||
|
parser.error('--geonames-db required for formatted addresses')
|
||||||
|
|
||||||
if args.address_file and args.format_only:
|
if args.address_file and args.format_only:
|
||||||
build_address_format_training_data(osm_rtree, language_rtree, neighborhoods_rtree, args.address_file, args.out_dir, tag_components=not args.untagged)
|
build_address_format_training_data(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, args.address_file, args.out_dir, tag_components=not args.untagged)
|
||||||
if args.address_file and args.limited_addresses:
|
if args.address_file and args.limited_addresses:
|
||||||
build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
|
build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
|
||||||
if args.venues_file:
|
if args.venues_file:
|
||||||
|
|||||||
Reference in New Issue
Block a user