[osm] Combining addr:block_number and addr:housenumber in Japan (randomly adds phrases for the 番号/bango system)
This commit is contained in:
@@ -24,3 +24,8 @@ venues:
|
|||||||
abbreviate_probability: 0.3
|
abbreviate_probability: 0.3
|
||||||
separate_probability: 0.0
|
separate_probability: 0.0
|
||||||
|
|
||||||
|
countries:
|
||||||
|
jp:
|
||||||
|
# Always do this as there are plenty of examples of block numbers without house number
|
||||||
|
combine_block_house_number_probability: 1.0
|
||||||
|
block_phrase_probability: 0.4
|
||||||
@@ -47,6 +47,10 @@ INTERSECTIONS_TAGGED_FILENAME = 'intersections_tagged.tsv'
|
|||||||
|
|
||||||
ALL_LANGUAGES = 'all'
|
ALL_LANGUAGES = 'all'
|
||||||
|
|
||||||
|
JAPAN = 'jp'
|
||||||
|
JAPANESE = 'ja'
|
||||||
|
JAPANESE_ROMAJI = 'ja_rm'
|
||||||
|
|
||||||
|
|
||||||
class OSMAddressFormatter(object):
|
class OSMAddressFormatter(object):
|
||||||
aliases = Aliases(
|
aliases = Aliases(
|
||||||
@@ -240,6 +244,54 @@ class OSMAddressFormatter(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def combine_japanese_house_number(self, address_components, language):
    '''
    Japanese house numbers
    ----------------------

    Addresses in Japan are pretty unique.
    There are no street names in most of the country, and so buildings
    are addressed by the following:

    1. the neighborhood (丁目 or chōme), usually numeric e.g. 4-chōme
    2. the block number (OSM uses addr:block_number for this)
    3. the house number

    Sometimes only the block number and house number are abbreviated.

    For libpostal, we want to parse:
    2丁目3-5 as {'suburb': '2丁目', 'house_number': '3-5'}

    and the abbreviated "2-3-5" as simply house_number and leave
    it up to the end user to split up that number or not.

    At this stage we're still working with the original OSM tags,
    so only combine addr:block_number with addr:housenumber

    See: https://en.wikipedia.org/wiki/Japanese_addressing_system

    :param address_components: dict of raw OSM tags. When a combination
        is made, its 'addr:housenumber' value is overwritten in place.
    :param language: language code handed to the phrase generators and
        compared against JAPANESE_ROMAJI to choose the separator.
    :return: None in every case; the result is the in-place mutation.
    '''
    house_number = address_components.get('addr:housenumber')
    # Only purely numeric values are combined; anything else is left alone.
    if not house_number or not house_number.isdigit():
        return

    block = address_components.get('addr:block_number')
    if not block or not block.isdigit():
        return

    # Default (abbreviated) form: "<block>-<house number>"
    separator = six.u('-')

    combine_probability = float(nested_get(self.config, ('countries', 'jp', 'combine_block_house_number_probability'), default=0.0))
    if random.random() >= combine_probability:
        return

    block_phrase_probability = float(nested_get(self.config, ('countries', 'jp', 'block_phrase_probability'), default=0.0))
    if random.random() < block_phrase_probability:
        # Pass the block number itself, value first and language second,
        # mirroring the HouseNumber.phrase call below.
        block = Block.phrase(block, language)
        house_number = HouseNumber.phrase(house_number, language)
        # If either phrase could not be generated, leave the tags untouched.
        if block is None or house_number is None:
            return
        # Phrases are space-separated in romaji, directly adjacent otherwise.
        separator = six.u(' ') if language == JAPANESE_ROMAJI else six.u('')

    address_components['addr:housenumber'] = separator.join([block, house_number])
|
||||||
|
|
||||||
def venue_names(self, props, languages):
|
def venue_names(self, props, languages):
|
||||||
'''
|
'''
|
||||||
Venue names
|
Venue names
|
||||||
@@ -382,11 +434,14 @@ class OSMAddressFormatter(object):
|
|||||||
|
|
||||||
combined_street = self.combine_street_name(tags)
|
combined_street = self.combine_street_name(tags)
|
||||||
|
|
||||||
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
|
|
||||||
if not (country and candidate_languages):
|
|
||||||
return None, None, None
|
|
||||||
|
|
||||||
namespaced_language = self.namespaced_language(tags, candidate_languages)
|
namespaced_language = self.namespaced_language(tags, candidate_languages)
|
||||||
|
language = None
|
||||||
|
|
||||||
|
if country == JAPAN:
|
||||||
|
language = JAPANESE
|
||||||
|
if random.random() < float(nested_get(self.config, ('countries', 'jp', 'romaji_probability'), default=0.0)):
|
||||||
|
language = JAPANESE_ROMAJI
|
||||||
|
self.combine_japanese_house_number(tags, language)
|
||||||
|
|
||||||
revised_tags = self.normalize_address_components(tags)
|
revised_tags = self.normalize_address_components(tags)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user