From b1b797171c728b20230f84630313f99a9b66f47e Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 22 Jul 2016 14:52:14 -0400
Subject: [PATCH] =?UTF-8?q?[osm]=20Combining=20addr:block=5Fnumber=20and?=
 =?UTF-8?q?=20addr:housenumber=20in=20Japan=20(randomly=20adds=20phrases?=
 =?UTF-8?q?=20for=20the=20=E7=95=AA=E5=8F=B7/bango=20system)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 resources/parser/data_sets/osm.yaml |  5 +++
 scripts/geodata/osm/formatter.py    | 63 +++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/resources/parser/data_sets/osm.yaml b/resources/parser/data_sets/osm.yaml
index 3355aaaf..2be766d1 100644
--- a/resources/parser/data_sets/osm.yaml
+++ b/resources/parser/data_sets/osm.yaml
@@ -24,3 +24,8 @@ venues:
     abbreviate_probability: 0.3
     separate_probability: 0.0
 
+countries:
+    jp:
+        # Always do this as there are plenty of examples of block numbers without house number
+        combine_block_house_number_probability: 1.0
+        block_phrase_probability: 0.4
\ No newline at end of file
diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py
index 3265cc97..9fb4804d 100644
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -47,6 +47,10 @@ INTERSECTIONS_TAGGED_FILENAME = 'intersections_tagged.tsv'
 
 ALL_LANGUAGES = 'all'
 
+JAPAN = 'jp'
+JAPANESE = 'ja'
+JAPANESE_ROMAJI = 'ja_rm'
+
 
 class OSMAddressFormatter(object):
     aliases = Aliases(
@@ -240,6 +244,54 @@ class OSMAddressFormatter(object):
             return True
         return False
 
+    def combine_japanese_house_number(self, address_components, language):
+        '''
+        Japanese house numbers
+        ----------------------
+
+        Addresses in Japan are pretty unique. There are no street names in
+        most of the country, and so buildings are addressed by the following:
+
+        1. the neighborhood (丁目 or chōme), usually numeric e.g. 4-chōme
+        2. the block number (OSM uses addr:block_number for this)
+        3. the house number
+
+        Sometimes only the block number and house number are abbreviated.
+
+        For libpostal, we want to parse:
+        2丁目3-5 as {'suburb': '2丁目', 'house_number': '3-5'}
+
+        and the abbreviated "2-3-5" as simply house_number and leave
+        it up to the end user to split up that number or not.
+
+        At this stage we're still working with the original OSM tags,
+        so only combine addr:block_number with addr:housenumber. Both tags
+        must be all digits; address_components is updated in place (no return).
+
+        See: https://en.wikipedia.org/wiki/Japanese_addressing_system
+        '''
+        house_number = address_components.get('addr:housenumber')
+        if not house_number or not house_number.isdigit():
+            return
+
+        block = address_components.get('addr:block_number')
+        if not block or not block.isdigit():
+            return
+
+        separator = six.u('-')  # default: plain hyphenated form, e.g. "3-5"
+
+        combine_probability = float(nested_get(self.config, ('countries', 'jp', 'combine_block_house_number_probability'), default=0.0))
+        if random.random() < combine_probability:
+            if random.random() < float(nested_get(self.config, ('countries', 'jp', 'block_phrase_probability'), default=0.0)):
+                block = Block.phrase(block, language)  # same (number, language) order as HouseNumber.phrase
+                house_number = HouseNumber.phrase(house_number, language)
+                if block is None or house_number is None:
+                    return  # no phrase produced; leave the original tags untouched
+                separator = six.u(' ') if language == JAPANESE_ROMAJI else six.u('')
+
+            house_number = separator.join([block, house_number])
+            address_components['addr:housenumber'] = house_number
+
     def venue_names(self, props, languages):
         '''
         Venue names
@@ -382,11 +434,14 @@ class OSMAddressFormatter(object):
 
         combined_street = self.combine_street_name(tags)
 
-        country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
-        if not (country and candidate_languages):
-            return None, None, None
-
         namespaced_language = self.namespaced_language(tags, candidate_languages)
+        language = None
+
+        if country == JAPAN:
+            language = JAPANESE
+            if random.random() < float(nested_get(self.config, ('countries', 'jp', 'romaji_probability'), default=0.0)):
+                language = JAPANESE_ROMAJI
+            self.combine_japanese_house_number(tags, language)
 
         revised_tags = self.normalize_address_components(tags)