From f00e425891b13204984040e9ca66de428a5e674f Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Wed, 4 May 2016 02:44:51 -0400
Subject: [PATCH] [osm] Adding parse_osm_number_range for addr:flats and
 addr:unit

---
 scripts/geodata/osm/extract.py | 58 +++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/scripts/geodata/osm/extract.py b/scripts/geodata/osm/extract.py
index 67d66398..7c470829 100644
--- a/scripts/geodata/osm/extract.py
+++ b/scripts/geodata/osm/extract.py
@@ -8,6 +8,7 @@ from .osm XML files.
 
 import os
 import re
+import six
 import sys
 import urllib
 import ujson as json
@@ -22,7 +23,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.address_formatting.formatter import AddressFormatter
 from geodata.csv_utils import unicode_csv_reader
-
+from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
 
 OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                   'resources', 'boundaries', 'osm')
@@ -127,6 +128,58 @@ def osm_wikipedia_title_and_language(key, value):
     return normalize_wikipedia_title(value), language
 
 
+non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
+simple_number = six.u('(?:{})?[0-9]+(?:\.[0-9]+)?').format(non_breaking_dash)
+simple_number_regex = re.compile(simple_number, re.UNICODE)
+# Every pattern is built as unicode text: Python 2's re does not interpret "\uXXXX" escapes inside a pattern
+non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
+number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
+letter_range_regex = re.compile(six.u(r'([^\W\d_]){}([^\W\d_])').format(non_breaking_dash), re.UNICODE)
+
+
+def parse_osm_number_range(value):
+    '''
+    Expand an OSM addr:flats / addr:unit style value into a list of strings.
+
+    The normalized value is split on ";" and each part is handled on its own:
+    an ascending integer range ("1-3") or letter range ("a-c") is enumerated,
+    capped at 101 entries; a descending, equal or non-integer range yields only
+    its two endpoints; anything else is split on dash characters.
+    '''
+    value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
+    numbers = []
+    for val in value.split(six.u(';')):
+        val = val.strip()
+        match = number_range_regex.match(val)
+        letter_match = None if match else letter_range_regex.match(val)
+        if match:
+            start, end = match.groups()
+            try:
+                # Parse into separate names so the fallbacks emit the matched text, never ints
+                start_num, end_num = int(start), int(end)
+            except (TypeError, ValueError):
+                # e.g. decimal endpoints such as "1.5-2.5" cannot be enumerated
+                numbers.extend([start, end])
+                continue
+            if end_num > start_num:
+                # Cap the expansion at 100 steps past the start of the range
+                end_num = min(end_num, start_num + 100)
+                numbers.extend(safe_decode(i) for i in xrange(start_num, end_num + 1))
+            else:
+                numbers.extend([start, end])
+        elif letter_match:
+            start, end = letter_match.groups()
+            start_num, end_num = ord(start), ord(end)
+            if end_num > start_num:
+                end_num = min(end_num, start_num + 100)
+                numbers.extend(six.unichr(i) for i in xrange(start_num, end_num + 1))
+            else:
+                numbers.extend([six.unichr(start_num), six.unichr(end_num)])
+        else:
+            numbers.extend(non_breaking_dash_regex.split(safe_decode(val)))
+    return numbers
+
+
 class OSMAddressComponents(object):
     '''
     Keeps a map of OSM keys and values to the standard components
@@ -146,9 +199,12 @@ class OSMAddressComponents(object):
             'region': AddressFormatter.STATE,
             'province': AddressFormatter.STATE,
             'county': AddressFormatter.STATE_DISTRICT,
+            'island': AddressFormatter.ISLAND,
+            'islet': AddressFormatter.ISLAND,
             'municipality': AddressFormatter.CITY,
             'city': AddressFormatter.CITY,
             'town': AddressFormatter.CITY,
+            'township': AddressFormatter.CITY,
             'village': AddressFormatter.CITY,
             'hamlet': AddressFormatter.CITY,
             'borough': AddressFormatter.CITY_DISTRICT,