From 651bc32650aab6f92b8ef1840235da16b2ce8e15 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Thu, 5 May 2016 12:29:56 -0400
Subject: [PATCH] [addresses] more thoroughly solving the addr:city='Harlem'
 issue

---
 scripts/geodata/addresses/components.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index eff434a0..883de4e1 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -200,6 +200,8 @@ class AddressExpander(object):
 
             for k, v in six.iteritems(props):
                 normalized_key = osm_address_components.get_component(country, k, v)
+                if not normalized_key:
+                    continue
                 for cn in component_names:
                     components[cn.lower()].add(normalized_key)
 
@@ -217,17 +219,33 @@ class AddressExpander(object):
 
         num_phrases = 0
         total_tokens = 0
+        current_phrase_start = 0
+        current_phrase_len = 0
+        current_phrase = []
+
         for is_phrase, phrase_tokens, value in phrases:
             if is_phrase:
                 join_phrase = six.u(' ') if whitespace else six.u('')
+
                 if num_phrases > 0:
+                    current_phrase_tokens = tokens_lower[current_phrase_start:current_phrase_start + current_phrase_len]
+
+                    current_phrase = join_phrase.join([t for t, c in current_phrase_tokens])
+                    # Reject when the previous phrase has known tags but not this one, e.g. addr:city="Harlem" when Harlem is a neighborhood
+                    tags = components.get(current_phrase, set())
+                    if tags and tag not in tags:
+                        return None
+
+                    # Return phrase with original capitalization
                     return join_phrase.join([t for t, c in tokens[:total_tokens]])
                 elif num_phrases == 0 and total_tokens > 0:
                     phrase = join_phrase.join([t for t, c in phrase_tokens])
                     if tag not in components.get(phrase, set()):
                         return None
 
-                current_phrase = tokens[total_tokens:total_tokens + len(phrase_tokens)]
+                current_phrase_start = total_tokens
+                current_phrase_len = len(phrase_tokens)
+
                 total_tokens += len(phrase_tokens)
                 num_phrases += 1
             else: