[addresses] using normalized tokens when stripping off compound place names for things like D.C.

Al
2016-12-09 17:50:08 -05:00
parent c0a468d7e8
commit 675552d254

@@ -358,7 +358,9 @@ class AddressComponents(object):
         phrase_filter = PhraseFilter([(n.lower(), '') for n in names])
-        tokens = tokenize(name)
-        tokens_lower = [(t.lower(), c) for t, c in tokens]
+        tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
+                                         token_options=TOKEN_OPTIONS_DROP_PERIODS)
         phrases = list(phrase_filter.filter(tokens_lower))
         num_phrases = 0
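
For context on why the change matters: the phrase filter matches lowercased token sequences against dictionary entries, so a token like "D.C." only lines up with an entry stored as "dc" if periods are dropped during normalization, not just lowercased. The snippet below is a minimal, self-contained sketch of that idea; it does not call libpostal's actual normalized_tokens, and the helper name and phrase set are hypothetical.

    import re

    # Hypothetical stand-in for normalized tokenization with lowercasing and
    # period-dropping. Real libpostal normalization handles much more
    # (Unicode, abbreviations, hyphenation, etc.).
    def normalize_tokens(name, lowercase=True, drop_periods=True):
        tokens = re.findall(r"[\w.]+", name, re.UNICODE)
        out = []
        for tok in tokens:
            if lowercase:
                tok = tok.lower()
            if drop_periods:
                tok = tok.replace('.', '')
            out.append(tok)
        return out

    # Hypothetical phrase dictionary keyed on normalized forms
    place_phrases = {('washington', 'dc')}

    tokens = normalize_tokens('Washington D.C.')
    print(tokens)                          # ['washington', 'dc']
    print(tuple(tokens) in place_phrases)  # True: "D.C." now matches "dc"

With only lowercasing (the old tokens_lower), the second token would remain "d.c." and the phrase lookup would miss, which is why the diff switches to normalized tokens with TOKEN_OPTIONS_DROP_PERIODS.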