From 675552d2543fb338d5631c21d2a7afcb16d62d54 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 9 Dec 2016 17:50:08 -0500
Subject: [PATCH] [addresses] using normalized tokens when stripping off
 compound place names for things like D.C.

---
 scripts/geodata/addresses/components.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 56db8fb9..c3942aff 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -358,7 +358,9 @@ class AddressComponents(object):
         phrase_filter = PhraseFilter([(n.lower(), '') for n in names])
 
         tokens = tokenize(name)
-        tokens_lower = [(t.lower(), c) for t, c in tokens]
+        tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
+                                         token_options=TOKEN_OPTIONS_DROP_PERIODS)
+
         phrases = list(phrase_filter.filter(tokens_lower))
         num_phrases = 0
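
Note for reviewers: a rough sketch of why the change matters. This is not
libpostal's normalized_tokens implementation (which, judging from the
expression it replaces, returns (token, token_class) pairs); the helpers
below are hypothetical and only illustrate the behavior the patch relies on:
plain lowercasing keeps the periods in "D.C.", so it can never match a
phrase-filter entry stored without them, while period-dropping
normalization can.

    # Illustrative only -- not libpostal's normalized_tokens API.
    def lower_only(tokens):
        # Old behavior: lowercase but keep periods.
        return [t.lower() for t in tokens]

    def lower_and_drop_periods(tokens):
        # Approximates the new behavior: lowercase and strip periods.
        return [t.lower().replace('.', '') for t in tokens]

    tokens = ['Washington', 'D.C.']
    print(lower_only(tokens))              # ['washington', 'd.c.']
    print(lower_and_drop_periods(tokens))  # ['washington', 'dc']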