From 1df1b60a9fac94414d17a05a928d5dbdf0a3f9b7 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 18 Nov 2016 23:35:44 -0500
Subject: [PATCH] [phrases] adding extract_phrases method to gazetteers, which
 returns a set of gazetteer phrases found in a given string

---
 .../geodata/address_expansions/gazetteers.py  | 71 +++++++++++++++----
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py
index 493d817e..ea828155 100644
--- a/scripts/geodata/address_expansions/gazetteers.py
+++ b/scripts/geodata/address_expansions/gazetteers.py
@@ -1,6 +1,5 @@
 import os
-import sys
-import ujson as json
+import six
 
 from collections import defaultdict, OrderedDict
 
@@ -119,6 +118,40 @@ class DictionaryPhraseFilter(PhraseFilter):
                 c = token_types.PHRASE
             yield t, c, len(t), map(safe_decode, data)
 
+    def gen_phrases(self, s, canonical_only=False, languages=None):
+        """Yield each gazetteer phrase found in s (word tokens lowercased, tokens space-joined).
+
+        canonical_only / languages (one code or a collection of codes) filter dictionary
+        entries; when filtering, a phrase is yielded once per entry that passes."""
+        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokenize(s)]
+        if not languages:
+            languages = None
+        elif isinstance(languages, (six.text_type, six.binary_type)) or not hasattr(languages, '__contains__'):
+            # Strings define __contains__ too; wrap a bare code so `in` is not a substring test.
+            languages = set([languages])
+        for t, c, length, data in self.filter(norm_tokens):
+            if c == token_types.PHRASE:
+                phrase = six.u(' ').join([t_i for t_i, c_i in t])
+                if not canonical_only and languages is None:
+                    yield phrase
+                else:
+                    for d in data:
+                        lang, dictionary, is_canonical, canonical = d.split(six.u('|'))  # data items are passed through safe_decode above, so split on text
+                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages):
+                            yield phrase
+
+    def string_contains_phrases(self, s, canonical_only=False, languages=None):
+        """Return True if s contains at least one gazetteer phrase passing the filters."""
+        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
+        # Builtin next() works on Python 2.6+ and 3 (generators lost .next() in 3);
+        # the sentinel default replaces the StopIteration handler.
+        missing = object()
+        return next(phrases, missing) is not missing
+
+    def extract_phrases(self, s, canonical_only=False, languages=None):  # returns the distinct phrases found in s
+        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))  # set() collapses repeated yields
+
+
 STREET_TYPES_DICTIONARIES = ('street_types',
                              'directionals',
                              'concatenated_suffixes_separable',
@@ -138,18 +171,18 @@ CHAIN_DICTIONARY = 'chains'
 
 SYNONYM_DICTIONARY = 'synonyms'
 
-NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
-                     SURNAME_DICTIONARY,)
+PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,  # previously exported as NAME_DICTIONARIES
+                              SURNAME_DICTIONARY,)
 
 
-NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
-                                                              'building_types',
-                                                              'company_types',
-                                                              'place_names',
-                                                              'qualifiers',
-                                                              'synonyms',
-                                                              'toponyms',
-                                                              )
+NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',  # previously NAME_ABBREVIATION_DICTIONARIES; loaded into names_gazetteer
+                                                 'building_types',
+                                                 'company_types',
+                                                 'place_names',
+                                                 'qualifiers',
+                                                 'synonyms',
+                                                 'toponyms',
+                                                 )
 
 QUALIFIERS_DICTIONARY = 'qualifiers'
 
@@ -177,9 +210,18 @@ UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
                                   'unit_types_standalone',
                                   )
 
+VENUE_NAME_DICTIONARIES = ('academic_degrees',  # dictionaries loaded into venue_name_gazetteer
+                           'building_types',
+                           'company_types',
+                           'organizations',
+                           'people',
+                           'personal_suffixes',
+                           'personal_titles',
+                           'place_names',
+                           )
 
 ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
-    NAME_ABBREVIATION_DICTIONARIES + \
+    NAME_DICTIONARIES + \
     UNIT_ABBREVIATION_DICTIONARIES + \
     ('no_number', 'nulls',)
 
@@ -195,7 +237,7 @@ def create_gazetteer(*dictionaries):
 
 street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
 qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
-names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
+names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
 chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
 unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
 street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
@@ -203,3 +245,4 @@ abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
 toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
 toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
 given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
+venue_name_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)