From 1df1b60a9fac94414d17a05a928d5dbdf0a3f9b7 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 18 Nov 2016 23:35:44 -0500 Subject: [PATCH] [phrases] adding extract_phrases method to gazetteers, which returns a set of gazetteer phrases found in a given string --- .../geodata/address_expansions/gazetteers.py | 71 +++++++++++++++---- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py index 493d817e..ea828155 100644 --- a/scripts/geodata/address_expansions/gazetteers.py +++ b/scripts/geodata/address_expansions/gazetteers.py @@ -1,6 +1,5 @@ import os -import sys -import ujson as json +import six from collections import defaultdict, OrderedDict @@ -119,6 +118,40 @@ class DictionaryPhraseFilter(PhraseFilter): c = token_types.PHRASE yield t, c, len(t), map(safe_decode, data) + def gen_phrases(self, s, canonical_only=False, languages=None): + tokens = tokenize(s) + norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens] + + if not languages: + languages = None + elif not hasattr(languages, '__contains__'): + languages = set([languages]) + + for t, c, length, data in self.filter(norm_tokens): + if c == token_types.PHRASE: + if not canonical_only and languages is None: + yield six.u(' ').join([t_i for t_i, c_i in t]) + else: + phrase = None + for d in data: + lang, dictionary, is_canonical, canonical = d.split(six.b('|')) + + if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages): + phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t]) + yield phrase + + def string_contains_phrases(self, s, canonical_only=False, languages=None): + phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages) + try: + phrases.next() + return True + except StopIteration: + return False + + def extract_phrases(self, s, canonical_only=False, languages=None): + return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages)) + + STREET_TYPES_DICTIONARIES = ('street_types', 'directionals', 'concatenated_suffixes_separable', @@ -138,18 +171,18 @@ CHAIN_DICTIONARY = 'chains' SYNONYM_DICTIONARY = 'synonyms' -NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY, - SURNAME_DICTIONARY,) +PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY, + SURNAME_DICTIONARY,) -NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees', - 'building_types', - 'company_types', - 'place_names', - 'qualifiers', - 'synonyms', - 'toponyms', - ) +NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees', + 'building_types', + 'company_types', + 'place_names', + 'qualifiers', + 'synonyms', + 'toponyms', + ) QUALIFIERS_DICTIONARY = 'qualifiers' @@ -177,9 +210,18 @@ UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement', 'unit_types_standalone', ) +VENUE_NAME_DICTIONARIES = ('academic_degrees', + 'building_types', + 'company_types', + 'organizations', + 'people', + 'personal_suffixes', + 'personal_titles', + 'place_names', + ) ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \ - NAME_ABBREVIATION_DICTIONARIES + \ + NAME_DICTIONARIES + \ UNIT_ABBREVIATION_DICTIONARIES + \ ('no_number', 'nulls',) @@ -195,7 +237,7 @@ def create_gazetteer(*dictionaries): street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES) qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY) -names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES) +names_gazetteer = create_gazetteer(*NAME_DICTIONARIES) chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY) unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES) street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, ))) @@ -203,3 +245,4 @@ abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES) toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES) toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY) given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY) +venue_name_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)