Initial fork commit
0
scripts/geodata/address_expansions/__init__.py
Normal file
233
scripts/geodata/address_expansions/abbreviations.py
Normal file
@@ -0,0 +1,233 @@
import random
import re
import six

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


LOWER, UPPER, TITLE, MIXED = range(4)


def token_capitalization(s):
    if s.istitle():
        return TITLE
    elif s.islower():
        return LOWER
    elif s.isupper():
        return UPPER
    else:
        return MIXED


expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')


def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
    expansion_tokens = expansion_token_regex.findall(expansion)

    if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
        expansion_tokenized = tokenize(expansion)
        is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
        if len(expansion) <= 3 or is_acronym:
            return expansion.upper()
        else:
            return expansion.title()
    elif len(tokens) == len(expansion_tokens):
        strings = []
        for (t, c), (e, suf) in zip(tokens, expansion_tokens):
            cap = token_capitalization(t)
            if suf == six.u(' '):
                suf = space_token
            if cap == LOWER:
                strings.append(six.u('').join((e.lower(), suf)))
            elif cap == UPPER:
                strings.append(six.u('').join((e.upper(), suf)))
            elif cap == TITLE:
                strings.append(six.u('').join((e.title(), suf)))
            elif t.lower() == e.lower():
                strings.append(t)
            else:
                strings.append(six.u('').join((e.title(), suf)))
        return six.u('').join(strings)
    else:
        strings = []
        for e, suf in expansion_tokens:
            strings.append(e.title())
            if suf == six.u(' '):
                strings.append(space_token)
            else:
                strings.append(suf)
        return six.u('').join(strings)


def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])

                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))

                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))

            i += len(t)

        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
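
Editor's note (not part of the commit): a minimal usage sketch for abbreviate() above, assuming the scripts/ directory is on the Python path and using the street_types_gazetteer defined later in this commit.

# Illustrative sketch only; module paths assume scripts/ is on sys.path.
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import street_types_gazetteer

# With abbreviate_prob=1.0 the canonical phrase is always swapped for a
# dictionary abbreviation when one matches, e.g. u'Main Street' -> u'Main St',
# preserving the original capitalization pattern via recase_abbreviation().
print(abbreviate(street_types_gazetteer, u'Main Street', 'en', abbreviate_prob=1.0))
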
254
scripts/geodata/address_expansions/address_dictionaries.py
Normal file
@@ -0,0 +1,254 @@
import os
import sys

from collections import defaultdict

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_encode, safe_decode

ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'dictionaries')

ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')

address_language_index_template = u'{{{language}, {index}, {length}}}'
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'


address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H

#include <stdlib.h>
#include <stdint.h>

#include "constants.h"
#include "gazetteers.h"

#define MAX_DICTIONARY_TYPES {max_dictionary_types}

typedef struct address_expansion_rule {{
    char *phrase;
    uint32_t num_dictionaries;
    dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
    int32_t canonical_index;
}} address_expansion_rule_t;

typedef struct address_language_index {{
    char language[MAX_LANGUAGE_LEN];
    uint32_t index;
    size_t len;
}} address_language_index_t;


#endif
'''

address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
    {canonical_strings}
}};

address_expansion_rule_t expansion_rules[] = {{
    {expansion_rules}
}};

address_language_index_t expansion_languages[] = {{
    {address_languages}
}};
'''


gazetteer_types = {
    'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
    'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
    'building_types': 'DICTIONARY_BUILDING_TYPE',
    'categories': 'DICTIONARY_CATEGORY',
    'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
    'chains': 'DICTIONARY_CHAIN',
    'company_types': 'DICTIONARY_COMPANY_TYPE',
    'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
    'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
    'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
    'cross_streets': 'DICTIONARY_CROSS_STREET',
    'directionals': 'DICTIONARY_DIRECTIONAL',
    'elisions': 'DICTIONARY_ELISION',
    'entrances': 'DICTIONARY_ENTRANCE',
    'given_names': 'DICTIONARY_GIVEN_NAME',
    'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
    'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
    'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
    'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
    'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
    'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
    'near': 'DICTIONARY_NEAR',
    'no_number': 'DICTIONARY_NO_NUMBER',
    'number': 'DICTIONARY_NUMBER',
    'nulls': 'DICTIONARY_NULL',
    'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
    'people': 'DICTIONARY_NAMED_PERSON',
    'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
    'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
    'place_names': 'DICTIONARY_PLACE_NAME',
    'post_office': 'DICTIONARY_POST_OFFICE',
    'postcodes': 'DICTIONARY_POSTAL_CODE',
    'qualifiers': 'DICTIONARY_QUALIFIER',
    'staircases': 'DICTIONARY_STAIRCASE',
    'stopwords': 'DICTIONARY_STOPWORD',
    'street_names': 'DICTIONARY_STREET_NAME',
    'street_types': 'DICTIONARY_STREET_TYPE',
    'surnames': 'DICTIONARY_SURNAME',
    'synonyms': 'DICTIONARY_SYNONYM',
    'toponyms': 'DICTIONARY_TOPONYM',
    'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
    'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
    'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',

}


class InvalidAddressFileException(Exception):
    pass


def read_dictionary_file(path):
    for i, line in enumerate(open(path)):
        line = safe_decode(line.rstrip())
        if not line.strip():
            continue

        if u'}' in line:
            raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
        phrases = line.split(u'|')

        if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
            raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

        yield phrases


def quote_string(s):
    return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))


class AddressPhraseDictionaries(object):
    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        self.base_dir = base_dir
        self.languages = []

        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
                dictionary_name = filename.split('.')[0].lower()

                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
                self.language_dictionaries[language].append(dictionary_name)

                path = os.path.join(language_dir, filename)
                for i, line in enumerate(open(path)):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue

                    if u'}' in line:
                        raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
                    phrases = line.split(u'|')

                    if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
                        raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

                    self.phrases[(language, dictionary_name)].append(phrases)

        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)


address_phrase_dictionaries = AddressPhraseDictionaries()


def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in address_phrase_dictionaries.languages:
        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
            dictionary_type = gazetteer_types[dictionary_name]

            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for i, p in enumerate(phrases):
                    language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
            rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
                                                                   num_dictionaries=str(len(dictionary_types)),
                                                                   dictionaries=', '.join(dictionary_types),
                                                                   canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(language=quote_string(language),
                                                                         index=language_index,
                                                                         length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = ADDRESS_EXPANSIONS_DIR

    create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
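
Editor's note (not part of the commit): the module above regenerates src/address_expansion_rule.h and src/address_expansion_data.c from the per-language dictionary files. A sketch of the call and the rough shape of its output, using a hypothetical en/street_types.txt line "street|st":

# Illustrative sketch only; the dictionary line "street|st" is hypothetical.
from geodata.address_expansions.address_dictionaries import create_address_expansion_rules_file

# Writes entries of roughly this shape into the generated C sources:
#   canonical_strings[]: "street"
#   expansion_rules[]:   {"street", 1, {DICTIONARY_STREET_TYPE}, -1},
#                        {"st", 1, {DICTIONARY_STREET_TYPE}, 0}
create_address_expansion_rules_file()
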
56
scripts/geodata/address_expansions/equivalence.py
Normal file
@@ -0,0 +1,56 @@
import random
import re
import six

from itertools import izip

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


def canonicals_for_language(data, language):
    canonicals = set()

    for d in data:
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals


def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    Tests whether two strings refer to the same phrase by normalizing
    both sides, comparing non-phrase tokens exactly and treating known
    abbreviations as equivalent to their canonical expansions.
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            if t1 != t2:
                return False
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            return False

    return True
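
Editor's note (not part of the commit): a sketch of how equivalent() is expected to behave, assuming the bundled English dictionaries map "st" to the canonical phrase "street".

# Illustrative sketch only; results depend on the shipped dictionaries.
from geodata.address_expansions.equivalence import equivalent
from geodata.address_expansions.gazetteers import street_types_gazetteer

# "Street" and "St" normalize to phrases sharing the canonical "street",
# while the remaining tokens must match exactly.
print(equivalent(u'Main Street', u'Main St', street_types_gazetteer, 'en'))   # expected: True
print(equivalent(u'Main Street', u'Elm Street', street_types_gazetteer, 'en'))  # expected: False
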
260
scripts/geodata/address_expansions/gazetteers.py
Normal file
@@ -0,0 +1,260 @@
import os
import six

from collections import defaultdict, OrderedDict

from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize, token_types
from geodata.text.phrases import PhraseFilter
from geodata.enum import EnumValue

from marisa_trie import BytesTrie


DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')

PREFIX_KEY = u'\x02'
SUFFIX_KEY = u'\x03'

POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
                               'm', 'mm', 'mmm', 'mmmm'])


class DictionaryPhraseFilter(PhraseFilter):
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)

        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        # Guard against the default languages=None before converting
        # non-container iterables (e.g. generators) to a set
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))

                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
                            phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
        try:
            phrases.next()
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))


STREET_TYPES_ONLY_DICTIONARIES = ('street_types',
                                  'directionals',
                                  'concatenated_suffixes_separable',
                                  'concatenated_suffixes_inseparable',
                                  'people',
                                  'personal_suffixes',
                                  'personal_titles',
                                  )

STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + ('concatenated_prefixes_separable',
                                                              'organizations',
                                                              'qualifiers',
                                                              'stopwords',
                                                              )

GIVEN_NAME_DICTIONARY = 'given_names'
SURNAME_DICTIONARY = 'surnames'

CHAIN_DICTIONARY = 'chains'

SYNONYM_DICTIONARY = 'synonyms'

PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
                              SURNAME_DICTIONARY,)


NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
                                                 'building_types',
                                                 'company_types',
                                                 'place_names',
                                                 'qualifiers',
                                                 'synonyms',
                                                 'toponyms',
                                                 )

QUALIFIERS_DICTIONARY = 'qualifiers'

HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')

POSTCODE_DICTIONARIES = ('postcode',)

TOPONYMS_DICTIONARY = 'toponyms'

TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers',
                                     'directionals',
                                     'personal_titles',
                                     'synonyms',
                                     )


UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
                                  'level_types_mezzanine',
                                  'level_types_numbered',
                                  'level_types_standalone',
                                  'level_types_sub_basement',
                                  'number',
                                  'post_office',
                                  'unit_types_numbered',
                                  'unit_types_standalone',
                                  )

VENUE_NAME_DICTIONARIES = ('academic_degrees',
                           'building_types',
                           'chains',
                           'company_types',
                           'directionals',
                           'given_names',
                           'organizations',
                           'people',
                           'personal_suffixes',
                           'personal_titles',
                           'place_names',
                           'stopwords',
                           'surnames',
                           )

ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
    NAME_DICTIONARIES + \
    UNIT_ABBREVIATION_DICTIONARIES + \
    ('no_number', 'nulls',)


_gazetteers = []


def create_gazetteer(*dictionaries):
    g = DictionaryPhraseFilter(*dictionaries)
    _gazetteers.append(g)
    return g


street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
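
Editor's note (not part of the commit): the module-level gazetteers instantiated above are DictionaryPhraseFilter objects; a sketch of phrase extraction, with output depending on the bundled dictionaries.

# Illustrative sketch only; the exact phrases returned depend on the
# dictionary contents shipped with the repository.
from geodata.address_expansions.gazetteers import street_types_gazetteer

print(street_types_gazetteer.extract_phrases(u'123 E Main St', languages='en'))
# expected to include directional and street-type phrases such as u'e' and u'st'
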