From acd953ce51707316f6bd62657074476f0f368306 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 29 Dec 2016 02:17:05 -0500 Subject: [PATCH] [parser] first pass at new parser feature extraction - removing geodb phrases - use Latin-ASCII-simple transliteration (no umlauts, etc.) - no digit normalization for admin component phrases and postcodes - tag = START + word, special feature for first word in the sequence - add the new admin boundary categories - for hyphenated non-phrase words, add each sub-word - for rare and unknown words, add ngram features of 3-6 characters with underscores to indicate beginnings and endings (similar to language classifier features) - defines notion of "rare words" (known words with a frequency <= n where n > the unknown word threshold), so known words can share statistical strength with artificial and real unknown words --- src/address_parser.c | 764 +++++++++++++++++++++++++++---------------- src/address_parser.h | 23 +- 2 files changed, 492 insertions(+), 295 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index 7b7ea47d..2306d85c 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -1,7 +1,7 @@ #include "address_parser.h" #include "address_dictionary.h" #include "features.h" -#include "geodb.h" +#include "ngrams.h" #include "scanner.h" #include "log/log.h" @@ -13,6 +13,8 @@ #define UNKNOWN_WORD "UNKNOWN" #define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC" +#define DEFAULT_RARE_WORD_THRESHOLD 50 + static address_parser_t *parser = NULL; //#define PRINT_ADDRESS_PARSER_FEATURES @@ -21,20 +23,29 @@ typedef enum { ADDRESS_PARSER_NULL_PHRASE, ADDRESS_PARSER_DICTIONARY_PHRASE, ADDRESS_PARSER_COMPONENT_PHRASE, - ADDRESS_PARSER_GEODB_PHRASE + ADDRESS_PARSER_PREFIX_PHRASE, + ADDRESS_PARSER_SUFFIX_PHRASE } address_parser_phrase_type_t; -address_parser_t *address_parser_new(void) { + +static parser_options_t PARSER_DEFAULT_OPTIONS = { + .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD +}; + +address_parser_t *address_parser_new_options(parser_options_t options) { address_parser_t *parser = malloc(sizeof(address_parser_t)); + parser->options = options; return parser; } +address_parser_t *address_parser_new(void) { + return address_parser_new_options(PARSER_DEFAULT_OPTIONS); +} address_parser_t *get_address_parser(void) { return parser; } - bool address_parser_save(address_parser_t *self, char *output_dir) { if (self == NULL || output_dir == NULL) return false; @@ -158,7 +169,7 @@ inline void address_parser_normalize_token(cstring_array *array, char *str, toke } inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) { - normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_PHRASE_TOKEN_OPTIONS); + normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS); } inline char *address_parser_normalize_string(char *str) { @@ -193,16 +204,24 @@ void address_parser_context_destroy(address_parser_context_t *self) { char_array_destroy(self->long_context_component_phrase); } - if (self->geodb_phrase != NULL) { - char_array_destroy(self->geodb_phrase); + if (self->prefix_phrase != NULL) { + char_array_destroy(self->prefix_phrase); } - if (self->context_geodb_phrase != NULL) { - char_array_destroy(self->context_geodb_phrase); + if (self->context_prefix_phrase != NULL) { + char_array_destroy(self->context_prefix_phrase); } - if (self->long_context_geodb_phrase != NULL) { - char_array_destroy(self->long_context_geodb_phrase); + if (self->suffix_phrase != NULL) { + 
char_array_destroy(self->suffix_phrase); + } + + if (self->context_suffix_phrase != NULL) { + char_array_destroy(self->context_suffix_phrase); + } + + if (self->ngrams != NULL) { + cstring_array_destroy(self->ngrams); } if (self->sub_token != NULL) { @@ -221,6 +240,18 @@ void address_parser_context_destroy(address_parser_context_t *self) { cstring_array_destroy(self->normalized); } + if (self->normalized_tokens != NULL) { + token_array_destroy(self->normalized_tokens); + } + + if (self->normalized_admin != NULL) { + cstring_array_destroy(self->normalized_admin); + } + + if (self->normalized_admin_tokens != NULL) { + token_array_destroy(self->normalized_admin_tokens); + } + if (self->features != NULL) { cstring_array_destroy(self->features); } @@ -237,14 +268,6 @@ void address_parser_context_destroy(address_parser_context_t *self) { int64_array_destroy(self->address_phrase_memberships); } - if (self->geodb_phrases != NULL) { - phrase_array_destroy(self->geodb_phrases); - } - - if (self->geodb_phrase_memberships != NULL) { - int64_array_destroy(self->geodb_phrase_memberships); - } - if (self->component_phrases != NULL) { phrase_array_destroy(self->component_phrases); } @@ -253,6 +276,14 @@ void address_parser_context_destroy(address_parser_context_t *self) { int64_array_destroy(self->component_phrase_memberships); } + if (self->prefix_phrases != NULL) { + phrase_array_destroy(self->prefix_phrases); + } + + if (self->suffix_phrases != NULL) { + phrase_array_destroy(self->suffix_phrases); + } + free(self); } @@ -294,18 +325,28 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } - context->geodb_phrase = char_array_new(); - if (context->geodb_phrase == NULL) { + context->prefix_phrase = char_array_new(); + if (context->prefix_phrase == NULL) { goto exit_address_parser_context_allocated; } - context->context_geodb_phrase = char_array_new(); - if (context->context_geodb_phrase == NULL) { + context->context_prefix_phrase = char_array_new(); + if (context->context_prefix_phrase == NULL) { goto exit_address_parser_context_allocated; } - context->long_context_geodb_phrase = char_array_new(); - if (context->long_context_geodb_phrase == NULL) { + context->suffix_phrase = char_array_new(); + if (context->suffix_phrase == NULL) { + goto exit_address_parser_context_allocated; + } + + context->context_suffix_phrase = char_array_new(); + if (context->context_suffix_phrase == NULL) { + goto exit_address_parser_context_allocated; + } + + context->ngrams = cstring_array_new(); + if (context->ngrams == NULL) { goto exit_address_parser_context_allocated; } @@ -329,6 +370,21 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->normalized_tokens = token_array_new(); + if (context->normalized_tokens == NULL) { + goto exit_address_parser_context_allocated; + } + + context->normalized_admin = cstring_array_new(); + if (context->normalized_admin == NULL) { + goto exit_address_parser_context_allocated; + } + + context->normalized_admin_tokens = token_array_new(); + if (context->normalized_admin_tokens == NULL) { + goto exit_address_parser_context_allocated; + } + context->features = cstring_array_new(); if (context->features == NULL) { goto exit_address_parser_context_allocated; @@ -349,16 +405,6 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } - context->geodb_phrases = phrase_array_new(); - if (context->geodb_phrases == NULL) { - 
goto exit_address_parser_context_allocated; - } - - context->geodb_phrase_memberships = int64_array_new(); - if (context->geodb_phrase_memberships == NULL) { - goto exit_address_parser_context_allocated; - } - context->component_phrases = phrase_array_new(); if (context->component_phrases == NULL) { goto exit_address_parser_context_allocated; @@ -369,6 +415,16 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->prefix_phrases = phrase_array_new(); + if (context->prefix_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + + context->suffix_phrases = phrase_array_new(); + if (context->suffix_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + return context; exit_address_parser_context_allocated: @@ -376,9 +432,30 @@ exit_address_parser_context_allocated: return NULL; } -void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { - int64_t i, j; +inline static void fill_phrase_memberships(phrase_array *phrases, int64_array *phrase_memberships, size_t len) { + int64_t i = 0; + for (int64_t j = 0; j < phrases->n; j++) { + phrase_t phrase = phrases->a[j]; + for (; i < phrase.start; i++) { + int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); + log_debug("token i=%lld, null phrase membership\n", i); + } + + for (i = phrase.start; i < phrase.start + phrase.len; i++) { + log_debug("token i=%lld, phrase membership=%lld\n", i, j); + int64_array_push(phrase_memberships, j); + } + } + + for (; i < len; i++) { + log_debug("token i=%lld, null phrase membership\n", i); + int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); + } +} + + +void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { uint32_t token_index; char *word; phrase_t phrase; @@ -387,16 +464,52 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars context->country = country; cstring_array *normalized = context->normalized; + token_array *normalized_tokens = context->normalized_tokens; cstring_array_clear(normalized); + token_array_clear(normalized_tokens); + + cstring_array *normalized_admin = context->normalized_admin; + token_array *normalized_admin_tokens = context->normalized_admin_tokens; + cstring_array_clear(normalized_admin); + token_array_clear(normalized_admin_tokens); char *str = tokenized_str->str; token_array *tokens = tokenized_str->tokens; cstring_array_foreach(tokenized_str->strings, token_index, word, { token_t token = tokens->a[token_index]; + + size_t token_offset = normalized->str->n; address_parser_normalize_token(normalized, str, token); + size_t token_len; + if (normalized->str->n > token_offset) { + token_len = normalized->str->n - 1 - token_offset; + } else { + token_len = 0; + } + token_t normalized_token; + normalized_token.offset = token_offset; + normalized_token.len = token_len; + normalized_token.type = token.type; + token_array_push(normalized_tokens, normalized_token); + + size_t admin_token_offset = normalized_admin->str->n; + address_parser_normalize_phrase_token(normalized_admin, str, token); + size_t admin_token_len; + if (normalized_admin->str->n > admin_token_offset) { + admin_token_len = normalized_admin->str->n - 1 - admin_token_offset; + } else { + admin_token_len = 0; + } + token_t normalized_admin_token; + normalized_admin_token.offset = 
admin_token_offset; + normalized_admin_token.len = admin_token_len; + normalized_admin_token.type = token.type; + token_array_push(normalized_admin_tokens, normalized_admin_token); }) + char *normalized_str = normalized->str->a; + char *normalized_str_admin = normalized_admin->str->a; /* Address dictionary phrases @@ -412,113 +525,50 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars phrase_array_clear(context->address_dictionary_phrases); int64_array_clear(context->address_phrase_memberships); - i = 0; phrase_array *address_dictionary_phrases = context->address_dictionary_phrases; int64_array *address_phrase_memberships = context->address_phrase_memberships; - if (search_address_dictionaries_tokens_with_phrases(str, tokens, context->language, &context->address_dictionary_phrases)) { - for (j = 0; j < address_dictionary_phrases->n; j++) { - phrase = address_dictionary_phrases->a[j]; + size_t num_tokens = tokens->n; - for (; i < phrase.start; i++) { - int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - log_debug("token i=%lld, null phrase membership\n", i); - } + bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, context->language, &context->address_dictionary_phrases); + fill_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, phrase membership=%lld\n", i, j); - int64_array_push(address_phrase_memberships, j); - } - } + for (size_t i = 0; i < num_tokens; i++) { + token_t token = tokens->a[i]; + + phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, language); + phrase_array_push(context->prefix_phrases, prefix_phrase); + + phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, language); + phrase_array_push(context->suffix_phrases, suffix_phrase); } - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null phrase membership\n", i); - int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + /* + Component phrases + ----------------- + Precomputed phrases for cities, states, countries, etc. from the training data - phrase_array_clear(context->geodb_phrases); - int64_array_clear(context->geodb_phrase_memberships); - - phrase_array *geodb_phrases = context->geodb_phrases; - int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships; - i = 0; - - if (search_geodb_tokens_with_phrases(str, tokens, &context->geodb_phrases)) { - for (j = 0; j < geodb_phrases->n; j++) { - phrase = geodb_phrases->a[j]; - - for (; i < phrase.start; i++) { - log_debug("token i=%lld, null geo phrase membership\n", i); - int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } - - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, geo phrase membership=%lld\n", i, j); - int64_array_push(geodb_phrase_memberships, j); - } - } - } - - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null geo phrase membership\n", i); - int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city + instead of a city_district), this may cause the parser to get confused. It will + penalize itself for getting the wrong answer when really the underlying data + is simply ambiguous. 
In the OSM training data a lot of work has been done to + ensure that there's little or no systematic mislabeling. As such, other data + sets shouldn't be added willy-nilly unless the labels are consistent. + */ phrase_array_clear(context->component_phrases); int64_array_clear(context->component_phrase_memberships); - i = 0; phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; - if (trie_search_tokens_with_phrases(parser->phrase_types, str, tokens, &component_phrases)) { - for (j = 0; j < component_phrases->n; j++) { - phrase = component_phrases->a[j]; - - for (; i < phrase.start; i++) { - log_debug("token i=%lld, null component phrase membership\n", i); - int64_array_push(component_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } - - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, component phrase membership=%lld\n", i, j); - int64_array_push(component_phrase_memberships, j); - } - } - } - - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null component phrase membership\n", i); - int64_array_push(component_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrase_types, normalized_str_admin, normalized_admin_tokens, &component_phrases); + fill_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens); } -static inline char *get_phrase_string_array(cstring_array *str, char_array *phrase_tokens, phrase_t phrase) { - char_array_clear(phrase_tokens); - - size_t phrase_end = phrase.start + phrase.len; - - for (int k = phrase.start; k < phrase_end; k++) { - char *w = cstring_array_get_string(str, k); - char_array_append(phrase_tokens, w); - if (k < phrase_end - 1) { - char_array_append(phrase_tokens, " "); - } - } - char_array_terminate(phrase_tokens); - - return char_array_get_string(phrase_tokens); -} - - -static inline char *get_phrase_string(tokenized_string_t *str, char_array *phrase_tokens, phrase_t phrase) { - return get_phrase_string_array(str->strings, phrase_tokens, phrase); -} - -static inline phrase_t get_phrase(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) { - if (phrases == NULL || phrase_memberships == NULL || i > phrases->n - 1) { +static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) { + if (phrases == NULL || phrase_memberships == NULL || i > phrase_memberships->n - 1) { return NULL_PHRASE; } @@ -542,9 +592,10 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t address_parser_phrase_t response; char *phrase_string = NULL; - phrase = get_phrase(context->address_dictionary_phrases, context->address_phrase_memberships, i); + phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i); + if (phrase.len > 0) { - phrase_string = get_phrase_string(tokenized, context->context_phrase, phrase), + phrase_string = cstring_array_get_phrase(context->normalized, context->context_phrase, phrase), response = (address_parser_phrase_t){ phrase_string, @@ -556,16 +607,12 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t address_parser_types_t types; - phrase = get_phrase(context->component_phrases, context->component_phrase_memberships, i); + phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i); if (phrase.len > 0) { types.value = phrase.data; 
uint32_t component_phrase_types = types.components; - if (component_phrase_types != ADDRESS_COMPONENT_POSTAL_CODE) { - phrase_string = get_phrase_string(tokenized, context->context_component_phrase, phrase); - } else { - phrase_string = get_phrase_string_array(context->normalized, context->context_component_phrase, phrase); - } + phrase_string = cstring_array_get_phrase(context->normalized_admin, context->context_component_phrase, phrase); response = (address_parser_phrase_t){ phrase_string, @@ -575,31 +622,56 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t return response; } - geodb_value_t geo; + phrase_t prefix_phrase = context->prefix_phrases->a[i]; + phrase_t suffix_phrase = context->suffix_phrases->a[i]; - phrase = get_phrase(context->geodb_phrases, context->geodb_phrase_memberships, i); - if (phrase.len > 0) { - geo.value = phrase.data; - uint32_t geodb_phrase_types = geo.components; - - if (geodb_phrase_types != GEONAMES_ADDRESS_COMPONENT_POSTCODE) { - phrase_string = get_phrase_string(tokenized, context->context_geodb_phrase, phrase); - } else { - phrase_string = get_phrase_string_array(context->normalized, context->context_geodb_phrase, phrase); - } - - response = (address_parser_phrase_t){ - phrase_string, - ADDRESS_PARSER_GEODB_PHRASE, - phrase - }; - return response; - - } + uint32_t expansion_index; + address_expansion_value_t *expansion_value; cstring_array *normalized = context->normalized; char *word = cstring_array_get_string(normalized, i); + token_t token = tokenized->tokens->a[i]; + + // Suffixes like straße, etc. + if (suffix_phrase.len > 0) { + expansion_index = suffix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + if (expansion_value->components & ADDRESS_STREET) { + char_array_clear(context->context_suffix_phrase); + size_t suffix_len = suffix_phrase.len; + char_array_add_len(context->context_suffix_phrase, word + (token.len - suffix_phrase.len), suffix_len); + char *suffix = char_array_get_string(context->suffix_phrase); + response = (address_parser_phrase_t){ + suffix, + ADDRESS_PARSER_SUFFIX_PHRASE, + suffix_phrase + }; + return response; + } + } + + // Prefixes like hinter, etc. + if (prefix_phrase.len > 0) { + expansion_index = prefix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + // Don't include elisions like l', d', etc. 
which are in the ADDRESS_ANY category + if (expansion_value->components ^ ADDRESS_ANY) { + char_array_clear(context->context_prefix_phrase); + size_t prefix_len = prefix_phrase.len; + char_array_add_len(context->context_prefix_phrase, word, prefix_len); + char *prefix = char_array_get_string(context->context_prefix_phrase); + response = (address_parser_phrase_t){ + prefix, + ADDRESS_PARSER_PREFIX_PHRASE, + prefix_phrase + }; + return response; + } + } + response = (address_parser_phrase_t){ word, ADDRESS_PARSER_NULL_PHRASE, @@ -618,7 +690,7 @@ static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start int64_t membership; if (direction == -1) { - for (size_t idx = start; idx >= 0; idx--) { + for (ssize_t idx = start; idx >= 0; idx--) { if (memberships[idx] != NULL_PHRASE_MEMBERSHIP) { return (int64_t)idx; } @@ -635,7 +707,31 @@ static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start return -1; } -static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string, char *prev2, char *prev) { + +static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) { + if (context == NULL) return -1; + + token_array *tokens = tokenized->tokens; + + if (tokens == NULL || start > tokens->n - 1) return -1; + + phrase_t phrase; + + for (size_t i = start; i < tokens->n; i++) { + if (context->address_phrase_memberships->a[i] == NULL_PHRASE_MEMBERSHIP && + context->component_phrase_memberships->a[i] == NULL_PHRASE_MEMBERSHIP) { + token_t token = tokens->a[i]; + if (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) { + return i; + } + } + } + + return -1; +} + + +static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) { if (phrase_types == component) { log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types); feature_array_add(features, 2, "unambiguous phrase type", phrase_type); @@ -645,6 +741,42 @@ static inline void add_phrase_features(cstring_array *features, uint32_t phrase_ } } +static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) { + if (features == NULL || ngrams == NULL) return false; + + size_t len = strlen(str); + + if (n == 0 || n > len - 1) return false; + + size_t ngram_num_chars_len = INT64_MAX_STRING_SIZE; + char ngram_num_chars[ngram_num_chars_len]; + sprintf(ngram_num_chars, "%zu", n); + + bool known_prefix = prefix_len > 0; + bool known_suffix = suffix_len > 0; + + cstring_array_clear(ngrams); + if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) { + return false; + } + + uint32_t idx; + char *ngram; + + if (feature_prefix != NULL) { + cstring_array_foreach(ngrams, idx, ngram, { + feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram); + }) + } else { + cstring_array_foreach(ngrams, idx, ngram, { + feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram); + }) + } + + return true; +} + + /* address_parser_features ----------------------- @@ -672,7 +804,7 @@ char *prev2: the predicted tag at index i - 2 */ -bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t i, char *prev, char *prev2) { +bool address_parser_features(void *self, void *ctx, tokenized_string_t 
*tokenized, uint32_t idx, char *prev, char *prev2) { if (self == NULL || ctx == NULL) return false; address_parser_t *parser = (address_parser_t *)self; @@ -684,8 +816,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize phrase_array *address_dictionary_phrases = context->address_dictionary_phrases; int64_array *address_phrase_memberships = context->address_phrase_memberships; - phrase_array *geodb_phrases = context->geodb_phrases; - int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships; phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; cstring_array *normalized = context->normalized; @@ -694,14 +824,16 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize cstring_array_clear(features); - token_t token = tokenized->tokens->a[i]; + token_array *tokens = tokenized->tokens; - ssize_t last_index = (ssize_t)i - 1; - ssize_t next_index = (ssize_t)i + 1; + token_t token = tokens->a[idx]; - char *word = cstring_array_get_string(normalized, i); + ssize_t last_index = (ssize_t)idx - 1; + ssize_t next_index = (ssize_t)idx + 1; + + char *word = cstring_array_get_string(normalized, idx); if (word == NULL) { - log_error("got NULL word at %d\n", i); + log_error("got NULL word at %d\n", idx); return false; } @@ -709,19 +841,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize log_debug("word=%s\n", word); - expansion_value_t expansion; - phrase_t phrase = NULL_PHRASE; char *phrase_string = NULL; - char *geo_phrase_string = NULL; char *component_phrase_string = NULL; - int64_t address_phrase_index = address_phrase_memberships->a[i]; + int64_t address_phrase_index = address_phrase_memberships->a[idx]; char_array *phrase_tokens = context->phrase; char_array *component_phrase_tokens = context->component_phrase; - char_array *geodb_phrase_tokens = context->geodb_phrase; + + uint32_t expansion_index; + address_expansion_value_t *expansion_value; bool add_word_feature = true; @@ -733,52 +864,29 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize last_index = (ssize_t)phrase.start - 1; next_index = (ssize_t)phrase.start + phrase.len; - expansion.value = phrase.data; - uint32_t address_phrase_types = expansion.components; - - log_debug("expansion=%d\n", expansion.value); + expansion_index = phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + uint32_t address_phrase_types = 0; + if (expansion_value != NULL) { + address_phrase_types = expansion_value->components; + } else { + log_warn("expansion_value is NULL. 
word=%s, sentence=%s\n", word, tokenized->str); + } if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME)) { - phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase); + phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase); add_word_feature = false; log_debug("phrase_string=%s\n", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev); - add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev); - add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string, prev2, prev); + add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string); + add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string); + add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string); } } - // Prefixes like hinter, etc. - phrase_t prefix_phrase = search_address_dictionaries_prefix(word, token.len, language); - if (prefix_phrase.len > 0) { - expansion.value = prefix_phrase.data; - // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category - if (expansion.components ^ ADDRESS_ANY) { - char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, word, prefix_phrase.len); - char *prefix = char_array_get_string(phrase_tokens); - log_debug("got prefix: %s\n", prefix); - feature_array_add(features, 2, "prefix", prefix); - } - } - - // Suffixes like straße, etc. - phrase_t suffix_phrase = search_address_dictionaries_suffix(word, token.len, language); - if (suffix_phrase.len > 0) { - expansion.value = suffix_phrase.data; - if (expansion.components & ADDRESS_STREET) { - char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, word + (token.len - suffix_phrase.len), suffix_phrase.len); - char *suffix = char_array_get_string(phrase_tokens); - log_debug("got suffix: %s\n", suffix); - feature_array_add(features, 2, "suffix", suffix); - } - } - - int64_t component_phrase_index = component_phrase_memberships->a[i]; + int64_t component_phrase_index = component_phrase_memberships->a[idx]; phrase = NULL_PHRASE; address_parser_types_t types; @@ -789,7 +897,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase = component_phrases->a[component_phrase_index]; - component_phrase_string = get_phrase_string(tokenized, component_phrase_tokens, phrase); + component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, phrase); types.value = phrase.data; uint32_t component_phrase_types = types.components; @@ -798,7 +906,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (last_index >= (ssize_t)phrase.start - 1 || next_index <= (ssize_t)phrase.start + phrase.len - 1) { last_index = (ssize_t)phrase.start - 1; next_index = (ssize_t)phrase.start + phrase.len; - } if (component_phrase_string != NULL && component_phrase_types ^ ADDRESS_COMPONENT_POSTAL_CODE) { @@ -807,113 +914,185 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } if (component_phrase_types > 0) { - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, 
"city", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_POSTAL_CODE, "postal_code", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string, prev2, prev); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_POSTAL_CODE, "postal_code", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string); } - if (most_common == ADDRESS_PARSER_CITY) { + if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { feature_array_add(features, 2, "commonly city", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_STATE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { feature_array_add(features, 2, "commonly state", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_COUNTRY) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { feature_array_add(features, 2, "commonly country", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_COUNTRY_REGION) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { feature_array_add(features, 2, "commonly country_region", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_STATE_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { feature_array_add(features, 2, "commonly state_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_ISLAND) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) { feature_array_add(features, 2, "commonly island", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_SUBURB) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { 
feature_array_add(features, 2, "commonly suburb", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_CITY_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { feature_array_add(features, 2, "commonly city_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_POSTAL_CODE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { feature_array_add(features, 2, "commonly postal_code", component_phrase_string); possible_postal_code = true; } } - int64_t geodb_phrase_index = geodb_phrase_memberships->a[i]; - - phrase = NULL_PHRASE; - geodb_value_t geo; - - // GeoDB phrases - if (component_phrase_index == NULL_PHRASE_MEMBERSHIP && geodb_phrase_index != NULL_PHRASE_MEMBERSHIP) { - phrase = geodb_phrases->a[geodb_phrase_index]; - - geo_phrase_string = get_phrase_string(tokenized, geodb_phrase_tokens, phrase); - geo.value = phrase.data; - uint32_t geodb_phrase_types = geo.components; - - if (last_index >= (ssize_t)phrase.start - 1 || next_index <= (ssize_t)phrase.start + phrase.len) { - last_index = (ssize_t)phrase.start - 1; - next_index = (ssize_t)phrase.start + phrase.len; - } - - if (geo_phrase_string != NULL && geodb_phrase_types ^ GEONAMES_ADDRESS_COMPONENT_POSTCODE) { - feature_array_add(features, 2, "phrase", geo_phrase_string); - add_word_feature = false; - } - - if (geodb_phrase_types ^ ADDRESS_ANY) { - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_LOCALITY, "gn city", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN1, "gn admin1", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN2, "gn admin2", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN3, "gn admin3", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN4, "gn admin4", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER, "gn admin other", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD, "gn neighborhood", geo_phrase_string, prev2, prev); - - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_COUNTRY, "gn country", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_POSTCODE, "gn postal code", geo_phrase_string, prev2, prev); - - } - - possible_postal_code = geodb_phrase_types & GEONAMES_ADDRESS_COMPONENT_POSTCODE; - - } - uint32_t word_freq = word_vocab_frequency(parser, word); + bool is_word = is_word_token(token.type); + bool is_unknown_word = false; + bool is_unknown = false; + + bool known_prefix = false; + bool known_suffix = false; + + size_t prefix_len = 0; + size_t suffix_len = 0; + + char *prefix = NULL; + char *suffix = NULL; if (add_word_feature) { // Bias unit, acts as an intercept feature_array_add(features, 1, "bias"); + phrase_t prefix_phrase = context->prefix_phrases->a[idx]; + phrase_t suffix_phrase = context->suffix_phrases->a[idx]; + + // Prefixes like hinter, etc. + if (prefix_phrase.len > 0) { + expansion_index = prefix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + // Don't include elisions like l', d', etc. 
which are in the ADDRESS_ANY category + if (expansion_value->components ^ ADDRESS_ANY) { + known_prefix = true; + char_array_clear(phrase_tokens); + prefix_len = prefix_phrase.len; + char_array_add_len(phrase_tokens, word, prefix_len); + prefix = char_array_get_string(phrase_tokens); + log_debug("got prefix: %s\n", prefix); + feature_array_add(features, 2, "prefix", prefix); + } + } + + // Suffixes like straße, etc. + if (suffix_phrase.len > 0) { + expansion_index = suffix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + if (expansion_value->components & ADDRESS_STREET) { + known_suffix = true; + char_array_clear(context->suffix_phrase); + suffix_len = suffix_phrase.len; + char_array_add_len(context->suffix_phrase, word + (token.len - suffix_phrase.len), suffix_len); + suffix = char_array_get_string(context->suffix_phrase); + log_debug("got suffix: %s\n", suffix); + feature_array_add(features, 2, "suffix", suffix); + } + } + + bool is_hyphenated = false; + + // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words) + if (word_freq <= parser->options.rare_word_threshold && is_word) { + bool ngrams_added = false; + size_t hyphenated_word_offset = 0; + bool first_sub_token = true; + bool last_sub_token = true; + + ssize_t next_hyphen_index; + + do { + next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset); + char *sub_word = word; + size_t sub_word_len = word_len; + + if (next_hyphen_index >= 0) { + is_hyphenated = true; + char_array_clear(context->sub_token); + char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index); + token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type}); + sub_word = char_array_get_string(context->sub_token); + sub_word_len = context->sub_token->n; + last_sub_token = false; + } else if (is_hyphenated) { + char_array_clear(context->sub_token); + char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset); + sub_word = char_array_get_string(context->sub_token); + sub_word_len = context->sub_token->n; + last_sub_token = true; + } + + bool add_prefix = first_sub_token && prefix_len < sub_word_len; + bool add_suffix = last_sub_token && suffix_len < sub_word_len; + + if (is_hyphenated) { + uint32_t sub_word_freq = word_vocab_frequency(parser, sub_word); + if (sub_word_freq > 0) { + feature_array_add(features, 2, "sub_word", sub_word); + } + } + + // N-gram features from 3-6 characters + for (size_t ng = 3; ng <= 6; ng++) { + ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0); + } + + hyphenated_word_offset += next_hyphen_index + 1; + first_sub_token = false; + } while(next_hyphen_index >= 0); + + } + if (word_freq > 0) { // The individual word feature_array_add(features, 2, "word", word); } else { log_debug("word not in vocab: %s\n", word); + + is_unknown = true; word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? 
UNKNOWN_WORD : UNKNOWN_NUMERIC; + + if (is_word_token(token.type)) { + is_unknown_word = true; + } } + + if (idx == 0) { + //feature_array_add(features, 1, "prev tag=START"); + feature_array_add(features, 2, "idx-1 tag=START+word", word); + //feature_array_add(features, 3, "prev tag=START+word+next word", word, next_word); + } + } else if (component_phrase_string != NULL) { word = component_phrase_string; - } else if (geo_phrase_string != NULL) { - word = geo_phrase_string; } else if (phrase_string != NULL) { word = phrase_string; } - if (prev != NULL && last_index == i - 1) { + if (prev != NULL && last_index == idx - 1) { // Previous tag and current word - feature_array_add(features, 3, "i-1 tag+word", prev, word); - feature_array_add(features, 2, "i-1 tag", prev); + feature_array_add(features, 3, "prev tag+word", prev, word); + feature_array_add(features, 2, "prev tag", prev); if (prev2 != NULL) { // Previous two tags and current word - feature_array_add(features, 4, "i-2 tag+i-1 tag+word", prev2, prev, word); - feature_array_add(features, 3, "i-2 tag+i-1 tag", prev2, prev); + feature_array_add(features, 4, "prev2 tag+prev tag+word", prev2, prev, word); + feature_array_add(features, 3, "prev2 tag+prev tag", prev2, prev); } } @@ -930,14 +1109,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } // Previous word - feature_array_add(features, 2, "i-1 word", prev_word); + feature_array_add(features, 2, "prev word", prev_word); - if (last_index == i - 1) { - feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word); + if (last_index == idx - 1) { + feature_array_add(features, 3, "prev tag+prev word", prev, prev_word); } // Previous word and current word - feature_array_add(features, 3, "i-1 word+word", prev_word, word); + feature_array_add(features, 3, "prev word+word", prev_word, word); } size_t num_tokens = tokenized->tokens->n; @@ -958,21 +1137,24 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } // Next word e.g. 
if the current word is unknown and the next word is "street" - feature_array_add(features, 2, "i+1 word", next_word); + feature_array_add(features, 2, "next word", next_word); // Current word and next word - feature_array_add(features, 3, "word+i+1 word", word, next_word); + feature_array_add(features, 3, "word+next word", word, next_word); + + // Prev tag, current word and next word + //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word); } #ifndef PRINT_ADDRESS_PARSER_FEATURES if (0) { #endif - uint32_t idx; + uint32_t fidx; char *feature; printf("{"); - cstring_array_foreach(features, idx, feature, { + cstring_array_foreach(features, fidx, feature, { printf(" %s, ", feature); }) printf("}\n"); @@ -1058,21 +1240,23 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c response = address_parser_response_new(); - if (most_common == ADDRESS_PARSER_CITY) { + if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { label = strdup(ADDRESS_PARSER_LABEL_CITY); - } else if (most_common == ADDRESS_PARSER_STATE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { label = strdup(ADDRESS_PARSER_LABEL_STATE); - } else if (most_common == ADDRESS_PARSER_COUNTRY) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { label = strdup(ADDRESS_PARSER_LABEL_COUNTRY); - } else if (most_common == ADDRESS_PARSER_COUNTRY_REGION) { - label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION); - } else if (most_common == ADDRESS_PARSER_STATE_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT); - } else if (most_common == ADDRESS_PARSER_SUBURB) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { + label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { label = strdup(ADDRESS_PARSER_LABEL_SUBURB); - } else if (most_common == ADDRESS_PARSER_CITY_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT); - } else if (most_common == ADDRESS_PARSER_POSTAL_CODE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) { + label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE); } diff --git a/src/address_parser.h b/src/address_parser.h index a00b9a8a..1bbc67cf 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -135,30 +135,42 @@ typedef struct address_parser_context { char_array *phrase; char_array *context_phrase; char_array *long_context_phrase; + char_array *prefix_phrase; + char_array *context_prefix_phrase; + char_array *suffix_phrase; + char_array *context_suffix_phrase; char_array *component_phrase; char_array *context_component_phrase; char_array *long_context_component_phrase; - char_array *geodb_phrase; - char_array *context_geodb_phrase; - char_array *long_context_geodb_phrase; + // ngrams and prefix/suffix features + cstring_array *ngrams; // For hyphenated words char_array *sub_token; token_array *sub_tokens; // Strings/arrays relating to the sentence uint32_array *separators; cstring_array *normalized; + token_array *normalized_tokens; + cstring_array *normalized_admin; + token_array *normalized_admin_tokens; // Known phrases phrase_array *address_dictionary_phrases; int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1 - phrase_array 
*geodb_phrases; - int64_array *geodb_phrase_memberships; // Index in gedob_phrases or -1 phrase_array *component_phrases; int64_array *component_phrase_memberships; // Index in component_phrases or -1 + phrase_array *prefix_phrases; + phrase_array *suffix_phrases; + // The tokenized string used to conveniently access both words as C strings and tokens by index tokenized_string_t *tokenized_str; } address_parser_context_t; +typedef struct parser_options { + uint64_t rare_word_threshold; +} parser_options_t; + // Can add other gazetteers as well typedef struct address_parser { + parser_options_t options; averaged_perceptron_t *model; trie_t *vocab; trie_t *phrase_types; @@ -167,6 +179,7 @@ typedef struct address_parser { // General usage address_parser_t *address_parser_new(void); +address_parser_t *address_parser_new_options(parser_options_t options); address_parser_t *get_address_parser(void); bool address_parser_load(char *dir);
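
The header changes above expose the rare-word cutoff as parser_options_t.rare_word_threshold (default 50 via DEFAULT_RARE_WORD_THRESHOLD) together with the new address_parser_new_options() constructor. A minimal usage sketch, assuming the existing address_parser_destroy() teardown and a purely illustrative threshold of 100:

#include "address_parser.h"

int main(void) {
    /* Words seen in training with frequency <= rare_word_threshold (but still
       above the unknown-word cutoff) receive the same n-gram features as
       unknown words, so rare and unknown words share statistical strength.
       100 here is hypothetical; the compiled-in default is 50. */
    parser_options_t options = {
        .rare_word_threshold = 100
    };

    address_parser_t *parser = address_parser_new_options(options);
    if (parser == NULL) return 1;

    /* ... load or train a model, which drives address_parser_features() ... */

    address_parser_destroy(parser);
    return 0;
}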
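
The n-gram features added for rare and unknown words use a leading/trailing underscore to mark grams that touch the beginning or end of a word, and the markers are suppressed when a known prefix or suffix has already been stripped. A standalone sketch of that convention, assuming single-byte characters; the real work happens in add_ngrams() (ngrams.h) writing into a cstring_array:

#include <stdio.h>
#include <string.h>

/* Illustrative only: prints the character n-grams of a word, prefixing "_"
 * when a gram starts the word and appending "_" when it ends the word,
 * mirroring the boundary convention described in the commit message. The
 * feature extractor does this for n = 3..6 on rare/unknown words. */
static void print_char_ngrams(const char *word, size_t n) {
    size_t len = strlen(word);
    if (n == 0 || n > len) return;

    for (size_t i = 0; i + n <= len; i++) {
        printf("%s%.*s%s\n",
               i == 0 ? "_" : "",          /* gram touches word start */
               (int)n, word + i,
               i + n == len ? "_" : "");   /* gram touches word end */
    }
}

int main(void) {
    for (size_t n = 3; n <= 6; n++) {
        print_char_ngrams("marktstrasse", n);
    }
    return 0;
}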
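
For hyphenated words below the rare-word threshold, the feature loop walks the token with string_next_hyphen_index() and emits each hyphen-separated piece (as a "sub_word" feature when that piece is in the vocabulary) before generating its n-grams. A simplified sketch of just the splitting step, using strchr instead of the libpostal helpers:

#include <stdio.h>
#include <string.h>

/* Sketch of the hyphen splitting only: the real code additionally checks
 * each sub-word's vocabulary frequency before adding the "sub_word" feature
 * and then produces 3- to 6-character n-grams for each piece. */
static void print_sub_words(const char *word) {
    const char *start = word;
    const char *hyphen;

    while ((hyphen = strchr(start, '-')) != NULL) {
        printf("sub_word=%.*s\n", (int)(hyphen - start), start);
        start = hyphen + 1;
    }
    if (*start != '\0') {
        printf("sub_word=%s\n", start);
    }
}

int main(void) {
    print_sub_words("saint-jean-de-luz");   /* -> saint, jean, de, luz */
    return 0;
}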
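
fill_phrase_memberships() factors out the token-to-phrase bookkeeping that was previously duplicated per phrase type: for every token index it records the index of the covering phrase, or NULL_PHRASE_MEMBERSHIP (-1 here) when no phrase covers it. A toy example with plain arrays standing in for phrase_array/int64_array, using a hypothetical six-token input such as "barboncino 781 franklin ave crown heights" with one dictionary phrase over tokens 2-3:

#include <stdio.h>

typedef struct { size_t start; size_t len; } toy_phrase_t;

int main(void) {
    /* One dictionary phrase covering tokens 2..3 ("franklin ave"). */
    toy_phrase_t phrases[] = {{2, 2}};
    size_t num_phrases = 1, num_tokens = 6;

    long long memberships[6];
    size_t i = 0;

    /* Same fill pattern as fill_phrase_memberships(): -1 before each phrase,
       the phrase index for every token the phrase spans, then -1 for any
       trailing tokens. */
    for (size_t j = 0; j < num_phrases; j++) {
        for (; i < phrases[j].start; i++) memberships[i] = -1;
        for (; i < phrases[j].start + phrases[j].len; i++) memberships[i] = (long long)j;
    }
    for (; i < num_tokens; i++) memberships[i] = -1;

    /* Prints: -1 -1 0 0 -1 -1 */
    for (i = 0; i < num_tokens; i++) printf("%lld ", memberships[i]);
    printf("\n");
    return 0;
}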