diff --git a/src/address_parser.c b/src/address_parser.c
index be12c362..608dc33c 100644
--- a/src/address_parser.c
+++ b/src/address_parser.c
@@ -406,6 +406,14 @@ void address_parser_context_destroy(address_parser_context_t *self) {
         cstring_array_destroy(self->features);
     }
 
+    if (self->prev_tag_features != NULL) {
+        cstring_array_destroy(self->prev_tag_features);
+    }
+
+    if (self->prev2_tag_features != NULL) {
+        cstring_array_destroy(self->prev2_tag_features);
+    }
+
     if (self->tokenized_str != NULL) {
         tokenized_string_destroy(self->tokenized_str);
     }
@@ -558,6 +566,16 @@ address_parser_context_t *address_parser_context_new(void) {
         goto exit_address_parser_context_allocated;
     }
 
+    context->prev_tag_features = cstring_array_new();
+    if (context->prev_tag_features == NULL) {
+        goto exit_address_parser_context_allocated;
+    }
+
+    context->prev2_tag_features = cstring_array_new();
+    if (context->prev2_tag_features == NULL) {
+        goto exit_address_parser_context_allocated;
+    }
+
     context->tokenized_str = tokenized_string_new();
     if (context->tokenized_str == NULL) {
         goto exit_address_parser_context_allocated;
@@ -999,13 +1017,15 @@ char *prev2: the predicted tag at index i - 2
 
 */
 
-bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx, char *prev, char *prev2) {
+bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
     if (self == NULL || ctx == NULL) return false;
 
     address_parser_t *parser = (address_parser_t *)self;
     address_parser_context_t *context = (address_parser_context_t *)ctx;
 
     cstring_array *features = context->features;
+    cstring_array *prev_tag_features = context->prev_tag_features;
+    cstring_array *prev2_tag_features = context->prev2_tag_features;
     char *language = context->language;
     char *country = context->country;
 
@@ -1020,6 +1040,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
     uint32_array *separators = context->separators;
 
     cstring_array_clear(features);
+    cstring_array_clear(prev_tag_features);
+    cstring_array_clear(prev2_tag_features);
 
     token_array *tokens = tokenized->tokens;
 
@@ -1366,7 +1388,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
 
         if (idx == 0) {
             feature_array_add(features, 2, "first word", word);
-            //feature_array_add(features, 3, "prev tag=START+word+next word", word, next_word);
+            //feature_array_add(features, 3, "first word+next word", word, next_word);
         }
 
     } else if (component_phrase_string != NULL) {
@@ -1375,16 +1397,15 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
         word = phrase_string;
     }
 
-    if (prev != NULL && last_index == idx - 1) {
+    if (last_index == idx - 1) {
         // Previous tag and current word
-        feature_array_add(features, 3, "prev tag+word", prev, word);
-        feature_array_add(features, 2, "prev tag", prev);
+        feature_array_add(prev_tag_features, 2, "prev tag+word", word);
+        feature_array_add(prev_tag_features, 1, "prev tag");
 
-        if (prev2 != NULL) {
-            // Previous two tags and current word
-            feature_array_add(features, 4, "prev2 tag+prev tag+word", prev2, prev, word);
-            feature_array_add(features, 3, "prev2 tag+prev tag", prev2, prev);
-        }
+        
+        // Previous two tags and current word (templates only; the tagger/trainer appends the prev2 and prev tags)
+        feature_array_add(prev2_tag_features, 2, "prev2 tag+prev tag+word", word);
+        feature_array_add(prev2_tag_features, 1, "prev2 tag+prev tag");
     }
 
     if (last_index >= 0) {
@@ -1405,7 +1426,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
 
 
         if (last_index == idx - 1) {
-            feature_array_add(features, 3, "prev tag+prev word", prev, prev_word);
+            feature_array_add(prev_tag_features, 2, "prev tag+prev word", prev_word);
         }
 
         // Previous word and current word
@@ -1542,19 +1563,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
         }
     }
 
-    if (parser->options.print_features) {
-        uint32_t fidx;
-        char *feature;
-
-        printf("{ ");
-        size_t num_features = cstring_array_num_strings(features);
-        cstring_array_foreach(context->features, fidx, feature, {
-            printf("%s", feature);
-            if (fidx < num_features - 1) printf(", ");
-        })
-        printf(" }\n");
-    }
-
     return true;
 
 }
@@ -1682,9 +1690,23 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c
 
     char *prev_label = NULL;
 
-    if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) {
+    if (averaged_perceptron_tagger_predict(model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, tokenized_str)) {
         response = address_parser_response_new();
 
+        if (parser->options.print_features) {
+            uint32_t fidx;
+            char *feature;
+
+            printf("{ ");
+            size_t num_features = cstring_array_num_strings(context->features);
+            cstring_array_foreach(context->features, fidx, feature, {
+                printf("%s", feature);
+                if (fidx < num_features - 1) printf(", ");
+            })
+            printf(" }\n");
+        }
+
+
         size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
 
         cstring_array *labels = cstring_array_new_size(num_strings);
diff --git a/src/address_parser.h b/src/address_parser.h
index 33e92a4a..b3df7837 100644
--- a/src/address_parser.h
+++ b/src/address_parser.h
@@ -129,6 +129,8 @@ typedef struct address_parser_context {
     char *language;
     char *country;
     cstring_array *features;
+    cstring_array *prev_tag_features;
+    cstring_array *prev2_tag_features;
     // Temporary strings used at each token during feature extraction
     char_array *phrase;
     char_array *context_phrase;
@@ -211,7 +213,7 @@ void address_parser_context_destroy(address_parser_context_t *self);
 void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country);
 
 // Feature function
-bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2);
+bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i);
 
 // I/O methods
 
diff --git a/src/address_parser_test.c b/src/address_parser_test.c
index 35cf1093..0a44ff40 100644
--- a/src/address_parser_test.c
+++ b/src/address_parser_test.c
@@ -80,7 +80,7 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse
 
         size_t starting_errors = result->num_errors;
 
-        if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, token_labels, &address_parser_features, data_set->tokenized_str)) {
+        if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, data_set->tokenized_str)) {
             uint32_t i;
             char *predicted;
             cstring_array_foreach(token_labels, i, predicted, {
diff --git a/src/address_parser_train.c b/src/address_parser_train.c
index 371c2518..71fcbb20 100644
--- a/src/address_parser_train.c
+++ b/src/address_parser_train.c
@@ -4,6 +4,7 @@
 #include "averaged_perceptron_trainer.h"
 #include "collections.h"
 #include "constants.h"
+#include "cooccurrences.h"
 #include "file_utils.h"
 #include "geodb.h"
 #include "shuffle.h"
@@ -702,9 +703,13 @@ address_parser_t *address_parser_init(char *filename) {
         }
     })
 
+    size_t hash_size;
+    const char *context_token;
+    bool sort_reverse = true;
+
     log_info("Creating phrase_types trie\n");
 
-    bool sort_reverse = true;
+    sort_reverse = true;
     char **phrase_keys = str_uint32_hash_sort_keys_by_value(phrase_counts, sort_reverse);
     if (phrase_keys == NULL) {
         log_error("phrase_keys == NULL\n");
@@ -713,7 +718,7 @@ address_parser_t *address_parser_init(char *filename) {
         goto exit_hashes_allocated;
     }
 
-    size_t hash_size = kh_size(phrase_counts);
+    hash_size = kh_size(phrase_counts);
     address_parser_types_array *phrase_types_array = address_parser_types_array_new_size(hash_size);
 
     for (size_t idx = 0; idx < hash_size; idx++) {
@@ -828,7 +833,6 @@ address_parser_t *address_parser_init(char *filename) {
     }
 
     khash_t(str_set) *context_phrases;
-    const char *context_token;
 
     uint32_t postal_code_id;
     uint32_t context_phrase_id;
@@ -970,7 +974,7 @@ bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trai
 
         address_parser_context_fill(context, self, data_set->tokenized_str, language, country);
 
-        bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels);
+        bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, &address_parser_features, data_set->tokenized_str, data_set->labels);
 
         if (!example_success) {
             log_error("Error training example\n");
diff --git a/src/averaged_perceptron_tagger.c b/src/averaged_perceptron_tagger.c
index 83c7324a..781b64e0 100644
--- a/src/averaged_perceptron_tagger.c
+++ b/src/averaged_perceptron_tagger.c
@@ -2,11 +2,11 @@
 #include "log/log.h"
 
 
-bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
+bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
 
     // Keep two tags of history in training
-    char *prev = START;
-    char *prev2 = START2;
+    char *prev = NULL;
+    char *prev2 = NULL;
 
     uint32_t prev_id = 0;
     uint32_t prev2_id = 0;
@@ -22,17 +22,26 @@ bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagg
 
         if (i > 1) {
             prev2 = cstring_array_get_string(model->classes, prev2_id);            
-        } else if (i == 1) {
-            prev2 = START;
         }
 
         log_debug("prev=%s, prev2=%s\n", prev, prev2);
 
-        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
+        if (!feature_function(tagger, context, tokenized, i)) {
             log_error("Could not add address parser features\n");
             return false;
         }
 
+        uint32_t fidx;
+        const char *feature;
+
+        cstring_array_foreach(prev_tag_features, fidx, feature, {
+            feature_array_add(features, 2, (char *)feature, prev);
+        })
+
+        cstring_array_foreach(prev2_tag_features, fidx, feature, {
+            feature_array_add(features, 3, (char *)feature, prev2, prev);
+        })
+
         uint32_t guess = averaged_perceptron_predict(model, features);
         char *predicted = cstring_array_get_string(model->classes, guess);
 
diff --git a/src/averaged_perceptron_tagger.h b/src/averaged_perceptron_tagger.h
index bd9bcfe4..eae2e40a 100644
--- a/src/averaged_perceptron_tagger.h
+++ b/src/averaged_perceptron_tagger.h
@@ -18,14 +18,15 @@ the current value.
 #include <string.h>
 
 #include "averaged_perceptron.h"
+#include "features.h"
 #include "tokens.h"
 
 #define START "START"
 #define START2 "START2"
 
-// Arguments:                              tagger, context, tokenized str, index, i-1 tag, i-2 tag
-typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t, char *, char *);
+// Arguments:                              tagger, context, tokenized str, index
+typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t);
 
-bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized);
+bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized);
 
 #endif
\ No newline at end of file
diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c
index f2ad4352..aa89a68c 100644
--- a/src/averaged_perceptron_trainer.c
+++ b/src/averaged_perceptron_trainer.c
@@ -320,7 +320,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se
     return true;
 }
 
-bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
     // Keep two tags of history in training
     char *prev = START;
     char *prev2 = START2;
@@ -353,7 +353,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
             prev2 = START;
         }
 
-        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
+        if (!feature_function(tagger, context, tokenized, i)) {
             log_error("Could not add address parser features\n");
             return false;
         }
@@ -365,6 +365,17 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
             return false;
         }
 
+        uint32_t fidx;
+        const char *feature;
+
+        cstring_array_foreach(prev_tag_features, fidx, feature, {
+            feature_array_add(features, 2, (char *)feature, prev);
+        })
+
+        cstring_array_foreach(prev2_tag_features, fidx, feature, {
+            feature_array_add(features, 3, (char *)feature, prev2, prev);
+        })
+
         uint32_t guess = averaged_perceptron_trainer_predict(self, features);
 
         // Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast
diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h
index a100f3df..aad0488b 100644
--- a/src/averaged_perceptron_trainer.h
+++ b/src/averaged_perceptron_trainer.h
@@ -36,6 +36,7 @@ Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
 #include "averaged_perceptron.h"
 #include "averaged_perceptron_tagger.h"
 #include "collections.h"
+#include "features.h"
 #include "string_utils.h"
 #include "tokens.h"
 #include "trie.h"
@@ -75,6 +76,8 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
                                                void *tagger,
                                                void *context,
                                                cstring_array *features,
+                                               cstring_array *prev_tag_features,
+                                               cstring_array *prev2_tag_features,
                                                ap_tagger_feature_function feature_function,
                                                tokenized_string_t *tokenized,
                                                cstring_array *labels