[parser] Add long-context features that help classify the first token in the string by finding the relative positions of (a) the first numeric token and (b) the first street-level phrase, such as "Ave" or "Calle"

2017-02-14 18:42:51 -05:00
parent 08976c772e
commit 8eafc5730b
2 changed files with 164 additions and 26 deletions
--- a/src/address_parser.h
+++ b/src/address_parser.h
@@ -74,14 +74,13 @@ with the general error-driven averaged perceptron.
 #define SEPARATOR_LABEL "sep"
 #define FIELD_SEPARATOR_LABEL "fsep"

-#define ADDRESS_COMPONENT_NON_BOUNDARY 1 << 0
+#define ADDRESS_COMPONENT_NON_BOUNDARY 0
 #define ADDRESS_COMPONENT_SUBURB 1 << 3
 #define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 4
 #define ADDRESS_COMPONENT_CITY 1 << 5
 #define ADDRESS_COMPONENT_ISLAND 1 << 7
 #define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 8
 #define ADDRESS_COMPONENT_STATE 1 << 9
-#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 10
 #define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 11
 #define ADDRESS_COMPONENT_COUNTRY 1 << 13
 #define ADDRESS_COMPONENT_WORLD_REGION 1 << 14
@@ -136,8 +135,10 @@ typedef struct address_parser_context {
    char_array *long_context_phrase;
    char_array *prefix_phrase;
    char_array *context_prefix_phrase;
+    char_array *long_context_prefix_phrase;
    char_array *suffix_phrase;
    char_array *context_suffix_phrase;
+    char_array *long_context_suffix_phrase;
    char_array *component_phrase;
    char_array *context_component_phrase;
    char_array *long_context_component_phrase;