[parser] Digits are no longer normalized in parser phrases derived from the training data (for postcodes, etc.). Phrases now include the new island type, as well as house number phrases if any are valid. Adjacent words are now treated as full phrases if they are part of a multiword token like a city name. For hyphenated names like Carmel-by-the-Sea, a version in which the hyphens are replaced with spaces is also added to the phrase dictionary

2016-06-22 11:50:42 -04:00
parent 41ae742285
commit 44908ff95a
3 changed files with 439 additions and 146 deletions
--- a/src/address_parser.h
+++ b/src/address_parser.h
@@ -59,7 +59,8 @@ with the general error-driven averaged perceptron.
 #define NULL_PHRASE_MEMBERSHIP -1

 #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII
-#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS
+#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS
+#define ADDRESS_PARSER_NORMALIZE_PHRASE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS

 #define ADDRESS_SEPARATOR_NONE 0
 #define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0
@@ -77,10 +78,11 @@ with the general error-driven averaged perceptron.
 #define ADDRESS_COMPONENT_SUBURB 1 << 7
 #define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 8
 #define ADDRESS_COMPONENT_CITY 1 << 9
-#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 10
-#define ADDRESS_COMPONENT_STATE 1 << 11
-#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 12
-#define ADDRESS_COMPONENT_COUNTRY 1 << 13
+#define ADDRESS_COMPONENT_ISLAND 1 << 10
+#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 11
+#define ADDRESS_COMPONENT_STATE 1 << 12
+#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 13
+#define ADDRESS_COMPONENT_COUNTRY 1 << 14

 typedef enum {
    ADDRESS_PARSER_HOUSE,
@@ -90,6 +92,7 @@ typedef enum {
    ADDRESS_PARSER_CITY_DISTRICT,
    ADDRESS_PARSER_CITY,
    ADDRESS_PARSER_STATE_DISTRICT,
+    ADDRESS_PARSER_ISLAND,
    ADDRESS_PARSER_STATE,
    ADDRESS_PARSER_POSTAL_CODE,
    ADDRESS_PARSER_COUNTRY,
@@ -103,8 +106,9 @@ typedef enum {
 #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district"
 #define ADDRESS_PARSER_LABEL_CITY "city"
 #define ADDRESS_PARSER_LABEL_STATE_DISTRICT  "state_district"
+#define ADDRESS_PARSER_LABEL_ISLAND "island"
 #define ADDRESS_PARSER_LABEL_STATE  "state"
-#define ADDRESS_PARSER_LABEL_POSTAL_CODE  "postal_code"
+#define ADDRESS_PARSER_LABEL_POSTAL_CODE  "postcode"
 #define ADDRESS_PARSER_LABEL_COUNTRY  "country"

 typedef union address_parser_types {
@@ -120,20 +124,29 @@ typedef struct address_parser_context {
    char *language;
    char *country;
    cstring_array *features;
+    // Temporary strings used at each token during feature extraction
    char_array *phrase;
+    char_array *context_phrase;
+    char_array *long_context_phrase;
    char_array *component_phrase;
+    char_array *context_component_phrase;
+    char_array *long_context_component_phrase;
    char_array *geodb_phrase;
+    char_array *context_geodb_phrase;
+    char_array *long_context_geodb_phrase;
+    // For hyphenated words
+    char_array *sub_token;
+    token_array *sub_tokens;
+    // Strings/arrays relating to the sentence
    uint32_array *separators;
    cstring_array *normalized;
+    // Known phrases
    phrase_array *address_dictionary_phrases;
-    // Index in address_dictionary_phrases or -1
-    int64_array *address_phrase_memberships;
+    int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1
    phrase_array *geodb_phrases;
-    // Index in gedob_phrases or -1
-    int64_array *geodb_phrase_memberships;
+    int64_array *geodb_phrase_memberships; // Index in geodb_phrases or -1
    phrase_array *component_phrases;
-    // Index in component_phrases or -1
-    int64_array *component_phrase_memberships;
+    int64_array *component_phrase_memberships; // Index in component_phrases or -1
    tokenized_string_t *tokenized_str;
 } address_parser_context_t;