/* address_parser.h ---------------- International address parser, designed to use OSM training data, over 40M addresses formatted with the OpenCage address formatting templates: https://github.com/OpenCageData/address-formatting. This is a sequence modeling problem similar to e.g. part-of-speech tagging, named entity recognition, etc. in which we have a sequence of inputs (words/tokens) and want to predict a sequence of outputs (labeled part-of-address tags). This is a supervised learning model and the training data is created in the Python geodata package included with this repo. Example record: en us 123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode Where the fields are: {language, country, tagged address}. After training, the address parser can take as input a tokenized input string e.g. "123 Fake Street Brooklyn NY 12345" and parse it into: { "house_number": "123", "road": "Fake Street", "city": "Brooklyn", "state": "NY", "postcode": "12345" } The model used is a greedy averaged perceptron rather than something like a CRF since there's ample training data from OSM and the accuracy on this task is already very high with the simpler model. However, it is still worth investigating CRFs as they are relatively fast at prediction time for a small number of tags, can often achieve better performance and are robust to correlated features, which may not be true with the general error-driven averaged perceptron. */ #ifndef ADDRESS_PARSER_H #define ADDRESS_PARSER_H #include #include #include #include "averaged_perceptron.h" #include "averaged_perceptron_tagger.h" #include "libpostal.h" #include "libpostal_config.h" #include "collections.h" #include "normalize.h" #include "string_utils.h" #define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat" #define NULL_PHRASE_MEMBERSHIP -1 #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_SIMPLE_LATIN_ASCII #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_STRIP_ACCENTS #define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS #define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS #define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC #define ADDRESS_SEPARATOR_NONE 0 #define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0 #define ADDRESS_SEPARATOR_FIELD 1 << 1 #define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH|| (token_type) == SEMICOLON || (token_type) == PUNCT_OPEN || (token_type) == PUNCT_CLOSE ) #define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token.type) == INVALID_CHAR || (token.type) == PERIOD || (token_type) == COLON ) #define SEPARATOR_LABEL "sep" #define FIELD_SEPARATOR_LABEL "fsep" #define ADDRESS_COMPONENT_NON_BOUNDARY 1 << 0 #define ADDRESS_COMPONENT_SUBURB 1 << 3 #define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 4 #define ADDRESS_COMPONENT_CITY 1 << 5 #define ADDRESS_COMPONENT_ISLAND 1 << 7 #define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 8 #define ADDRESS_COMPONENT_STATE 1 << 9 #define ADDRESS_COMPONENT_POSTAL_CODE 1 << 10 #define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 11 #define ADDRESS_COMPONENT_COUNTRY 1 << 13 #define ADDRESS_COMPONENT_WORLD_REGION 1 << 14 typedef enum { ADDRESS_PARSER_BOUNDARY_NONE, ADDRESS_PARSER_BOUNDARY_SUBURB, ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT, ADDRESS_PARSER_BOUNDARY_CITY, ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT, ADDRESS_PARSER_BOUNDARY_ISLAND, ADDRESS_PARSER_BOUNDARY_STATE, ADDRESS_PARSER_BOUNDARY_POSTAL_CODE, ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION, ADDRESS_PARSER_BOUNDARY_COUNTRY, ADDRESS_PARSER_BOUNDARY_WORLD_REGION, NUM_ADDRESS_PARSER_BOUNDARY_TYPES } address_parser_boundary_components; #define ADDRESS_PARSER_LABEL_HOUSE "house" #define ADDRESS_PARSER_LABEL_HOUSE_NUMBER "house_number" #define ADDRESS_PARSER_LABEL_ROAD "road" #define ADDRESS_PARSER_LABEL_SUBURB "suburb" #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district" #define ADDRESS_PARSER_LABEL_CITY "city" #define ADDRESS_PARSER_LABEL_STATE_DISTRICT "state_district" #define ADDRESS_PARSER_LABEL_ISLAND "island" #define ADDRESS_PARSER_LABEL_STATE "state" #define ADDRESS_PARSER_LABEL_POSTAL_CODE "postcode" #define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region" #define ADDRESS_PARSER_LABEL_COUNTRY "country" #define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region" typedef union address_parser_types { uint32_t value; struct { uint32_t components:16; // Bitset of components uint32_t most_common:16; // Most common component as short integer enum value }; } address_parser_types_t; typedef struct address_parser_context { char *language; char *country; cstring_array *features; // Temporary strings used at each token during feature extraction char_array *phrase; char_array *context_phrase; char_array *long_context_phrase; char_array *prefix_phrase; char_array *context_prefix_phrase; char_array *suffix_phrase; char_array *context_suffix_phrase; char_array *component_phrase; char_array *context_component_phrase; char_array *long_context_component_phrase; // ngrams and prefix/suffix features cstring_array *ngrams; // For hyphenated words char_array *sub_token; token_array *sub_tokens; // Strings/arrays relating to the sentence uint32_array *separators; cstring_array *normalized; token_array *normalized_tokens; cstring_array *normalized_admin; token_array *normalized_admin_tokens; // Known phrases phrase_array *address_dictionary_phrases; int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1 phrase_array *component_phrases; int64_array *component_phrase_memberships; // Index in component_phrases or -1 phrase_array *prefix_phrases; phrase_array *suffix_phrases; // The tokenized string used to conveniently access both words as C strings and tokens by index tokenized_string_t *tokenized_str; } address_parser_context_t; typedef struct parser_options { uint64_t rare_word_threshold; bool print_features; } parser_options_t; // Can add other gazetteers as well typedef struct address_parser { parser_options_t options; averaged_perceptron_t *model; trie_t *vocab; trie_t *phrase_types; } address_parser_t; // General usage address_parser_t *address_parser_new(void); address_parser_t *address_parser_new_options(parser_options_t options); address_parser_t *get_address_parser(void); bool address_parser_load(char *dir); address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context); void address_parser_destroy(address_parser_t *self); char *address_parser_normalize_string(char *str); void address_parser_normalize_token(cstring_array *array, char *str, token_t token); address_parser_context_t *address_parser_context_new(void); void address_parser_context_destroy(address_parser_context_t *self); void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); // Feature function bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2); // I/O methods bool address_parser_load(char *dir); bool address_parser_save(address_parser_t *self, char *output_dir); // Module setup/teardown bool address_parser_module_setup(char *dir); void address_parser_module_teardown(void); #endif