[parsing] Initial commit of the address parser, training/testing, feature function, I/O

2015-11-30 14:48:13 -05:00
parent e62eb1e697
commit 89677d94a3
7 changed files with 1602 additions and 4 deletions
--- a/src/address_parser.h
+++ b/src/address_parser.h
@@ -0,0 +1,134 @@
+/*
+address_parser.h
+----------------
+
+International address parser, designed to use OSM training data,
+over 40M addresses formatted with the OpenCage address formatting
+templates: https://github.com/OpenCageData/address-formatting.
+
+This is a sequence modeling problem similar to e.g. part-of-speech
+tagging, named entity recognition, etc. in which we have a sequence
+of inputs (words/tokens) and want to predict a sequence of outputs
+(labeled part-of-address tags). This is a supervised learning model
+and the training data is created in the Python geodata package
+included with this repo. Example record:
+
+en  us  123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode
+
+Where the fields are: {language, country, tagged address}.
+
+After training, the address parser can take as input a tokenized
+input string e.g. "123 Fake Street Brooklyn NY 12345" and parse
+it into:
+
+{
+    "house_number": "123",
+    "road": "Fake Street",
+    "city": "Brooklyn",
+    "state": "NY",
+    "postcode": "12345"
+}
+
+The model used is a greedy averaged perceptron rather than something
+like a CRF since there's ample training data from OSM and the accuracy
+on this task is already very high with the simpler model.
+
+However, it is still worth investigating CRFs as they are relatively fast
+at prediction time for a small number of tags, can often achieve better
+performance and are robust to correlated features, which may not be true
+with the general error-driven averaged perceptron.
+
+*/
+#ifndef ADDRESS_PARSER_H
+#define ADDRESS_PARSER_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "averaged_perceptron.h"
+#include "averaged_perceptron_tagger.h"
+#include "bloom.h"
+#include "libpostal_config.h"
+#include "collections.h"
+#include "normalize.h"
+#include "string_utils.h"
+
+#define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat"
+// Membership value meaning "no phrase" (the -1 mentioned in address_parser_context); parenthesized for safe expansion
+#define NULL_PHRASE_MEMBERSHIP (-1)
+// Normalization flags OR-ed together; parenthesized so that e.g. `opts & ..._OPTIONS` applies to the whole set
+#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
+#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_DELETE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS)
+// Separator flag values; parenthesized because unary ~/! and +,- bind tighter than << (e.g. `~FLAG`, `FLAG + 1`)
+#define ADDRESS_SEPARATOR_NONE 0
+#define ADDRESS_SEPARATOR_FIELD_INTERNAL (1 << 0)
+#define ADDRESS_SEPARATOR_FIELD (1 << 1)
+// Token-type predicates: each one tests only its own argument (no reliance on a caller variable named `token`)
+#define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH )
+#define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token_type) == INVALID_CHAR || (token_type) == PERIOD)
+// Label strings for separators
+#define SEPARATOR_LABEL "sep"
+#define FIELD_SEPARATOR_LABEL "fsep"
+
+typedef struct address_parser_context { // Working state handed to address_parser_parse() and address_parser_context_fill()
+    char *language; // presumably the language argument of address_parser_context_fill — TODO confirm
+    char *country; // presumably the country argument of address_parser_context_fill — TODO confirm
+    cstring_array *features;
+    char_array *phrase;
+    uint32_array *separators; // NOTE(review): looks like it holds ADDRESS_SEPARATOR_* values — confirm
+    cstring_array *normalized;
+    phrase_array *address_dictionary_phrases;
+    // Index in address_dictionary_phrases or -1 (cf. NULL_PHRASE_MEMBERSHIP)
+    int64_array *address_phrase_memberships;
+    phrase_array *geodb_phrases;
+    // Index in geodb_phrases or -1 (cf. NULL_PHRASE_MEMBERSHIP)
+    int64_array *geodb_phrase_memberships;
+    tokenized_string_t *tokenized_str; // presumably the tokenized_str argument of address_parser_context_fill — TODO confirm
+} address_parser_context_t;
+
+typedef struct address_parser_response { // Returned by address_parser_parse(); address_parser_response_destroy() takes it back
+    size_t num_components;
+    char **components; // presumably num_components strings, parallel to labels — TODO confirm
+    char **labels; // presumably one label (e.g. "road", "city") per component — TODO confirm
+} address_parser_response_t;
+
+// The parser object. Can add other gazetteers as well
+typedef struct address_parser {
+    averaged_perceptron_t *model; // the averaged perceptron described in the file header comment
+    trie_t *vocab; // NOTE(review): presumably the token vocabulary used by the feature function — confirm
+} address_parser_t;
+
+// General usage
+// NOTE(review): get_address_parser() presumably returns the instance created by module setup/load — confirm
+address_parser_t *address_parser_new(void);
+address_parser_t *get_address_parser(void);
+// address_parser_load() is declared once, under "I/O methods" below (redundant declaration removed here)
+
+void address_parser_response_destroy(address_parser_response_t *self);
+address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context);
+void address_parser_destroy(address_parser_t *self);
+// Normalization helpers; NOTE(review): ownership of the returned string is not visible from this header — confirm
+char *address_parser_normalize_string(char *str);
+void address_parser_normalize_token(cstring_array *array, char *str, token_t token);
+
+address_parser_context_t *address_parser_context_new(void);
+void address_parser_context_destroy(address_parser_context_t *self);
+// Presumably populates `context` from the tokenized string plus language/country — TODO confirm against the implementation
+void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country);
+
+// Feature function. self/ctx are untyped: presumably address_parser_t * and address_parser_context_t *; i a token index; prev/prev2 the two previous tags — TODO confirm
+bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2);
+
+// I/O methods
+// Both return bool (presumably true on success — TODO confirm); dir/output_dir are directory paths per their names
+bool address_parser_load(char *dir);
+bool address_parser_save(address_parser_t *self, char *output_dir);
+
+// Module setup/teardown
+// NOTE(review): setup takes a `dir` like address_parser_load and presumably forwards to it — confirm
+bool address_parser_module_setup(char *dir);
+void address_parser_module_teardown(void);
+
+
+#endif