[parsing] Initial commit of the address parser, training/testing, feature function, I/O
748
src/address_parser.c
Normal file
@@ -0,0 +1,748 @@
#include "address_parser.h"
#include "address_dictionary.h"
#include "features.h"
#include "geodb.h"
#include "scanner.h"

#include "log/log.h"

#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie"

#define UNKNOWN_WORD "UNKNOWN"

static address_parser_t *parser = NULL;


address_parser_t *address_parser_new(void) {
    // calloc so model and vocab start out NULL; address_parser_destroy and
    // the error paths in address_parser_load rely on that
    address_parser_t *parser = calloc(1, sizeof(address_parser_t));
    return parser;
}


address_parser_t *get_address_parser(void) {
    return parser;
}


bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    char_array *path = char_array_new_size(strlen(output_dir));

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_MODEL_FILENAME);
    char *model_path = char_array_get_string(path);

    if (!averaged_perceptron_save(self->model, model_path)) {
        char_array_destroy(path);
        return false;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    char *vocab_path = char_array_get_string(path);

    if (!trie_save(self->vocab, vocab_path)) {
        // destroy path on this error path too, so it doesn't leak
        char_array_destroy(path);
        return false;
    }

    char_array_destroy(path);

    return true;
}


bool address_parser_load(char *dir) {
    if (parser != NULL) return false;
    if (dir == NULL) {
        dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
    }

    char_array *path = char_array_new_size(strlen(dir));

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
    char *model_path = char_array_get_string(path);

    averaged_perceptron_t *model = averaged_perceptron_load(model_path);

    if (model == NULL) {
        char_array_destroy(path);
        return false;
    }

    parser = address_parser_new();
    parser->model = model;

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);

    char *vocab_path = char_array_get_string(path);

    trie_t *vocab = trie_load(vocab_path);

    if (vocab == NULL) {
        address_parser_destroy(parser);
        // reset the module-level pointer so it doesn't dangle
        parser = NULL;
        char_array_destroy(path);
        return false;
    }

    parser->vocab = vocab;

    char_array_destroy(path);
    return true;
}

void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;

    if (self->model != NULL) {
        averaged_perceptron_destroy(self->model);
    }

    if (self->vocab != NULL) {
        trie_destroy(self->vocab);
    }

    free(self);
}

static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t count = 0;
    if (!trie_get_data(parser->vocab, word, &count)) {
        return 0;
    }
    return count;
}

inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
}

inline char *address_parser_normalize_string(char *str) {
    return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
}


void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;

    if (self->phrase != NULL) {
        char_array_destroy(self->phrase);
    }

    if (self->separators != NULL) {
        uint32_array_destroy(self->separators);
    }

    if (self->normalized != NULL) {
        cstring_array_destroy(self->normalized);
    }

    if (self->features != NULL) {
        cstring_array_destroy(self->features);
    }

    if (self->tokenized_str != NULL) {
        tokenized_string_destroy(self->tokenized_str);
    }

    if (self->address_dictionary_phrases != NULL) {
        phrase_array_destroy(self->address_dictionary_phrases);
    }

    if (self->address_phrase_memberships != NULL) {
        int64_array_destroy(self->address_phrase_memberships);
    }

    if (self->geodb_phrases != NULL) {
        phrase_array_destroy(self->geodb_phrases);
    }

    if (self->geodb_phrase_memberships != NULL) {
        int64_array_destroy(self->geodb_phrase_memberships);
    }

    free(self);
}

address_parser_context_t *address_parser_context_new(void) {
    // calloc so a partially-initialized context can be passed safely to
    // address_parser_context_destroy from the error path below
    address_parser_context_t *context = calloc(1, sizeof(address_parser_context_t));

    if (context == NULL) return NULL;

    context->language = NULL;
    context->country = NULL;

    context->phrase = char_array_new();
    if (context->phrase == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->separators = uint32_array_new();
    if (context->separators == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->normalized = cstring_array_new();
    if (context->normalized == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->features = cstring_array_new();
    if (context->features == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->tokenized_str = tokenized_string_new();
    if (context->tokenized_str == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->address_dictionary_phrases = phrase_array_new();
    if (context->address_dictionary_phrases == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->address_phrase_memberships = int64_array_new();
    if (context->address_phrase_memberships == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->geodb_phrases = phrase_array_new();
    if (context->geodb_phrases == NULL) {
        goto exit_address_parser_context_allocated;
    }

    context->geodb_phrase_memberships = int64_array_new();
    if (context->geodb_phrase_memberships == NULL) {
        goto exit_address_parser_context_allocated;
    }

    return context;

exit_address_parser_context_allocated:
    address_parser_context_destroy(context);
    return NULL;
}
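
/*
address_parser_context_fill (below) normalizes every token and then records,
for each token index, which dictionary/geodb phrase (if any) covers it.
Illustrative example (not from the source): for the tokens
["franklin", "d", "roosevelt", "dr"], if a street-name phrase covers tokens
0-2, address_phrase_memberships would be [0, 0, 0, -1], where -1 is
NULL_PHRASE_MEMBERSHIP for the uncovered token.
*/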

void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country) {
    int64_t i, j;

    uint32_t token_index;
    char *word;
    phrase_t phrase;

    context->language = language;
    context->country = country;

    cstring_array *normalized = context->normalized;
    cstring_array_clear(normalized);

    char *str = tokenized_str->str;
    token_array *tokens = tokenized_str->tokens;

    cstring_array_foreach(tokenized_str->strings, token_index, word, {
        token_t token = tokens->a[token_index];
        address_parser_normalize_token(normalized, str, token);
    })

    phrase_array_clear(context->address_dictionary_phrases);
    int64_array_clear(context->address_phrase_memberships);

    i = 0;
    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;

    if (search_address_dictionaries_tokens_with_phrases(str, tokens, context->language, &context->address_dictionary_phrases)) {
        for (j = 0; j < address_dictionary_phrases->n; j++) {
            phrase = address_dictionary_phrases->a[j];

            for (; i < phrase.start; i++) {
                int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
                log_debug("token i=%lld, null phrase membership\n", i);
            }

            for (i = phrase.start; i < phrase.start + phrase.len; i++) {
                log_debug("token i=%lld, phrase membership=%lld\n", i, j);
                int64_array_push(address_phrase_memberships, j);
            }
        }
    }

    for (; i < tokens->n; i++) {
        log_debug("token i=%lld, null phrase membership\n", i);
        int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
    }

    phrase_array_clear(context->geodb_phrases);
    int64_array_clear(context->geodb_phrase_memberships);

    phrase_array *geodb_phrases = context->geodb_phrases;
    int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships;
    i = 0;

    if (search_geodb_tokens_with_phrases(str, tokens, &context->geodb_phrases)) {
        for (j = 0; j < geodb_phrases->n; j++) {
            phrase = geodb_phrases->a[j];

            for (; i < phrase.start; i++) {
                log_debug("token i=%lld, null geo phrase membership\n", i);
                int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
            }

            for (i = phrase.start; i < phrase.start + phrase.len; i++) {
                log_debug("token i=%lld, geo phrase membership=%lld\n", i, j);
                int64_array_push(geodb_phrase_memberships, j);
            }
        }
    }
    for (; i < tokens->n; i++) {
        log_debug("token i=%lld, null geo phrase membership\n", i);
        int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
    }
}


static inline char *get_phrase_string(tokenized_string_t *str, char_array *phrase_tokens, phrase_t phrase) {
    char_array_clear(phrase_tokens);

    size_t phrase_end = phrase.start + phrase.len;

    for (size_t k = phrase.start; k < phrase_end; k++) {
        char *w = tokenized_string_get_token(str, k);
        char_array_append(phrase_tokens, w);
        if (k < phrase_end - 1) {
            char_array_append(phrase_tokens, " ");
        }
    }
    char_array_terminate(phrase_tokens);

    return char_array_get_string(phrase_tokens);
}


/*
typedef struct adjacent_phrase {
    phrase_t phrase;
    uint32_t num_separators;
} adjacent_phrase_t;

#define NULL_ADJACENT_PHRASE (adjacent_phrase_t){NULL_PHRASE, 0};

static inline adjacent_phrase_t get_adjacent_phrase(int64_array *phrase_memberships, phrase_array *phrases, uint32_array *separator_positions, uint32_t i, int32_t direction) {
    uint32_t *separators = separator_positions->a;
    int64_t *memberships = phrase_memberships->a;

    uint32_t num_strings = (uint32_t)phrase_memberships->n;

    adjacent_phrase_t adjacent = NULL_ADJACENT_PHRASE;

    if (direction == -1) {
        // signed index: an unsigned idx >= 0 loop would never terminate
        for (int64_t idx = i; idx >= 0; idx--) {
            uint32_t separator = separators[idx];
            if (separator > ADDRESS_SEPARATOR_NONE) {
                adjacent.num_separators++;
            }

            int64_t membership = memberships[idx];
            if (membership != NULL_PHRASE_MEMBERSHIP) {
                adjacent.phrase = phrases->a[membership];
                break;
            }
        }
    } else if (direction == 1) {
        for (uint32_t idx = i; idx < num_strings; idx++) {
            uint32_t separator = separators[idx];
            if (separator > ADDRESS_SEPARATOR_NONE) {
                adjacent.num_separators++;
            }

            int64_t membership = memberships[idx];
            if (membership != NULL_PHRASE_MEMBERSHIP) {
                adjacent.phrase = phrases->a[membership];
                break;
            }
        }
    }

    return adjacent;
}
*/

static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string, char *prev2, char *prev) {
    if (phrase_types == component) {
        log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types);
        feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
        feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
    } else if (phrase_types & component) {
        feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
    }
}


/*
address_parser_features
-----------------------

This is a feature function similar to those found in MEMM and CRF models.

Follows the signature of an ap_feature_function so it can be called
as a function pointer by the averaged perceptron model.

Parameters:

address_parser_t *self: a pointer to the address_parser struct, which contains
word frequencies and perhaps other useful corpus-wide statistics.

address_parser_context_t *context: The context struct containing:
- phrase dictionary memberships for all the tokens
- country (if known)
- language (if known)
- features array

tokenized_string_t *tokenized: the sequence of tokens for parsing
uint32_t i: the current token index
char *prev: the predicted tag at index i - 1
char *prev2: the predicted tag at index i - 2

*/
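
/*
Illustrative sketch (not part of the model itself): for the input
"123 Fake Street Brooklyn NY 12345" at i = 1 ("fake"), with prev tag
"house_number", this function emits features along the lines of:

    bias
    word|fake                       (if "fake" is in the vocab, else UNKNOWN)
    i-1 tag|house_number
    i-1 tag+word|house_number|fake
    i-1 word|123
    word+i+1 word|fake|street

The exact rendered strings depend on how feature_array_add joins its
arguments.
*/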

bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t i, char *prev, char *prev2) {
    if (self == NULL || ctx == NULL) return false;

    address_parser_t *parser = (address_parser_t *)self;
    address_parser_context_t *context = (address_parser_context_t *)ctx;

    cstring_array *features = context->features;
    char *language = context->language;
    char *country = context->country;

    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;
    phrase_array *geodb_phrases = context->geodb_phrases;
    int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships;
    cstring_array *normalized = context->normalized;

    uint32_array *separators = context->separators;

    cstring_array_clear(features);

    // Bias unit, acts as an intercept
    feature_array_add(features, 1, "bias");

    char *original_word = tokenized_string_get_token(tokenized, i);

    token_t token = tokenized->tokens->a[i];

    ssize_t last_index = (ssize_t)i - 1;
    ssize_t next_index = (ssize_t)i + 1;

    char *word = cstring_array_get_string(normalized, i);
    if (word == NULL) {
        log_error("got NULL word at %u\n", i);
        return false;
    }

    char *current_word = word;

    log_debug("word=%s\n", word);

    expansion_value_t expansion;

    phrase_t phrase = NULL_PHRASE;

    char *phrase_string = NULL;
    char *geo_phrase_string = NULL;

    int64_t address_phrase_index = address_phrase_memberships->a[i];

    char_array *phrase_tokens = context->phrase;

    // Address dictionary phrases
    if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        phrase = address_dictionary_phrases->a[address_phrase_index];
        log_debug("phrase\n");

        last_index = (ssize_t)phrase.start - 1;
        next_index = (ssize_t)phrase.start + phrase.len;

        expansion.value = phrase.data;
        uint32_t address_phrase_types = expansion.components;

        log_debug("expansion=%d\n", expansion.value);

        if (address_phrase_types & (ADDRESS_STREET | ADDRESS_NAME)) {
            phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);

            if (phrase_string != NULL) {
                word = phrase_string;
            }

            log_debug("phrase_string=%s\n", phrase_string);

            add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev);
            add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev);
        }
    }

    // Prefixes like hinter, etc.
    phrase_t prefix_phrase = search_address_dictionaries_prefix(original_word, token.len, language);
    if (prefix_phrase.len > 0) {
        expansion.value = prefix_phrase.data;
        // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
        if (expansion.components ^ ADDRESS_ANY) {
            char_array_clear(phrase_tokens);
            char_array_add_len(phrase_tokens, original_word, prefix_phrase.len);
            char *prefix = char_array_get_string(phrase_tokens);
            log_debug("got prefix: %s\n", prefix);
            feature_array_add(features, 2, "prefix", prefix);
        }
    }

    // Suffixes like straße, etc.
    phrase_t suffix_phrase = search_address_dictionaries_suffix(original_word, token.len, language);
    if (suffix_phrase.len > 0) {
        expansion.value = suffix_phrase.data;
        if (expansion.components & ADDRESS_STREET) {
            char_array_clear(phrase_tokens);
            char_array_add_len(phrase_tokens, original_word + (token.len - suffix_phrase.len), suffix_phrase.len);
            char *suffix = char_array_get_string(phrase_tokens);
            log_debug("got suffix: %s\n", suffix);
            feature_array_add(features, 2, "suffix", suffix);
        }
    }

    int64_t geodb_phrase_index = geodb_phrase_memberships->a[i];

    phrase = NULL_PHRASE;
    geodb_value_t geo;

    // GeoDB phrases
    if (geodb_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        phrase = geodb_phrases->a[geodb_phrase_index];

        geo_phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);
        geo.value = phrase.data;
        uint32_t geodb_phrase_types = geo.components;

        if (last_index <= (ssize_t)phrase.start - 1 && next_index >= (ssize_t)phrase.start + phrase.len - 1) {
            last_index = (ssize_t)phrase.start - 1;
            next_index = (ssize_t)phrase.start + phrase.len;
            if (geo_phrase_string != NULL && geodb_phrase_types ^ ADDRESS_POSTAL_CODE) {
                word = geo_phrase_string;
            }
        }

        if (geodb_phrase_types ^ ADDRESS_ANY) {
            add_phrase_features(features, geodb_phrase_types, ADDRESS_LOCALITY, "city", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN1, "admin1", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN2, "admin2", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN3, "admin3", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN4, "admin4", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN_OTHER, "admin other", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_NEIGHBORHOOD, "neighborhood", geo_phrase_string, prev2, prev);

            add_phrase_features(features, geodb_phrase_types, ADDRESS_COUNTRY, "country", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_POSTAL_CODE, "postal code", geo_phrase_string, prev2, prev);
        }
    }

    uint32_t word_freq = word_vocab_frequency(parser, word);

    if (phrase_string == NULL && geo_phrase_string == NULL) {
        if (word_freq > 0) {
            // The individual word
            feature_array_add(features, 2, "word", word);
        } else {
            log_debug("word not in vocab: %s\n", original_word);
            word = UNKNOWN_WORD;
        }
    }

    if (prev != NULL) {
        // Previous tag and current word
        feature_array_add(features, 3, "i-1 tag+word", prev, current_word);
        feature_array_add(features, 2, "i-1 tag", prev);

        if (prev2 != NULL) {
            // Previous two tags and current word
            feature_array_add(features, 4, "i-2 tag+i-1 tag+word", prev2, prev, current_word);
            feature_array_add(features, 3, "i-2 tag+i-1 tag", prev2, prev);
        }
    }

    if (last_index >= 0) {
        char *prev_word = cstring_array_get_string(normalized, last_index);

        uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
        if (prev_word_freq == 0) {
            prev_word = UNKNOWN_WORD;
        }

        // Previous word
        feature_array_add(features, 2, "i-1 word", prev_word);
        // Previous tag + previous word
        if (last_index == (ssize_t)i - 1) {
            feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word);
        }
        // Previous word and current word
        feature_array_add(features, 3, "i-1 word+word", prev_word, word);
    }

    size_t num_tokens = tokenized->tokens->n;

    if (next_index < (ssize_t)num_tokens) {
        char *next_word = cstring_array_get_string(normalized, next_index);

        uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
        if (next_word_freq == 0) {
            next_word = UNKNOWN_WORD;
        }

        // Next word, e.g. if the current word is unknown and the next word is "street"
        feature_array_add(features, 2, "i+1 word", next_word);
        // Current word and next word
        feature_array_add(features, 3, "word+i+1 word", word, next_word);
    }

    return true;
}


address_parser_response_t *address_parser_response_new(void) {
    // calloc so num_components/components/labels start zeroed
    address_parser_response_t *response = calloc(1, sizeof(address_parser_response_t));
    return response;
}

void address_parser_response_destroy(address_parser_response_t *self) {
    if (self == NULL) return;

    for (size_t i = 0; i < self->num_components; i++) {
        if (self->components != NULL) {
            free(self->components[i]);
        }

        if (self->labels != NULL) {
            free(self->labels[i]);
        }
    }

    if (self->components != NULL) {
        free(self->components);
    }

    if (self->labels != NULL) {
        free(self->labels);
    }

    free(self);
}


address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) {
    if (address == NULL || context == NULL) return NULL;

    char *normalized = address_parser_normalize_string(address);
    bool is_normalized = normalized != NULL;
    if (!is_normalized) {
        normalized = address;
    }

    address_parser_t *parser = get_address_parser();
    averaged_perceptron_t *model = parser->model;

    token_array *tokens = tokenize(normalized);

    tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];
        if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
            uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
            continue;
        }

        tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
        uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
    }

    address_parser_context_fill(context, tokenized_str, language, country);

    cstring_array *token_labels = cstring_array_new_size(tokens->n);

    char *prev_label = NULL;

    address_parser_response_t *response = NULL;

    if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) {
        response = address_parser_response_new();

        size_t num_strings = cstring_array_num_strings(tokenized_str->strings);

        cstring_array *labels = cstring_array_new_size(num_strings);
        cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);

        for (size_t i = 0; i < num_strings; i++) {
            char *str = tokenized_string_get_token(tokenized_str, i);
            char *label = cstring_array_get_string(token_labels, i);

            // Start a new component whenever the predicted label changes
            if (prev_label == NULL || strcmp(label, prev_label) != 0) {
                cstring_array_add_string(labels, label);
                cstring_array_start_token(components);
            }

            if (prev_label != NULL && strcmp(label, prev_label) == 0) {
                cstring_array_cat_string(components, " ");
                cstring_array_cat_string(components, str);
            } else {
                cstring_array_append_string(components, str);
                cstring_array_terminate(components);
            }

            prev_label = label;
        }
        response->num_components = cstring_array_num_strings(components);
        response->components = cstring_array_to_strings(components);
        response->labels = cstring_array_to_strings(labels);
    }

    token_array_destroy(tokens);
    tokenized_string_destroy(tokenized_str);
    cstring_array_destroy(token_labels);

    return response;
}


bool address_parser_module_setup(char *dir) {
    if (parser == NULL) {
        return address_parser_load(dir);
    }
    return true;
}

void address_parser_module_teardown(void) {
    if (parser != NULL) {
        address_parser_destroy(parser);
    }
    parser = NULL;
}
134
src/address_parser.h
Normal file
@@ -0,0 +1,134 @@
/*
address_parser.h
----------------

International address parser, designed to use OSM training data,
over 40M addresses formatted with the OpenCage address formatting
templates: https://github.com/OpenCageData/address-formatting.

This is a sequence modeling problem similar to e.g. part-of-speech
tagging, named entity recognition, etc. in which we have a sequence
of inputs (words/tokens) and want to predict a sequence of outputs
(labeled part-of-address tags). This is a supervised learning model
and the training data is created in the Python geodata package
included with this repo. Example record:

en  us  123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode

Where the fields are: {language, country, tagged address}.

After training, the address parser can take as input a tokenized
input string e.g. "123 Fake Street Brooklyn NY 12345" and parse
it into:

{
    "house_number": "123",
    "road": "Fake Street",
    "city": "Brooklyn",
    "state": "NY",
    "postcode": "12345"
}

The model used is a greedy averaged perceptron rather than something
like a CRF, since there is ample training data from OSM and accuracy
on this task is already very high with the simpler model.

However, it is still worth investigating CRFs, as they are relatively
fast at prediction time for a small number of tags, can often achieve
better performance, and are robust to correlated features, which may
not be true of the general error-driven averaged perceptron.

*/
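/*
Example usage (an illustrative sketch, assuming the default model files are
present; error handling elided):

    address_parser_module_setup(NULL);
    address_parser_context_t *context = address_parser_context_new();

    address_parser_response_t *response = address_parser_parse(
        "123 Fake Street Brooklyn NY 12345", NULL, NULL, context);

    for (size_t i = 0; i < response->num_components; i++) {
        printf("%s: %s\n", response->labels[i], response->components[i]);
    }

    address_parser_response_destroy(response);
    address_parser_context_destroy(context);
    address_parser_module_teardown();
*/
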
#ifndef ADDRESS_PARSER_H
#define ADDRESS_PARSER_H

#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

#include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h"
#include "bloom.h"
#include "libpostal_config.h"
#include "collections.h"
#include "normalize.h"
#include "string_utils.h"

#define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat"

#define NULL_PHRASE_MEMBERSHIP -1

// Parenthesized so the combined flags expand safely in larger expressions
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_DELETE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS)

#define ADDRESS_SEPARATOR_NONE 0
#define ADDRESS_SEPARATOR_FIELD_INTERNAL (1 << 0)
#define ADDRESS_SEPARATOR_FIELD (1 << 1)

#define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH)
// Uses its token_type parameter (previously referenced token.type, which only
// compiled when a local variable named token happened to be in scope)
#define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token_type) == INVALID_CHAR || (token_type) == PERIOD)

#define SEPARATOR_LABEL "sep"
#define FIELD_SEPARATOR_LABEL "fsep"

typedef struct address_parser_context {
    char *language;
    char *country;
    cstring_array *features;
    char_array *phrase;
    uint32_array *separators;
    cstring_array *normalized;
    phrase_array *address_dictionary_phrases;
    // Index in address_dictionary_phrases or -1
    int64_array *address_phrase_memberships;
    phrase_array *geodb_phrases;
    // Index in geodb_phrases or -1
    int64_array *geodb_phrase_memberships;
    tokenized_string_t *tokenized_str;
} address_parser_context_t;

typedef struct address_parser_response {
    size_t num_components;
    char **components;
    char **labels;
} address_parser_response_t;

// Can add other gazetteers as well
typedef struct address_parser {
    averaged_perceptron_t *model;
    trie_t *vocab;
} address_parser_t;

// General usage

address_parser_t *address_parser_new(void);
address_parser_t *get_address_parser(void);

void address_parser_response_destroy(address_parser_response_t *self);
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context);
void address_parser_destroy(address_parser_t *self);

char *address_parser_normalize_string(char *str);
void address_parser_normalize_token(cstring_array *array, char *str, token_t token);

address_parser_context_t *address_parser_context_new(void);
void address_parser_context_destroy(address_parser_context_t *self);

void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country);

// Feature function
bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2);

// I/O methods

bool address_parser_load(char *dir);
bool address_parser_save(address_parser_t *self, char *output_dir);

// Module setup/teardown

bool address_parser_module_setup(char *dir);
void address_parser_module_teardown(void);


#endif
180
src/address_parser_io.c
Normal file
@@ -0,0 +1,180 @@
#include "address_parser_io.h"

address_parser_data_set_t *address_parser_data_set_init(char *filename) {
    address_parser_data_set_t *data_set = malloc(sizeof(address_parser_data_set_t));
    if (data_set == NULL) return NULL;
    data_set->f = fopen(filename, "r");
    if (data_set->f == NULL) {
        free(data_set);
        return NULL;
    }

    data_set->tokens = token_array_new();
    data_set->tokenized_str = NULL;
    data_set->labels = cstring_array_new();
    data_set->separators = uint32_array_new();
    data_set->language = char_array_new_size(MAX_LANGUAGE_LEN);
    data_set->country = char_array_new_size(MAX_COUNTRY_CODE_LEN);

    return data_set;
}
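
/*
Parses one training line into tokens, separators and labels. Each
whitespace-delimited item is token/label, split on the last '/', i.e. the
record format described in address_parser.h:

    123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode
*/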

bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels) {
    size_t count = 0;

    token_t token;

    uint32_t i = 0;
    char *str = NULL;

    cstring_array *pairs = cstring_array_split(input, " ", 1, &count);

    char *label = NULL;

    // First populate token array
    cstring_array_foreach(pairs, i, str, {
        char *last_separator = strrchr(str, (int)'/');

        if (last_separator == NULL) {
            log_error("All tokens must be delimited with '/'\n");
            cstring_array_destroy(pairs);
            return false;
        }

        uint32_t last_separator_index = last_separator - str;

        label = str + last_separator_index + 1;

        if (strcmp(label, FIELD_SEPARATOR_LABEL) == 0) {
            // Replace the separator recorded for the previous token
            uint32_array_pop(separators);
            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD | ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        } else if (strcmp(label, SEPARATOR_LABEL) == 0) {
            uint32_array_pop(separators);
            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        }

        token.offset = pairs->indices->a[i];
        token.len = last_separator_index;

        scanner_t scanner = scanner_from_string(input + token.offset, token.len);
        token.type = scan_token(&scanner);
        if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
            // shouldn't happen but just in case
            continue;
        } else {
            uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
        }

        cstring_array_add_string(labels, label);

        token_array_push(tokens, token);
    })

    cstring_array_destroy(pairs);

    return true;
}


bool address_parser_data_set_next(address_parser_data_set_t *data_set) {
    if (data_set == NULL) return false;

    char *line = file_getline(data_set->f);
    if (line == NULL) {
        return false;
    }

    size_t token_count;

    cstring_array *fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);

    free(line);

    if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) {
        log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count);
        cstring_array_destroy(fields);
        return false;
    }

    char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE);
    char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY);
    char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS);

    log_debug("Doing: %s\n", address);

    char *normalized = address_parser_normalize_string(address);
    bool is_normalized = normalized != NULL;
    if (!is_normalized) {
        log_debug("could not normalize\n");
        normalized = strdup(address);
    }

    log_debug("Normalized: %s\n", normalized);

    token_array *tokens = data_set->tokens;
    cstring_array *labels = data_set->labels;
    uint32_array *separators = data_set->separators;

    token_array_clear(tokens);
    cstring_array_clear(labels);
    uint32_array_clear(separators);

    char_array_clear(data_set->country);
    char_array_add(data_set->country, country);

    char_array_clear(data_set->language);
    char_array_add(data_set->language, language);

    tokenized_string_t *tokenized_str = NULL;

    if (address_parser_data_set_tokenize_line(normalized, tokens, separators, labels)) {
        // Add tokens as discrete strings for easier use in feature functions
        bool copy_tokens = true;
        tokenized_str = tokenized_string_from_tokens(normalized, tokens, copy_tokens);
    }

    data_set->tokenized_str = tokenized_str;

    cstring_array_destroy(fields);

    return tokenized_str != NULL;
}


void address_parser_data_set_destroy(address_parser_data_set_t *self) {
    if (self == NULL) return;

    if (self->f != NULL) {
        fclose(self->f);
    }

    if (self->tokens != NULL) {
        token_array_destroy(self->tokens);
    }

    if (self->tokenized_str != NULL) {
        // destroy any example the caller didn't consume
        tokenized_string_destroy(self->tokenized_str);
    }

    if (self->labels != NULL) {
        cstring_array_destroy(self->labels);
    }

    if (self->separators != NULL) {
        uint32_array_destroy(self->separators);
    }

    if (self->language != NULL) {
        char_array_destroy(self->language);
    }

    if (self->country != NULL) {
        char_array_destroy(self->country);
    }

    free(self);
}
40
src/address_parser_io.h
Normal file
@@ -0,0 +1,40 @@
#ifndef ADDRESS_PARSER_IO_H
#define ADDRESS_PARSER_IO_H

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#include "address_parser.h"
#include "collections.h"
#include "file_utils.h"
#include "scanner.h"
#include "string_utils.h"

#define AMBIGUOUS_LANGUAGE "xxx"
#define UNKNOWN_LANGUAGE "unk"

enum address_parser_training_data_fields {
    ADDRESS_PARSER_FIELD_LANGUAGE,
    ADDRESS_PARSER_FIELD_COUNTRY,
    ADDRESS_PARSER_FIELD_ADDRESS,
    ADDRESS_PARSER_FILE_NUM_TOKENS
};

typedef struct address_parser_data_set {
    FILE *f;
    token_array *tokens;
    tokenized_string_t *tokenized_str;
    cstring_array *labels;
    uint32_array *separators;
    char_array *language;
    char_array *country;
} address_parser_data_set_t;
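
// Typical iteration pattern: call address_parser_data_set_init, then
// address_parser_data_set_next until it returns false. Each successful call
// leaves a fresh data_set->tokenized_str, which the caller destroys (see
// address_parser_train.c / address_parser_test.c).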

address_parser_data_set_t *address_parser_data_set_init(char *filename);
bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels);
bool address_parser_data_set_next(address_parser_data_set_t *data_set);
void address_parser_data_set_destroy(address_parser_data_set_t *self);

#endif
196
src/address_parser_test.c
Normal file
@@ -0,0 +1,196 @@
#include "address_parser.h"
#include "address_parser_io.h"
#include "address_dictionary.h"
#include "averaged_perceptron_trainer.h"
#include "collections.h"
#include "constants.h"
#include "file_utils.h"
#include "geodb.h"
#include "normalize.h"

#include "log/log.h"


typedef struct address_parser_test_results {
    size_t num_errors;
    size_t num_predictions;
    size_t num_address_errors;
    size_t num_address_predictions;
    uint32_t *confusion;
} address_parser_test_results_t;
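
// confusion is a flattened num_classes x num_classes matrix, indexed as
// [predicted_class * num_classes + truth_class], counting mislabeled tokens
// (see address_parser_test below).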


uint32_t get_class_index(address_parser_t *parser, char *name) {
    uint32_t i;
    char *str;

    cstring_array_foreach(parser->model->classes, i, str, {
        if (strcmp(name, str) == 0) {
            return i;
        }
    })

    return parser->model->num_classes;
}

#define EMPTY_ADDRESS_PARSER_TEST_RESULT (address_parser_test_results_t){0, 0, 0, 0, NULL}

bool address_parser_test(address_parser_t *parser, char *filename, address_parser_test_results_t *result) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return false;
    }

    uint32_t num_classes = parser->model->num_classes;

    result->confusion = calloc(num_classes * num_classes, sizeof(uint32_t));

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);

    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        free(result->confusion);
        result->confusion = NULL;
        return false;
    }

    address_parser_context_t *context = address_parser_context_new();

    while (address_parser_data_set_next(data_set)) {
        char *language = char_array_get_string(data_set->language);
        if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) {
            language = NULL;
        }
        char *country = char_array_get_string(data_set->country);

        address_parser_context_fill(context, data_set->tokenized_str, language, country);

        cstring_array *token_labels = cstring_array_new_size(data_set->tokenized_str->strings->str->n);

        size_t starting_errors = result->num_errors;

        if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, token_labels, &address_parser_features, data_set->tokenized_str)) {
            uint32_t i;
            char *predicted;
            cstring_array_foreach(token_labels, i, predicted, {
                char *truth = cstring_array_get_string(data_set->labels, i);

                if (strcmp(predicted, truth) != 0) {
                    result->num_errors++;

                    uint32_t predicted_index = get_class_index(parser, predicted);
                    uint32_t truth_index = get_class_index(parser, truth);

                    result->confusion[predicted_index * num_classes + truth_index]++;
                }
                result->num_predictions++;
            })
        }

        cstring_array_destroy(token_labels);

        if (result->num_errors > starting_errors) {
            result->num_address_errors++;
        }

        result->num_address_predictions++;

        if (result->num_address_predictions % 1000 == 0 && result->num_address_predictions > 0) {
            log_info("Did %zu examples\n", result->num_address_predictions);
        }

        tokenized_string_destroy(data_set->tokenized_str);
        data_set->tokenized_str = NULL;
    }

    address_parser_data_set_destroy(data_set);
    address_parser_context_destroy(context);

    return true;
}


int main(int argc, char **argv) {
    char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR;

    if (argc < 2) {
        log_error("Usage: ./address_parser_test filename [parser_dir]\n");
        exit(EXIT_FAILURE);
    }

    char *filename = argv[1];

    if (argc > 2) {
        address_parser_dir = argv[2];
    }

    if (!address_dictionary_module_setup(NULL)) {
        log_error("Could not load address dictionaries\n");
        exit(EXIT_FAILURE);
    }

    log_info("address dictionary module loaded\n");

    if (!geodb_module_setup(NULL)) {
        log_error("Could not load geodb dictionaries\n");
        exit(EXIT_FAILURE);
    }

    log_info("geodb module loaded\n");

    if (!address_parser_load(address_parser_dir)) {
        log_error("Could not initialize parser\n");
        exit(EXIT_FAILURE);
    }

    log_info("Finished initialization\n");

    address_parser_t *parser = get_address_parser();

    address_parser_test_results_t results = EMPTY_ADDRESS_PARSER_TEST_RESULT;

    if (!address_parser_test(parser, filename, &results)) {
        log_error("Error in testing\n");
        exit(EXIT_FAILURE);
    }

    printf("Errors: %zu / %zu (%f%%)\n", results.num_errors, results.num_predictions, (double)results.num_errors / results.num_predictions * 100.0);
    printf("Addresses: %zu / %zu (%f%%)\n\n", results.num_address_errors, results.num_address_predictions, (double)results.num_address_errors / results.num_address_predictions * 100.0);

    printf("Confusion matrix:\n\n");
    uint32_t num_classes = parser->model->num_classes;
    for (uint32_t i = 0; i < num_classes; i++) {
        for (uint32_t j = 0; j < num_classes; j++) {
            if (i == j) {
                continue;
            }
            uint32_t class_errors = results.confusion[i * num_classes + j];

            if (class_errors > 0) {
                char *predicted = cstring_array_get_string(parser->model->classes, i);
                char *truth = cstring_array_get_string(parser->model->classes, j);

                printf("(%s, %s): %d\n", predicted, truth, class_errors);
            }
        }
    }

    free(results.confusion);

    address_parser_module_teardown();

    address_dictionary_module_teardown();
    geodb_module_teardown();
}
300
src/address_parser_train.c
Normal file
@@ -0,0 +1,300 @@
#include "address_parser.h"
#include "address_parser_io.h"
#include "address_dictionary.h"
#include "averaged_perceptron_trainer.h"
#include "collections.h"
#include "constants.h"
#include "file_utils.h"
#include "geodb.h"
#include "shuffle.h"

#include "log/log.h"

// Training

#define DEFAULT_ITERATIONS 5

#define MIN_VOCAB_COUNT 5
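
// Tokens seen fewer than MIN_VOCAB_COUNT times across the training set are
// dropped from the vocabulary; at feature-extraction time such words fall
// back to UNKNOWN_WORD.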

address_parser_t *address_parser_init(char *filename) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return NULL;
    }

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);

    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        return NULL;
    }

    address_parser_t *parser = address_parser_new();
    if (parser == NULL) {
        log_error("Error allocating parser\n");
        address_parser_data_set_destroy(data_set);
        return NULL;
    }

    khash_t(str_uint32) *vocab = kh_init(str_uint32);

    khiter_t k;
    char *str;

    uint32_t vocab_size = 0;
    size_t examples = 0;

    const char *word;

    uint32_t i;
    char *token;
    char *normalized;
    uint32_t count;

    char_array *token_array = char_array_new();

    while (address_parser_data_set_next(data_set)) {
        tokenized_string_t *tokenized_str = data_set->tokenized_str;

        if (tokenized_str == NULL) {
            log_error("tokenized str is NULL\n");
            kh_foreach(vocab, word, count, {
                free((char *)word);
            })
            kh_destroy(str_uint32, vocab);
            char_array_destroy(token_array);
            address_parser_data_set_destroy(data_set);
            address_parser_destroy(parser);
            // this function returns a pointer, so NULL (not false) on error
            return NULL;
        }

        str = tokenized_str->str;

        cstring_array_foreach(tokenized_str->strings, i, token, {
            token_t t = tokenized_str->tokens->a[i];

            char_array_clear(token_array);
            add_normalized_token(token_array, str, t, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
            if (token_array->n == 0) {
                continue;
            }

            normalized = char_array_get_string(token_array);

            k = kh_get(str_uint32, vocab, normalized);
            if (k == kh_end(vocab)) {
                int ret;
                char *key = strdup(normalized);
                k = kh_put(str_uint32, vocab, key, &ret);
                if (ret < 0) {
                    log_error("Error in kh_put\n");
                    free(key);
                    tokenized_string_destroy(tokenized_str);
                    kh_foreach(vocab, word, count, {
                        free((char *)word);
                    })
                    kh_destroy(str_uint32, vocab);
                    char_array_destroy(token_array);
                    address_parser_data_set_destroy(data_set);
                    address_parser_destroy(parser);
                    return NULL;
                }
                kh_value(vocab, k) = 1;
                vocab_size++;
            } else {
                kh_value(vocab, k)++;
            }
        })

        tokenized_string_destroy(tokenized_str);
        data_set->tokenized_str = NULL;
        examples++;
        if (examples % 10000 == 0 && examples != 0) {
            log_info("Counting vocab: did %zu examples\n", examples);
        }
    }

    log_debug("Done with vocab, total size=%d\n", vocab_size);

    for (k = kh_begin(vocab); k != kh_end(vocab); ++k) {
        // check kh_exist before reading the key for this bucket
        if (!kh_exist(vocab, k)) {
            continue;
        }
        char *word = (char *)kh_key(vocab, k);
        uint32_t count = kh_value(vocab, k);
        if (count < MIN_VOCAB_COUNT) {
            kh_del(str_uint32, vocab, k);
            free(word);
        }
    }

    parser->vocab = trie_new_from_hash(vocab);

    for (k = kh_begin(vocab); k != kh_end(vocab); ++k) {
        if (!kh_exist(vocab, k)) {
            continue;
        }
        char *word = (char *)kh_key(vocab, k);
        free(word);
    }

    kh_destroy(str_uint32, vocab);

    char_array_destroy(token_array);
    address_parser_data_set_destroy(data_set);
    if (parser->vocab == NULL) {
        log_error("Error initializing vocabulary\n");
        address_parser_destroy(parser);
        return NULL;
    }

    return parser;
}



bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trainer_t *trainer, char *filename) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return false;
    }

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);
    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        return false;
    }

    address_parser_context_t *context = address_parser_context_new();

    bool success = false;

    size_t examples = 0;
    size_t errors = trainer->num_errors;

    while (address_parser_data_set_next(data_set)) {
        char *language = char_array_get_string(data_set->language);
        if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) {
            language = NULL;
        }
        char *country = char_array_get_string(data_set->country);

        address_parser_context_fill(context, data_set->tokenized_str, language, country);

        bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels);

        tokenized_string_destroy(data_set->tokenized_str);
        data_set->tokenized_str = NULL;

        if (!example_success) {
            log_error("Error training example\n");
            goto exit_epoch_training_started;
        }

        examples++;
        if (examples % 1000 == 0 && examples > 0) {
            log_info("Iter %d: Did %zu examples with %llu errors\n", trainer->iterations, examples, trainer->num_errors - errors);
            errors = trainer->num_errors;
        }
    }

    success = true;

exit_epoch_training_started:
    address_parser_data_set_destroy(data_set);
    address_parser_context_destroy(context);

    return success;
}

bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations) {
    averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new();
    if (trainer == NULL) return false;

    for (uint32_t iter = 0; iter < num_iterations; iter++) {
        log_info("Doing epoch %d\n", iter);

        trainer->iterations = iter;

        log_debug("Shuffling\n");

        /*
        if (!shuffle_file(filename)) {
            log_error("Error in shuffle\n");
            averaged_perceptron_trainer_destroy(trainer);
            return false;
        }

        log_debug("Shuffle complete\n");
        */
        if (!address_parser_train_epoch(self, trainer, filename)) {
            log_error("Error in epoch\n");
            averaged_perceptron_trainer_destroy(trainer);
            return false;
        }
    }

    log_debug("Done with training, averaging weights\n");

    self->model = averaged_perceptron_trainer_finalize(trainer);

    return self->model != NULL;
}


int main(int argc, char **argv) {
    if (argc < 3) {
        printf("Usage: ./address_parser_train filename output_dir\n");
        exit(EXIT_FAILURE);
    }

#if !defined(HAVE_SHUF) && !defined(HAVE_GSHUF)
    log_error("shuf or gshuf must be installed to train address parser. Please install and reconfigure libpostal\n");
    exit(EXIT_FAILURE);
#endif

    char *filename = argv[1];
    char *output_dir = argv[2];

    if (!address_dictionary_module_setup(NULL)) {
        log_error("Could not load address dictionaries\n");
        exit(EXIT_FAILURE);
    }

    log_info("address dictionary module loaded\n");

    if (!geodb_module_setup(NULL)) {
        log_error("Could not load geodb dictionaries\n");
        exit(EXIT_FAILURE);
    }

    log_info("geodb module loaded\n");

    address_parser_t *parser = address_parser_init(filename);

    if (parser == NULL) {
        log_error("Could not initialize parser\n");
        exit(EXIT_FAILURE);
    }

    log_info("Finished initialization\n");

    if (!address_parser_train(parser, filename, DEFAULT_ITERATIONS)) {
        log_error("Error in training\n");
        exit(EXIT_FAILURE);
    }

    log_debug("Finished training\n");

    if (!address_parser_save(parser, output_dir)) {
        log_error("Error saving address parser\n");
        exit(EXIT_FAILURE);
    }

    address_parser_destroy(parser);

    address_dictionary_module_teardown();
    geodb_module_teardown();
    log_debug("Done\n");
}