[parsing] Initial commit of the address parser, training/testing, feature function, I/O

This commit is contained in:
Al
2015-11-30 14:48:13 -05:00
parent e62eb1e697
commit 89677d94a3
7 changed files with 1602 additions and 4 deletions

View File

@@ -351,13 +351,13 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude):
DROP_PROBABILITIES = {
AddressFormatter.HOUSE: 0.8,
AddressFormatter.HOUSE: 0.6,
AddressFormatter.HOUSE_NUMBER: 0.5,
AddressFormatter.ROAD: 0.5,
AddressFormatter.SUBURB: 1.0,
AddressFormatter.CITY_DISTRICT: 1.0,
AddressFormatter.SUBURB: 0.8,
AddressFormatter.CITY_DISTRICT: 0.8,
AddressFormatter.CITY: 0.6,
AddressFormatter.STATE_DISTRICT: 1.0,
AddressFormatter.STATE_DISTRICT: 0.8,
AddressFormatter.STATE: 0.8,
AddressFormatter.POSTCODE: 0.7,
AddressFormatter.COUNTRY: 0.8

748
src/address_parser.c Normal file
View File

@@ -0,0 +1,748 @@
#include "address_parser.h"
#include "address_dictionary.h"
#include "features.h"
#include "geodb.h"
#include "scanner.h"
#include "log/log.h"
#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie"
#define UNKNOWN_WORD "UNKNOWN"
static address_parser_t *parser = NULL;
/*
Allocate a new address_parser_t.
Fields are zero-initialized so address_parser_destroy() is safe to call
on a parser whose model/vocab were never assigned. Returns NULL on
allocation failure; caller owns the result.
*/
address_parser_t *address_parser_new(void) {
    // calloc (not malloc): the original left model/vocab indeterminate,
    // which made address_parser_destroy read uninitialized pointers.
    address_parser_t *parser = calloc(1, sizeof *parser);
    return parser;
}
// Accessor for the module-level singleton loaded by address_parser_load();
// returns NULL until a model has been loaded.
address_parser_t *get_address_parser(void) {
return parser;
}
/*
Persist the parser's model and vocabulary trie into output_dir as
ADDRESS_PARSER_MODEL_FILENAME / ADDRESS_PARSER_VOCAB_FILENAME.
Returns true on success, false on any failure.
*/
bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;
    bool success = false;
    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_MODEL_FILENAME);
    char *model_path = char_array_get_string(path);
    if (!averaged_perceptron_save(self->model, model_path)) {
        goto exit_path_created;
    }
    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    char *vocab_path = char_array_get_string(path);
    // Bug fix: the original returned here without destroying path, leaking it.
    if (!trie_save(self->vocab, vocab_path)) {
        goto exit_path_created;
    }
    success = true;
exit_path_created:
    char_array_destroy(path);
    return success;
}
/*
Load the module singleton parser from dir (or the default
LIBPOSTAL_ADDRESS_PARSER_DIR when dir is NULL).
Returns false if a parser is already loaded or any step fails; on
failure the singleton is left NULL so a later retry is possible.
*/
bool address_parser_load(char *dir) {
    if (parser != NULL) return false;
    if (dir == NULL) {
        dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
    }
    char_array *path = char_array_new_size(strlen(dir));
    if (path == NULL) return false;
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
    char *model_path = char_array_get_string(path);
    averaged_perceptron_t *model = averaged_perceptron_load(model_path);
    if (model == NULL) {
        char_array_destroy(path);
        return false;
    }
    parser = address_parser_new();
    // Bug fix: the original dereferenced the result of address_parser_new()
    // without checking for allocation failure.
    if (parser == NULL) {
        averaged_perceptron_destroy(model);
        char_array_destroy(path);
        return false;
    }
    parser->model = model;
    // Initialize explicitly so address_parser_destroy below never reads an
    // indeterminate pointer if the vocab fails to load.
    parser->vocab = NULL;
    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);
    char *vocab_path = char_array_get_string(path);
    trie_t *vocab = trie_load(vocab_path);
    if (vocab == NULL) {
        address_parser_destroy(parser);
        // Bug fix: the original left the global pointing at freed memory
        // (dangling singleton) after this failure.
        parser = NULL;
        char_array_destroy(path);
        return false;
    }
    parser->vocab = vocab;
    char_array_destroy(path);
    return true;
}
/*
Free a parser and the model/vocabulary it owns. Safe on NULL, and safe
when model or vocab were never assigned provided they are NULL.
*/
void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;
    averaged_perceptron_t *model = self->model;
    if (model != NULL) {
        averaged_perceptron_destroy(model);
    }
    trie_t *vocab = self->vocab;
    if (vocab != NULL) {
        trie_destroy(vocab);
    }
    free(self);
}
/*
Return the stored frequency count for word in the parser's vocabulary
trie, or 0 when absent (trie_get_data leaves count untouched on a miss,
and count is initialized to 0).
*/
static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t count = 0;
    // Fix: the boolean result was stored in an unused local (has_key);
    // discard it explicitly instead.
    (void)trie_get_data(parser->vocab, word, &count);
    return count;
}
// Normalize one token of str into the output cstring_array using the
// parser's token options (hyphen/final-period/acronym-period deletion,
// digit replacement — see ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS).
inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
}
// Latin-normalize (decompose, lowercase, transliterate to ASCII) a whole
// string. Returns NULL when the string could not be normalized.
// NOTE(review): the non-NULL result is presumably a fresh allocation the
// caller should free — confirm; callers in this file never free it.
inline char *address_parser_normalize_string(char *str) {
return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
}
/*
Destroy a parser context and every sub-array it owns. Safe on NULL and
on a partially constructed context, provided unset pointer fields are
NULL.
*/
void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;
    if (self->phrase != NULL) char_array_destroy(self->phrase);
    if (self->separators != NULL) uint32_array_destroy(self->separators);
    if (self->normalized != NULL) cstring_array_destroy(self->normalized);
    if (self->features != NULL) cstring_array_destroy(self->features);
    if (self->tokenized_str != NULL) tokenized_string_destroy(self->tokenized_str);
    if (self->address_dictionary_phrases != NULL) phrase_array_destroy(self->address_dictionary_phrases);
    if (self->address_phrase_memberships != NULL) int64_array_destroy(self->address_phrase_memberships);
    if (self->geodb_phrases != NULL) phrase_array_destroy(self->geodb_phrases);
    if (self->geodb_phrase_memberships != NULL) int64_array_destroy(self->geodb_phrase_memberships);
    free(self);
}
/*
Allocate a parser context and all of its reusable arrays.
Returns NULL on any allocation failure, with everything allocated so
far cleaned up. The struct is zero-initialized first so the cleanup
path never inspects indeterminate pointers.
*/
address_parser_context_t *address_parser_context_new(void) {
    // Bug fix: the original used malloc, so when an early allocation failed,
    // address_parser_context_destroy read uninitialized pointer fields.
    address_parser_context_t *context = calloc(1, sizeof *context);
    if (context == NULL) return NULL;
    context->phrase = char_array_new();
    if (context->phrase == NULL) goto exit_address_parser_context_allocated;
    context->separators = uint32_array_new();
    if (context->separators == NULL) goto exit_address_parser_context_allocated;
    context->normalized = cstring_array_new();
    if (context->normalized == NULL) goto exit_address_parser_context_allocated;
    context->features = cstring_array_new();
    if (context->features == NULL) goto exit_address_parser_context_allocated;
    context->tokenized_str = tokenized_string_new();
    if (context->tokenized_str == NULL) goto exit_address_parser_context_allocated;
    context->address_dictionary_phrases = phrase_array_new();
    if (context->address_dictionary_phrases == NULL) goto exit_address_parser_context_allocated;
    context->address_phrase_memberships = int64_array_new();
    if (context->address_phrase_memberships == NULL) goto exit_address_parser_context_allocated;
    context->geodb_phrases = phrase_array_new();
    if (context->geodb_phrases == NULL) goto exit_address_parser_context_allocated;
    context->geodb_phrase_memberships = int64_array_new();
    if (context->geodb_phrase_memberships == NULL) goto exit_address_parser_context_allocated;
    return context;
exit_address_parser_context_allocated:
    address_parser_context_destroy(context);
    return NULL;
}
/*
Populate the reusable context for one tokenized input string: stores
the normalized form of every token, then records for each token its
membership index (or NULL_PHRASE_MEMBERSHIP) in address-dictionary
phrases and in geodb phrases. language/country are borrowed pointers,
not copied.
*/
void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country) {
int64_t i, j;
uint32_t token_index;
char *word;
phrase_t phrase;
context->language = language;
context->country = country;
cstring_array *normalized = context->normalized;
cstring_array_clear(normalized);
char *str = tokenized_str->str;
token_array *tokens = tokenized_str->tokens;
// One normalized string per token, parallel to tokenized_str->strings
cstring_array_foreach(tokenized_str->strings, token_index, word, {
token_t token = tokens->a[token_index];
address_parser_normalize_token(normalized, str, token);
})
phrase_array_clear(context->address_dictionary_phrases);
int64_array_clear(context->address_phrase_memberships);
i = 0;
phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
int64_array *address_phrase_memberships = context->address_phrase_memberships;
// NOTE(review): this walk assumes the returned phrases are sorted by
// start position and non-overlapping — confirm in the search function.
if (search_address_dictionaries_tokens_with_phrases(str, tokens, context->language, &context->address_dictionary_phrases)) {
for (j = 0; j < address_dictionary_phrases->n; j++) {
phrase = address_dictionary_phrases->a[j];
// Tokens before this phrase belong to no phrase
for (; i < phrase.start; i++) {
int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
log_debug("token i=%lld, null phrase membership\n", i);
}
// Tokens covered by the phrase all record its index j
for (i = phrase.start; i < phrase.start + phrase.len; i++) {
log_debug("token i=%lld, phrase membership=%lld\n", i, j);
int64_array_push(address_phrase_memberships, j);
}
}
}
// Remaining tokens after the last phrase
for (; i < tokens->n; i++) {
log_debug("token i=%lld, null phrase membership\n", i);
int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
}
// Same membership computation for geodb (toponym) phrases
phrase_array_clear(context->geodb_phrases);
int64_array_clear(context->geodb_phrase_memberships);
phrase_array *geodb_phrases = context->geodb_phrases;
int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships;
i = 0;
if (search_geodb_tokens_with_phrases(str, tokens, &context->geodb_phrases)) {
for (j = 0; j < geodb_phrases->n; j++) {
phrase = geodb_phrases->a[j];
for (; i < phrase.start; i++) {
log_debug("token i=%lld, null geo phrase membership\n", i);
int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
}
for (i = phrase.start; i < phrase.start + phrase.len; i++) {
log_debug("token i=%lld, geo phrase membership=%lld\n", i, j);
int64_array_push(geodb_phrase_memberships, j);
}
}
}
for (; i < tokens->n; i++) {
log_debug("token i=%lld, null geo phrase membership\n", i);
int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP);
}
}
/*
Join the tokens covered by phrase into one space-separated string built
inside phrase_tokens (cleared first). Returns a pointer into
phrase_tokens' internal buffer — valid only until its next mutation.
*/
static inline char *get_phrase_string(tokenized_string_t *str, char_array *phrase_tokens, phrase_t phrase) {
    // Fix: removed the unused local phrase_len; use size_t for the index to
    // avoid the original's signed/unsigned comparison (int k vs size_t end).
    char_array_clear(phrase_tokens);
    size_t phrase_end = phrase.start + phrase.len;
    for (size_t k = phrase.start; k < phrase_end; k++) {
        char *w = tokenized_string_get_token(str, k);
        char_array_append(phrase_tokens, w);
        if (k < phrase_end - 1) {
            char_array_append(phrase_tokens, " ");
        }
    }
    char_array_terminate(phrase_tokens);
    return char_array_get_string(phrase_tokens);
}
/*
typedef struct adjacent_phrase {
phrase_t phrase;
uint32_t num_separators;
} adjacent_phrase_t;
#define NULL_ADJACENT_PHRASE (adjacent_phrase_t){NULL_PHRASE, 0};
static inline adjacent_phrase_t get_adjacent_phrase(int64_array *phrase_memberships, phrase_array *phrases, uint32_array *separator_positions, uint32_t i, int32_t direction) {
uint32_t *separators = separator_positions->a;
int64_t *memberships = phrase_memberships->a;
uint32_t num_strings = (uint32_t)phrase_memberships->n;
adjacent_phrase_t adjacent = NULL_ADJACENT_PHRASE;
if (direction == -1) {
for (uint32_t idx = i; idx >= 0; idx--) {
uint32_t separator = separators[idx];
if (separator > ADDRESS_SEPARATOR_NONE) {
adjacent.num_separators++;
}
int64_t membership = memberships[idx];
if (membership != NULL_PHRASE_MEMBERSHIP) {
adjacent.phrase = phrases->a[membership];
break;
}
}
} else if (direction == 1) {
for (uint32_t idx = i; idx < num_strings; idx++) {
uint32_t separator = separators[idx];
if (separator > ADDRESS_SEPARATOR_NONE) {
adjacent.num_separators++;
}
int64_t membership = memberships[idx];
if (membership != NULL_PHRASE_MEMBERSHIP) {
adjacent.phrase = phrases->a[membership];
break;
}
}
}
return adjacent;
}
*/
// Emit phrase-type features. When the phrase's type bitmask equals the
// component exactly, the phrase is unambiguous and both a bare type
// feature and a type+phrase feature are added; when the component is
// merely one of several set bits, only the type+phrase feature is added.
// NOTE(review): prev2/prev are currently unused here — presumably kept
// for signature symmetry; confirm before removing.
static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string, char *prev2, char *prev) {
if (phrase_types == component) {
log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types);
feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
} else if (phrase_types & component) {
feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
}
}
/*
address_parser_features
-----------------------
This is a feature function similar to those found in MEMM and CRF models.
Follows the signature of an ap_feature_function so it can be called
as a function pointer by the averaged perceptron model.
Parameters:
address_parser_t *self: a pointer to the address_parser struct, which contains
word frequencies and perhaps other useful corpus-wide statistics.
address_parser_context_t *context: The context struct containing:
- phrase dictionary memberships for all the tokens
- country (if known)
- language (if known)
- features array
tokenized_string_t *tokenized: the sequence of tokens for parsing
uint32_t i: the current token index
char *prev: the predicted tag at index i - 1
char *prev2: the predicted tag at index i - 2
*/
/*
Feature function for the averaged perceptron tagger (contract described
in the comment block above). Emits features for token i into
context->features: bias, the word (or its containing phrase), affix
features, geodb phrase-type features, previous-tag conjunctions, and
neighboring-word conjunctions.
*/
bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t i, char *prev, char *prev2) {
if (self == NULL || ctx == NULL) return false;
address_parser_t *parser = (address_parser_t *)self;
address_parser_context_t *context = (address_parser_context_t *)ctx;
cstring_array *features = context->features;
char *language = context->language;
char *country = context->country;
phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
int64_array *address_phrase_memberships = context->address_phrase_memberships;
phrase_array *geodb_phrases = context->geodb_phrases;
int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships;
cstring_array *normalized = context->normalized;
// NOTE(review): separators, country and word_len (below) are currently
// unused in this function.
uint32_array *separators = context->separators;
cstring_array_clear(features);
// Bias unit, acts as an intercept
feature_array_add(features, 1, "bias");
char *original_word = tokenized_string_get_token(tokenized, i);
token_t token = tokenized->tokens->a[i];
// Neighbor indices; widened below to span a whole phrase when token i
// falls inside a multi-token phrase
ssize_t last_index = (ssize_t)i - 1;
ssize_t next_index = (ssize_t)i + 1;
char *word = cstring_array_get_string(normalized, i);
if (word == NULL) {
log_error("got NULL word at %d\n", i);
return false;
}
size_t word_len = strlen(word);
char *current_word = word;
log_debug("word=%s\n", word);
expansion_value_t expansion;
phrase_t phrase = NULL_PHRASE;
char *phrase_string = NULL;
char *geo_phrase_string = NULL;
int64_t address_phrase_index = address_phrase_memberships->a[i];
char_array *phrase_tokens = context->phrase;
// Address dictionary phrases
if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
phrase = address_dictionary_phrases->a[address_phrase_index];
log_debug("phrase\n");
// Treat the entire phrase as a unit: neighbors become the tokens just
// outside the phrase boundaries
last_index = (ssize_t)phrase.start - 1;
next_index = (ssize_t)phrase.start + phrase.len;
expansion.value = phrase.data;
uint32_t address_phrase_types = expansion.components;
log_debug("expansion=%d\n", expansion.value);
if (address_phrase_types & (ADDRESS_STREET | ADDRESS_NAME)) {
// Use the joined phrase string in place of the single word
phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);
if (phrase_string != NULL) {
word = phrase_string;
}
log_debug("phrase_string=%s\n", phrase_string);
add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev);
add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev);
}
}
// Prefixes like hinter, etc.
phrase_t prefix_phrase = search_address_dictionaries_prefix(original_word, token.len, language);
if (prefix_phrase.len > 0) {
expansion.value = prefix_phrase.data;
// Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
// (x ^ ADDRESS_ANY is nonzero unless the components equal ADDRESS_ANY exactly)
if (expansion.components ^ ADDRESS_ANY) {
char_array_clear(phrase_tokens);
char_array_add_len(phrase_tokens, original_word, prefix_phrase.len);
char *prefix = char_array_get_string(phrase_tokens);
log_debug("got prefix: %s\n", prefix);
feature_array_add(features, 2, "prefix", prefix);
}
}
// Suffixes like straße, etc.
phrase_t suffix_phrase = search_address_dictionaries_suffix(original_word, token.len, language);
if (suffix_phrase.len > 0) {
expansion.value = suffix_phrase.data;
if (expansion.components & ADDRESS_STREET) {
char_array_clear(phrase_tokens);
char_array_add_len(phrase_tokens, original_word + (token.len - suffix_phrase.len), suffix_phrase.len);
char *suffix = char_array_get_string(phrase_tokens);
log_debug("got suffix: %s\n", suffix);
feature_array_add(features, 2, "suffix", suffix);
}
}
int64_t geodb_phrase_index = geodb_phrase_memberships->a[i];
phrase = NULL_PHRASE;
geodb_value_t geo;
// GeoDB phrases
if (geodb_phrase_index != NULL_PHRASE_MEMBERSHIP) {
phrase = geodb_phrases->a[geodb_phrase_index];
geo_phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);
geo.value = phrase.data;
uint32_t geodb_phrase_types = geo.components;
// NOTE(review): intent appears to be "only widen the neighbor window if
// the geo phrase is at least as wide as any dictionary phrase applied
// above" — confirm the boundary condition (note the -1 on the right).
if (last_index <= (ssize_t)phrase.start - 1 && next_index >= (ssize_t)phrase.start + phrase.len - 1) {
last_index = (ssize_t)phrase.start - 1;
next_index = (ssize_t)phrase.start + phrase.len;
// Replace word with the phrase unless the phrase is exactly a postcode
if (geo_phrase_string != NULL && geodb_phrase_types ^ ADDRESS_POSTAL_CODE) {
word = geo_phrase_string;
}
}
if (geodb_phrase_types ^ ADDRESS_ANY) {
add_phrase_features(features, geodb_phrase_types, ADDRESS_LOCALITY, "city", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN1, "admin1", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN2, "admin2", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN3, "admin3", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN4, "admin4", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN_OTHER, "admin other", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_NEIGHBORHOOD, "neighborhood", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_COUNTRY, "country", geo_phrase_string, prev2, prev);
add_phrase_features(features, geodb_phrase_types, ADDRESS_POSTAL_CODE, "postal code", geo_phrase_string, prev2, prev);
}
}
uint32_t word_freq = word_vocab_frequency(parser, word);
if (phrase_string == NULL && geo_phrase_string == NULL) {
if (word_freq > 0) {
// The individual word
feature_array_add(features, 2, "word", word);
} else {
log_debug("word not in vocab: %s\n", original_word);
word = UNKNOWN_WORD;
}
}
if (prev != NULL) {
// Previous tag and current word
feature_array_add(features, 3, "i-1 tag+word", prev, current_word);
feature_array_add(features, 2, "i-1 tag", prev);
if (prev2 != NULL) {
// Previous two tags and current word
feature_array_add(features, 4, "i-2 tag+i-1 tag+word", prev2, prev, current_word);
feature_array_add(features, 3, "i-2 tag+i-1 tag", prev2, prev);
}
}
if (last_index >= 0) {
char *prev_word = cstring_array_get_string(normalized, last_index);
uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
if (prev_word_freq == 0) {
prev_word = UNKNOWN_WORD;
}
// Previous word
feature_array_add(features, 2, "i-1 word", prev_word);
// Previous tag + previous word
// NOTE(review): prev is not guarded against NULL here, unlike the block
// above — confirm feature_array_add tolerates a NULL argument. Also
// i - 1 is computed in unsigned arithmetic and wraps when i == 0.
if (last_index == i - 1) {
feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word);
}
// Previous word and current word
feature_array_add(features, 3, "i-1 word+word", prev_word, word);
}
size_t num_tokens = tokenized->tokens->n;
if (next_index < num_tokens) {
char *next_word = cstring_array_get_string(normalized, next_index);
uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
if (next_word_freq == 0) {
next_word = UNKNOWN_WORD;
}
// Next word e.g. if the current word is unknown and the next word is "street"
feature_array_add(features, 2, "i+1 word", next_word);
// Current word and next word
feature_array_add(features, 3, "word+i+1 word", word, next_word);
}
return true;
}
/*
Allocate an empty parse response. Zero-initialized so that
address_parser_response_destroy is safe even when the caller never
assigns components/labels. Returns NULL on allocation failure.
*/
address_parser_response_t *address_parser_response_new(void) {
    // calloc fixes reads of indeterminate num_components/components/labels
    // in address_parser_response_destroy for never-populated responses.
    address_parser_response_t *response = calloc(1, sizeof *response);
    return response;
}
/*
Free a parse response: every component string, every label string, the
two arrays themselves, then the struct. Safe on NULL.
*/
void address_parser_response_destroy(address_parser_response_t *self) {
    if (self == NULL) return;
    // size_t loop index matches num_components' declared type (the original
    // used int, a signed/unsigned mismatch); NULL checks hoisted out of the
    // loop instead of being re-tested on every iteration.
    if (self->components != NULL) {
        for (size_t i = 0; i < self->num_components; i++) {
            free(self->components[i]);
        }
        free(self->components);
    }
    if (self->labels != NULL) {
        for (size_t i = 0; i < self->num_components; i++) {
            free(self->labels[i]);
        }
        free(self->labels);
    }
    free(self);
}
/*
Parse an address string into labeled components. Requires the module
singleton parser to have been loaded (address_parser_load). Returns a
newly allocated response (freed with address_parser_response_destroy)
or NULL on failure.
*/
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) {
if (address == NULL || context == NULL) return NULL;
char *normalized = address_parser_normalize_string(address);
bool is_normalized = normalized != NULL;
if (!is_normalized) {
// Fall back to the raw input when normalization fails
normalized = address;
}
// NOTE(review): when normalization succeeds, normalized is presumably a
// fresh allocation that is never freed in this function — confirm and free.
address_parser_t *parser = get_address_parser();
averaged_perceptron_t *model = parser->model;
token_array *tokens = tokenize(normalized);
// NOTE(review): this char_array is allocated but never used or destroyed
// (and its name shadows the token_array type) — looks like dead code.
char_array *token_array = char_array_new();
tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);
// Drop separator/ignorable tokens, recording a separator flag per kept
// token so the feature function can see field boundaries
for (int i = 0; i < tokens->n; i++) {
token_t token = tokens->a[i];
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
continue;
}
tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
}
// NOTE(review): context->separators is appended to but never cleared here —
// confirm it is reset elsewhere, otherwise it grows across parses.
address_parser_context_fill(context, tokenized_str, language, country);
cstring_array *token_labels = cstring_array_new_size(tokens->n);
char *prev_label = NULL;
address_parser_response_t *response = NULL;
if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) {
response = address_parser_response_new();
size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
cstring_array *labels = cstring_array_new_size(num_strings);
cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);
// Merge consecutive tokens sharing a predicted label into one component
for (int i = 0; i < num_strings; i++) {
char *str = tokenized_string_get_token(tokenized_str, i);
char *label = cstring_array_get_string(token_labels, i);
if (prev_label == NULL || strcmp(label, prev_label) != 0) {
cstring_array_add_string(labels, label);
cstring_array_start_token(components);
}
if (prev_label != NULL && strcmp(label, prev_label) == 0) {
cstring_array_cat_string(components, " ");
cstring_array_cat_string(components, str);
} else {
cstring_array_append_string(components, str);
cstring_array_terminate(components);
}
prev_label = label;
}
response->num_components = cstring_array_num_strings(components);
// NOTE(review): if cstring_array_to_strings does not take ownership of
// the array shells, labels/components leak here — confirm its contract.
response->components = cstring_array_to_strings(components);
response->labels = cstring_array_to_strings(labels);
}
token_array_destroy(tokens);
tokenized_string_destroy(tokenized_str);
cstring_array_destroy(token_labels);
return response;
}
// Idempotent module init: loads the singleton parser from dir on first
// call; subsequent calls are no-ops returning true.
bool address_parser_module_setup(char *dir) {
if (parser == NULL) {
return address_parser_load(dir);
}
return true;
}
// Destroy the singleton parser (if loaded) and reset it so the module
// can be set up again.
void address_parser_module_teardown(void) {
if (parser != NULL) {
address_parser_destroy(parser);
}
parser = NULL;
}

134
src/address_parser.h Normal file
View File

@@ -0,0 +1,134 @@
/*
address_parser.h
----------------
International address parser, designed to use OSM training data,
over 40M addresses formatted with the OpenCage address formatting
templates: https://github.com/OpenCageData/address-formatting.
This is a sequence modeling problem similar to e.g. part-of-speech
tagging, named entity recognition, etc. in which we have a sequence
of inputs (words/tokens) and want to predict a sequence of outputs
(labeled part-of-address tags). This is a supervised learning model
and the training data is created in the Python geodata package
included with this repo. Example record:
en us 123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode
Where the fields are: {language, country, tagged address}.
After training, the address parser can take as input a tokenized
input string e.g. "123 Fake Street Brooklyn NY 12345" and parse
it into:
{
"house_number": "123",
"road": "Fake Street",
"city": "Brooklyn",
"state": "NY",
"postcode": "12345"
}
The model used is a greedy averaged perceptron rather than something
like a CRF since there's ample training data from OSM and the accuracy
on this task is already very high with the simpler model.
However, it is still worth investigating CRFs as they are relatively fast
at prediction time for a small number of tags, can often achieve better
performance and are robust to correlated features, which may not be true
with the general error-driven averaged perceptron.
*/
#ifndef ADDRESS_PARSER_H
#define ADDRESS_PARSER_H
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h"
#include "bloom.h"
#include "libpostal_config.h"
#include "collections.h"
#include "normalize.h"
#include "string_utils.h"
#define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat"
#define NULL_PHRASE_MEMBERSHIP -1
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII
#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS
#define ADDRESS_SEPARATOR_NONE 0
#define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0
#define ADDRESS_SEPARATOR_FIELD 1 << 1
#define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH )
// Bug fix: the macro body referenced `token.type`, silently capturing a
// variable named `token` at every call site instead of using its parameter
// (it only compiled because all current callers pass `token.type`).
#define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token_type) == INVALID_CHAR || (token_type) == PERIOD)
#define SEPARATOR_LABEL "sep"
#define FIELD_SEPARATOR_LABEL "fsep"
// Reusable per-parse working state, filled by address_parser_context_fill.
// Allocate once with address_parser_context_new and reuse across parses.
typedef struct address_parser_context {
char *language;  // borrowed pointer, may be NULL
char *country;   // borrowed pointer, may be NULL
cstring_array *features;  // feature strings emitted for the current token
char_array *phrase;  // scratch buffer for joining multi-token phrases
uint32_array *separators;  // per-token ADDRESS_SEPARATOR_* flags
cstring_array *normalized;  // normalized form of each token
phrase_array *address_dictionary_phrases;
// Index in address_dictionary_phrases or -1
int64_array *address_phrase_memberships;
phrase_array *geodb_phrases;
// Index in geodb_phrases or -1
int64_array *geodb_phrase_memberships;
tokenized_string_t *tokenized_str;
} address_parser_context_t;
// Result of address_parser_parse: num_components parallel entries in
// components (the text spans) and labels (their predicted tags). Both
// arrays and their strings are owned and freed by
// address_parser_response_destroy.
typedef struct address_parser_response {
size_t num_components;
char **components;
char **labels;
} address_parser_response_t;
// Can add other gazetteers as well
// The parser model: an averaged perceptron plus a word-frequency
// vocabulary trie consulted by the feature function.
typedef struct address_parser {
averaged_perceptron_t *model;
trie_t *vocab;
} address_parser_t;
// General usage
address_parser_t *address_parser_new(void);
address_parser_t *get_address_parser(void);
bool address_parser_load(char *dir);
void address_parser_response_destroy(address_parser_response_t *self);
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context);
void address_parser_destroy(address_parser_t *self);
char *address_parser_normalize_string(char *str);
void address_parser_normalize_token(cstring_array *array, char *str, token_t token);
address_parser_context_t *address_parser_context_new(void);
void address_parser_context_destroy(address_parser_context_t *self);
void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country);
// Feature function
bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2);
// I/O methods
bool address_parser_load(char *dir);
bool address_parser_save(address_parser_t *self, char *output_dir);
// Module setup/teardown
bool address_parser_module_setup(char *dir);
void address_parser_module_teardown(void);
#endif

180
src/address_parser_io.c Normal file
View File

@@ -0,0 +1,180 @@
#include "address_parser_io.h"
/*
Open a training/test data file and allocate the per-record buffers.
Returns NULL when the file cannot be opened or allocation fails; free
with address_parser_data_set_destroy.
*/
address_parser_data_set_t *address_parser_data_set_init(char *filename) {
    // Bug fix: the original dereferenced the malloc result without checking
    // it; calloc also zeroes fields so destroy is safe on partial init.
    address_parser_data_set_t *data_set = calloc(1, sizeof *data_set);
    if (data_set == NULL) return NULL;
    data_set->f = fopen(filename, "r");
    if (data_set->f == NULL) {
        free(data_set);
        return NULL;
    }
    // NOTE(review): the sub-allocations below are unchecked; destroy handles
    // NULL fields, but callers will crash on a NULL tokens/labels array.
    data_set->tokens = token_array_new();
    data_set->tokenized_str = NULL;
    data_set->labels = cstring_array_new();
    data_set->separators = uint32_array_new();
    data_set->language = char_array_new_size(MAX_LANGUAGE_LEN);
    data_set->country = char_array_new_size(MAX_COUNTRY_CODE_LEN);
    return data_set;
}
/*
Tokenize one training line of slash-delimited token/label pairs
("123/house_number Fake/road .../fsep ..."). Fills tokens and labels in
parallel, plus one separator flag per kept token (upgraded in place by
following sep/fsep pseudo-tokens). Returns false on malformed input.
*/
bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels) {
size_t count = 0;
token_t token;
uint32_t i = 0;
char *str = NULL;
cstring_array *pairs = cstring_array_split(input, " ", 1, &count);
size_t num_pairs = cstring_array_num_strings(pairs);
char *label = NULL;
// First populate token array
cstring_array_foreach(pairs, i, str, {
size_t pair_len = strlen(str);
// The label follows the LAST slash, so tokens may themselves contain '/'
char *last_separator = strrchr(str, (int)'/');
if (last_separator == NULL) {
log_error("All tokens must be delimited with '/'\n");
// NOTE(review): returning from inside the foreach macro leaks `pairs` —
// confirm and destroy it before this return.
return false;
}
uint32_t last_separator_index = last_separator - str;
label = str + last_separator_index + 1;
uint32_t last_separator_type;
// sep/fsep pseudo-tokens replace the separator flag recorded for the
// previous real token rather than adding a token of their own.
// NOTE(review): the popped value (last_separator_type) is discarded —
// presumably intentional overwrite semantics; confirm.
if (strcmp(label, FIELD_SEPARATOR_LABEL) == 0) {
last_separator_type = uint32_array_pop(separators);
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD | ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (strcmp(label, SEPARATOR_LABEL) == 0) {
last_separator_type = uint32_array_pop(separators);
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
}
// Token text is the pair minus its "/label" tail; offsets index into input
token.offset = pairs->indices->a[i];
token.len = last_separator_index;
scanner_t scanner = scanner_from_string(input + token.offset, token.len);
token.type = scan_token(&scanner);
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
// shouldn't happen but just in case
continue;
} else {
uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
}
cstring_array_add_string(labels, label);
token_array_push(tokens, token);
})
cstring_array_destroy(pairs);
return true;
}
/*
Read the next training record (language \t country \t tagged address)
into data_set, replacing its per-record buffers. Returns false at EOF
or when the line could not be tokenized.
*/
bool address_parser_data_set_next(address_parser_data_set_t *data_set) {
    if (data_set == NULL) return false;
    char *line = file_getline(data_set->f);
    if (line == NULL) {
        return false;
    }
    size_t token_count;
    cstring_array *fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);
    free(line);
    if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) {
        // Fix: typo in the log message ("ected" -> "expected").
        // NOTE(review): this logs but still proceeds with the wrong number
        // of fields, so the field lookups below may misbehave — consider
        // bailing out here.
        log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count);
    }
    char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE);
    char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY);
    char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS);
    log_debug("Doing: %s\n", address);
    char *normalized = address_parser_normalize_string(address);
    bool is_normalized = normalized != NULL;
    if (!is_normalized) {
        log_debug("could not normalize\n");
        normalized = strdup(address);
    }
    log_debug("Normalized: %s\n", normalized);
    token_array *tokens = data_set->tokens;
    cstring_array *labels = data_set->labels;
    uint32_array *separators = data_set->separators;
    token_array_clear(tokens);
    cstring_array_clear(labels);
    uint32_array_clear(separators);
    // (Fix: removed the unused `len = strlen(normalized)` local.)
    char_array_clear(data_set->country);
    char_array_add(data_set->country, country);
    char_array_clear(data_set->language);
    char_array_add(data_set->language, language);
    tokenized_string_t *tokenized_str = NULL;
    if (address_parser_data_set_tokenize_line(normalized, tokens, separators, labels)) {
        // Add tokens as discrete strings for easier use in feature functions
        bool copy_tokens = true;
        tokenized_str = tokenized_string_from_tokens(normalized, tokens, copy_tokens);
    }
    // NOTE(review): `normalized` (allocated above) and the previous
    // data_set->tokenized_str are never freed — both leak once per record.
    // tokens still hold offsets into `normalized`, so confirm lifetime
    // expectations before freeing here.
    data_set->tokenized_str = tokenized_str;
    cstring_array_destroy(fields);
    return tokenized_str != NULL;
}
/*
Close the data file and release every buffer the data set owns. Safe on
NULL and on partially-initialized members.
*/
void address_parser_data_set_destroy(address_parser_data_set_t *self) {
    if (self == NULL) return;
    if (self->f != NULL) {
        fclose(self->f);
    }
    if (self->tokens != NULL) {
        token_array_destroy(self->tokens);
    }
    // Leak fix: the last record's tokenized_str (assigned by
    // address_parser_data_set_next) was never released.
    if (self->tokenized_str != NULL) {
        tokenized_string_destroy(self->tokenized_str);
    }
    if (self->labels != NULL) {
        cstring_array_destroy(self->labels);
    }
    if (self->separators != NULL) {
        uint32_array_destroy(self->separators);
    }
    if (self->language != NULL) {
        char_array_destroy(self->language);
    }
    if (self->country != NULL) {
        char_array_destroy(self->country);
    }
    free(self);
}

40
src/address_parser_io.h Normal file
View File

@@ -0,0 +1,40 @@
#ifndef ADDRESS_PARSER_IO_H
#define ADDRESS_PARSER_IO_H
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "address_parser.h"
#include "collections.h"
#include "file_utils.h"
#include "scanner.h"
#include "string_utils.h"
#define AMBIGUOUS_LANGUAGE "xxx"
#define UNKNOWN_LANGUAGE "unk"
// Column order of the tab-separated training file; the final enumerator
// doubles as the expected field count per line.
enum address_parser_training_data_fields {
ADDRESS_PARSER_FIELD_LANGUAGE,
ADDRESS_PARSER_FIELD_COUNTRY,
ADDRESS_PARSER_FIELD_ADDRESS,
ADDRESS_PARSER_FILE_NUM_TOKENS
};
// Streaming reader over a training/test file; the buffers below are
// refilled for each record by address_parser_data_set_next.
typedef struct address_parser_data_set {
FILE *f;
token_array *tokens;  // tokens of the current record
tokenized_string_t *tokenized_str;  // current record's tokens as strings
cstring_array *labels;  // gold label per token, parallel to tokens
uint32_array *separators;  // ADDRESS_SEPARATOR_* flag per token
char_array *language;
char_array *country;
} address_parser_data_set_t;
address_parser_data_set_t *address_parser_data_set_init(char *filename);
bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels);
bool address_parser_data_set_next(address_parser_data_set_t *data_set);
void address_parser_data_set_destroy(address_parser_data_set_t *self);
#endif

196
src/address_parser_test.c Normal file
View File

@@ -0,0 +1,196 @@
#include "address_parser.h"
#include "address_parser_io.h"
#include "address_dictionary.h"
#include "averaged_perceptron_trainer.h"
#include "collections.h"
#include "constants.h"
#include "file_utils.h"
#include "geodb.h"
#include "normalize.h"
#include "log/log.h"
// Accumulated evaluation statistics for one test run.
typedef struct address_parser_test_results {
    size_t num_errors;               // token-level mispredictions
    size_t num_predictions;          // total tokens predicted
    size_t num_address_errors;       // addresses with at least one token error
    size_t num_address_predictions;  // total addresses evaluated
    uint32_t *confusion;             // row-major [predicted * num_classes + truth] error counts
} address_parser_test_results_t;
// Linear scan over the model's class strings for an exact match on `name`.
// Returns the class index, or num_classes as an out-of-range "not found"
// sentinel — callers must range-check before using the result as an index.
uint32_t get_class_index(address_parser_t *parser, char *name) {
    uint32_t i;
    char *str;
    cstring_array_foreach(parser->model->classes, i, str, {
        if (strcmp(name, str) == 0) {
            return i;
        }
    })
    return parser->model->num_classes;
}
#define EMPTY_ADDRESS_PARSER_TEST_RESULT (address_parser_test_results_t){0, 0, 0, 0, NULL}
/*
Evaluates the trained parser against the labeled examples in `filename`,
accumulating token-level and address-level error counts plus a
num_classes x num_classes confusion matrix into `result`.

On success result->confusion is a calloc'd matrix owned by the caller
(free() it). Returns false on any setup failure.

Fixes vs. original: returned NULL from a bool function; leaked the
confusion matrix when data-set init failed; never checked calloc; wrote
past the end of the confusion matrix when get_class_index() returned its
num_classes "not found" sentinel; carried five unused locals.
*/
bool address_parser_test(address_parser_t *parser, char *filename, address_parser_test_results_t *result) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return false;
    }

    uint32_t num_classes = parser->model->num_classes;
    result->confusion = calloc((size_t)num_classes * num_classes, sizeof(uint32_t));
    if (result->confusion == NULL) {
        log_error("Error allocating confusion matrix\n");
        return false;
    }

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);
    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        free(result->confusion);
        result->confusion = NULL;
        return false;
    }

    address_parser_context_t *context = address_parser_context_new();

    while (address_parser_data_set_next(data_set)) {
        char *language = char_array_get_string(data_set->language);
        // "unk"/"xxx" mean the language is unknown or ambiguous; predict without it
        if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) {
            language = NULL;
        }

        char *country = char_array_get_string(data_set->country);

        address_parser_context_fill(context, data_set->tokenized_str, language, country);

        cstring_array *token_labels = cstring_array_new_size(data_set->tokenized_str->strings->str->n);

        // Snapshot so we can tell whether this address contributed any token errors
        size_t starting_errors = result->num_errors;

        if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, token_labels, &address_parser_features, data_set->tokenized_str)) {
            uint32_t i;
            char *predicted;
            cstring_array_foreach(token_labels, i, predicted, {
                char *truth = cstring_array_get_string(data_set->labels, i);
                if (strcmp(predicted, truth) != 0) {
                    result->num_errors++;
                    uint32_t predicted_index = get_class_index(parser, predicted);
                    uint32_t truth_index = get_class_index(parser, truth);
                    // get_class_index returns num_classes when the label is
                    // unknown; guard to avoid writing out of bounds
                    if (predicted_index < num_classes && truth_index < num_classes) {
                        result->confusion[predicted_index * num_classes + truth_index]++;
                    }
                }
                result->num_predictions++;
            })
        }

        cstring_array_destroy(token_labels);

        if (result->num_errors > starting_errors) {
            result->num_address_errors++;
        }
        result->num_address_predictions++;

        if (result->num_address_predictions % 1000 == 0) {
            log_info("Did %zu examples\n", result->num_address_predictions);
        }

        // Each example's tokenized string is owned by this loop
        tokenized_string_destroy(data_set->tokenized_str);
        data_set->tokenized_str = NULL;
    }

    address_parser_data_set_destroy(data_set);
    address_parser_context_destroy(context);

    return true;
}
/*
Test harness entry point.
Usage: ./address_parser_test filename [parser_dir]

Loads the address dictionaries, geodb and a trained parser model, runs
the evaluation over `filename`, then prints token/address error rates
and a sparse confusion matrix.

Fixes vs. original: error rates were printed as raw fractions under a
"%" label (missing *100); divided by zero on an empty test file; used
%d for uint32_t; reported "Error in training" from the test harness;
no explicit exit status.
*/
int main(int argc, char **argv) {
    char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR;

    if (argc < 2) {
        log_error("Usage: ./address_parser_test filename [parser_dir]\n");
        exit(EXIT_FAILURE);
    }

    char *filename = argv[1];

    if (argc > 2) {
        address_parser_dir = argv[2];
    }

    if (!address_dictionary_module_setup(NULL)) {
        log_error("Could not load address dictionaries\n");
        exit(EXIT_FAILURE);
    }
    log_info("address dictionary module loaded\n");

    if (!geodb_module_setup(NULL)) {
        log_error("Could not load geodb dictionaries\n");
        exit(EXIT_FAILURE);
    }
    log_info("geodb module loaded\n");

    if (!address_parser_load(address_parser_dir)) {
        log_error("Could not initialize parser\n");
        exit(EXIT_FAILURE);
    }
    log_info("Finished initialization\n");

    address_parser_t *parser = get_address_parser();

    address_parser_test_results_t results = EMPTY_ADDRESS_PARSER_TEST_RESULT;

    if (!address_parser_test(parser, filename, &results)) {
        log_error("Error in testing\n");
        exit(EXIT_FAILURE);
    }

    // Convert fractions to percentages, guarding against an empty test set
    double token_error_pct = results.num_predictions > 0
        ? (double)results.num_errors / results.num_predictions * 100.0 : 0.0;
    double address_error_pct = results.num_address_predictions > 0
        ? (double)results.num_address_errors / results.num_address_predictions * 100.0 : 0.0;

    printf("Errors: %zu / %zu (%f%%)\n", results.num_errors, results.num_predictions, token_error_pct);
    printf("Addresses: %zu / %zu (%f%%)\n\n", results.num_address_errors, results.num_address_predictions, address_error_pct);

    printf("Confusion matrix:\n\n");

    // Print only off-diagonal, non-zero cells: (predicted, truth): count
    uint32_t num_classes = parser->model->num_classes;
    for (uint32_t i = 0; i < num_classes; i++) {
        for (uint32_t j = 0; j < num_classes; j++) {
            if (i == j) {
                continue;
            }
            uint32_t class_errors = results.confusion[i * num_classes + j];
            if (class_errors > 0) {
                char *predicted = cstring_array_get_string(parser->model->classes, i);
                char *truth = cstring_array_get_string(parser->model->classes, j);
                // cast: uint32_t is not guaranteed to match %d's int
                printf("(%s, %s): %u\n", predicted, truth, (unsigned int)class_errors);
            }
        }
    }

    free(results.confusion);

    address_parser_module_teardown();
    address_dictionary_module_teardown();
    geodb_module_teardown();

    return EXIT_SUCCESS;
}

300
src/address_parser_train.c Normal file
View File

@@ -0,0 +1,300 @@
#include "address_parser.h"
#include "address_parser_io.h"
#include "address_dictionary.h"
#include "averaged_perceptron_trainer.h"
#include "collections.h"
#include "constants.h"
#include "file_utils.h"
#include "geodb.h"
#include "shuffle.h"
#include "log/log.h"
// Training
#define DEFAULT_ITERATIONS 5
#define MIN_VOCAB_COUNT 5
/*
Builds a new address parser by streaming once over the training file in
`filename` and counting normalized-token frequencies. Tokens seen fewer
than MIN_VOCAB_COUNT times are pruned before the vocabulary is frozen
into a trie.

Returns a heap-allocated parser on success (caller frees with
address_parser_destroy), NULL on any failure.

Fixes vs. original: returned `false` from a pointer-returning function
(2x); leaked the parser, data set, scratch buffer and in-flight
tokenized string on error paths; read kh_key() before checking
kh_exist() in the pruning loop; used %d for uint32_t.
*/
address_parser_t *address_parser_init(char *filename) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return NULL;
    }

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);
    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        return NULL;
    }

    address_parser_t *parser = address_parser_new();
    if (parser == NULL) {
        log_error("Error allocating parser\n");
        address_parser_data_set_destroy(data_set);
        return NULL;
    }

    // Maps normalized token (strdup'd, owned by the hash) => occurrence count
    khash_t(str_uint32) *vocab = kh_init(str_uint32);
    if (vocab == NULL) {
        log_error("Error allocating vocab hash\n");
        address_parser_data_set_destroy(data_set);
        address_parser_destroy(parser);
        return NULL;
    }

    khiter_t k;
    uint32_t vocab_size = 0;
    size_t examples = 0;

    const char *word;
    uint32_t count;
    uint32_t i;
    char *token;
    char *normalized;

    // Reusable scratch buffer for each token's normalized form
    char_array *token_array = char_array_new();

    while (address_parser_data_set_next(data_set)) {
        tokenized_string_t *tokenized_str = data_set->tokenized_str;
        if (tokenized_str == NULL) {
            log_error("tokenized str is NULL\n");
            goto exit_failure;
        }

        char *str = tokenized_str->str;

        cstring_array_foreach(tokenized_str->strings, i, token, {
            token_t t = tokenized_str->tokens->a[i];
            char_array_clear(token_array);
            add_normalized_token(token_array, str, t, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
            if (token_array->n == 0) {
                // token normalized away to nothing; skip it
                continue;
            }

            normalized = char_array_get_string(token_array);

            k = kh_get(str_uint32, vocab, normalized);
            if (k == kh_end(vocab)) {
                int ret;
                char *key = strdup(normalized);
                if (key == NULL) {
                    log_error("strdup failed\n");
                    tokenized_string_destroy(tokenized_str);
                    data_set->tokenized_str = NULL;
                    goto exit_failure;
                }
                k = kh_put(str_uint32, vocab, key, &ret);
                if (ret < 0) {
                    log_error("Error in kh_put\n");
                    free(key);
                    tokenized_string_destroy(tokenized_str);
                    data_set->tokenized_str = NULL;
                    goto exit_failure;
                }
                kh_value(vocab, k) = 1;
                vocab_size++;
            } else {
                kh_value(vocab, k)++;
            }
        })

        tokenized_string_destroy(tokenized_str);
        data_set->tokenized_str = NULL;

        examples++;
        if (examples % 10000 == 0) {
            log_info("Counting vocab: did %zu examples\n", examples);
        }
    }

    log_debug("Done with vocab, total size=%u\n", (unsigned int)vocab_size);

    // Prune rare tokens. kh_exist must be checked before touching a slot;
    // kh_key of a vacant bucket is garbage.
    for (k = kh_begin(vocab); k != kh_end(vocab); ++k) {
        if (!kh_exist(vocab, k)) {
            continue;
        }
        if (kh_value(vocab, k) < MIN_VOCAB_COUNT) {
            char *rare_word = (char *)kh_key(vocab, k);
            kh_del(str_uint32, vocab, k);
            free(rare_word);
        }
    }

    parser->vocab = trie_new_from_hash(vocab);

    // The trie holds its own copies of the keys (the original code also
    // freed them here); release the hash and every surviving key.
    for (k = kh_begin(vocab); k != kh_end(vocab); ++k) {
        if (!kh_exist(vocab, k)) {
            continue;
        }
        free((char *)kh_key(vocab, k));
    }
    kh_destroy(str_uint32, vocab);
    char_array_destroy(token_array);
    address_parser_data_set_destroy(data_set);

    if (parser->vocab == NULL) {
        log_error("Error initializing vocabulary\n");
        address_parser_destroy(parser);
        return NULL;
    }

    return parser;

exit_failure:
    // Single cleanup path for all mid-count failures
    kh_foreach(vocab, word, count, {
        free((char *)word);
    })
    kh_destroy(str_uint32, vocab);
    char_array_destroy(token_array);
    address_parser_data_set_destroy(data_set);
    address_parser_destroy(parser);
    return NULL;
}
/*
Runs one training epoch: streams every labeled example in `filename`
through the averaged perceptron trainer. Returns true only if every
example trained successfully.

Fixes vs. original: removed a dead duplicated `if (!example_success)`
check (the flag could not have changed after the first test); the
in-flight tokenized string was leaked on the error path; dropped the
unused `logged` local; checks the context allocation.
*/
bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trainer_t *trainer, char *filename) {
    if (filename == NULL) {
        log_error("Filename was NULL\n");
        return false;
    }

    address_parser_data_set_t *data_set = address_parser_data_set_init(filename);
    if (data_set == NULL) {
        log_error("Error initializing data set\n");
        return false;
    }

    address_parser_context_t *context = address_parser_context_new();
    if (context == NULL) {
        log_error("Error allocating parser context\n");
        address_parser_data_set_destroy(data_set);
        return false;
    }

    bool success = false;
    size_t examples = 0;
    // Errors at the start of the current reporting window
    size_t errors = trainer->num_errors;

    while (address_parser_data_set_next(data_set)) {
        char *language = char_array_get_string(data_set->language);
        // "unk"/"xxx" mean the language is unknown or ambiguous; train without it
        if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) {
            language = NULL;
        }

        char *country = char_array_get_string(data_set->country);

        address_parser_context_fill(context, data_set->tokenized_str, language, country);

        bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels);

        if (!example_success) {
            log_error("Error training example\n");
            // free the example before bailing out (was leaked on this path)
            tokenized_string_destroy(data_set->tokenized_str);
            data_set->tokenized_str = NULL;
            goto exit_epoch_training_started;
        }

        tokenized_string_destroy(data_set->tokenized_str);
        data_set->tokenized_str = NULL;

        examples++;
        if (examples % 1000 == 0) {
            log_info("Iter %d: Did %zu examples with %llu errors\n", trainer->iterations, examples, trainer->num_errors - errors);
            errors = trainer->num_errors;
        }
    }

    success = true;

exit_epoch_training_started:
    address_parser_data_set_destroy(data_set);
    address_parser_context_destroy(context);
    return success;
}
/*
Trains the parser's averaged perceptron model for `num_iterations`
epochs over the data in `filename`, then averages the weights into
self->model. Returns true on success.

NOTE(review): per-epoch shuffling is currently disabled (commented
block below), so every epoch sees the file in the same order — confirm
whether shuffle_file should be re-enabled.

Fixes vs. original: checks the trainer allocation, and no longer
reports success when finalize yields a NULL model; %u for uint32_t iter.
*/
bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations) {
    averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new();
    if (trainer == NULL) {
        log_error("Error allocating trainer\n");
        return false;
    }

    for (uint32_t iter = 0; iter < num_iterations; iter++) {
        log_info("Doing epoch %u\n", (unsigned int)iter);

        trainer->iterations = iter;

        log_debug("Shuffling\n");
        /*
        TODO(review): re-enable once shuffle_file is reliable
        if (!shuffle_file(filename)) {
            log_error("Error in shuffle\n");
            averaged_perceptron_trainer_destroy(trainer);
            return false;
        }
        log_debug("Shuffle complete\n");
        */

        if (!address_parser_train_epoch(self, trainer, filename)) {
            log_error("Error in epoch\n");
            averaged_perceptron_trainer_destroy(trainer);
            return false;
        }
    }

    log_debug("Done with training, averaging weights\n");

    // finalize averages the accumulated weights into a model (and, per the
    // original code, takes ownership of the trainer — do not destroy it here)
    self->model = averaged_perceptron_trainer_finalize(trainer);
    if (self->model == NULL) {
        log_error("Error finalizing model\n");
        return false;
    }

    return true;
}
/*
Training entry point.
Usage: ./address_parser_train filename output_dir

Loads the address dictionaries and geodb, counts the vocabulary and
builds a parser from `filename`, trains for DEFAULT_ITERATIONS epochs,
and saves the model + vocab into `output_dir`.

Fixes vs. original: the parser is destroyed on the save-failure path,
and the process exits with an explicit status.
*/
int main(int argc, char **argv) {
    if (argc < 3) {
        printf("Usage: ./address_parser_train filename output_dir\n");
        exit(EXIT_FAILURE);
    }

#if !defined(HAVE_SHUF) && !defined(HAVE_GSHUF)
    // shuffle_file depends on an external shuf binary discovered at configure time
    log_error("shuf or gshuf must be installed to train address parser. Please install and reconfigure libpostal\n");
    exit(EXIT_FAILURE);
#endif

    char *filename = argv[1];
    char *output_dir = argv[2];

    if (!address_dictionary_module_setup(NULL)) {
        log_error("Could not load address dictionaries\n");
        exit(EXIT_FAILURE);
    }
    log_info("address dictionary module loaded\n");

    if (!geodb_module_setup(NULL)) {
        log_error("Could not load geodb dictionaries\n");
        exit(EXIT_FAILURE);
    }
    log_info("geodb module loaded\n");

    // First pass over the data: builds the parser and its vocabulary
    address_parser_t *parser = address_parser_init(filename);
    if (parser == NULL) {
        log_error("Could not initialize parser\n");
        exit(EXIT_FAILURE);
    }
    log_info("Finished initialization\n");

    if (!address_parser_train(parser, filename, DEFAULT_ITERATIONS)) {
        log_error("Error in training\n");
        exit(EXIT_FAILURE);
    }
    log_debug("Finished training\n");

    if (!address_parser_save(parser, output_dir)) {
        log_error("Error saving address parser\n");
        address_parser_destroy(parser);
        exit(EXIT_FAILURE);
    }

    address_parser_destroy(parser);
    address_dictionary_module_teardown();
    geodb_module_teardown();

    log_debug("Done\n");

    return EXIT_SUCCESS;
}