1856 lines
70 KiB
C
1856 lines
70 KiB
C
#include "address_parser.h"
|
|
#include "address_dictionary.h"
|
|
#include "features.h"
|
|
#include "ngrams.h"
|
|
#include "scanner.h"
|
|
|
|
#include "graph_builder.h"
|
|
|
|
#include "klib/ksort.h"
|
|
#include "log/log.h"
|
|
|
|
/* File names of the serialized parser artifacts; each is joined onto the
 * model directory by address_parser_save() / address_parser_load(). */
#define ADDRESS_PARSER_MODEL_FILENAME        "address_parser.dat"
#define ADDRESS_PARSER_MODEL_FILENAME_CRF    "address_parser_crf.dat"
#define ADDRESS_PARSER_VOCAB_FILENAME        "address_parser_vocab.trie"
#define ADDRESS_PARSER_PHRASE_FILENAME       "address_parser_phrases.dat"
#define ADDRESS_PARSER_POSTAL_CODES_FILENAME "address_parser_postal_codes.dat"

/* Token strings; presumably substituted for out-of-vocabulary words and
 * numbers -- their use is outside this part of the file, TODO confirm. */
#define UNKNOWN_WORD    "UNKNOWN"
#define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC"

/* Default value of parser_options_t.rare_word_threshold. */
#define DEFAULT_RARE_WORD_THRESHOLD 50
|
|
|
|
/* Module-wide parser instance: assigned by address_parser_load() and handed
 * out by get_address_parser(). Starts out NULL. */
static address_parser_t *parser = NULL;
|
|
|
|
/* Kind of match reported by word_or_phrase_at_index() for a token. */
typedef enum {
    ADDRESS_PARSER_NULL_PHRASE       = 0, /* plain word, no phrase matched */
    ADDRESS_PARSER_DICTIONARY_PHRASE = 1, /* address dictionary phrase */
    ADDRESS_PARSER_COMPONENT_PHRASE  = 2, /* phrase from the parser's phrase trie */
    ADDRESS_PARSER_PREFIX_PHRASE     = 3, /* word with a known dictionary prefix */
    ADDRESS_PARSER_SUFFIX_PHRASE     = 4  /* word with a known dictionary suffix */
} address_parser_phrase_type_t;
|
|
|
|
static parser_options_t PARSER_DEFAULT_OPTIONS = {
|
|
.rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
|
|
.print_features = false
|
|
};
|
|
|
|
/**
 * Allocate a zero-initialized address parser and store `options` in it.
 *
 * @param options  Options copied by value into the new parser.
 * @return         The new parser, or NULL if the allocation fails.
 *                 The caller releases it with address_parser_destroy().
 */
address_parser_t *address_parser_new_options(parser_options_t options) {
    address_parser_t *new_parser = calloc(1, sizeof(*new_parser));
    if (new_parser == NULL) {
        /* Previously the NULL result was dereferenced on the next line. */
        return NULL;
    }

    new_parser->options = options;
    return new_parser;
}
|
|
|
|
address_parser_t *address_parser_new(void) {
|
|
return address_parser_new_options(PARSER_DEFAULT_OPTIONS);
|
|
}
|
|
|
|
address_parser_t *get_address_parser(void) {
|
|
return parser;
|
|
}
|
|
|
|
bool address_parser_print_features(bool print_features) {
|
|
if (parser == NULL) return false;
|
|
|
|
parser->options.print_features = print_features;
|
|
return true;
|
|
}
|
|
|
|
/**
 * Write a parser to `output_dir` as four files: the model (averaged
 * perceptron or CRF, chosen by self->model_type), the vocabulary trie, the
 * phrases file (phrase trie + phrase type values) and the postal codes file
 * (postal code trie + postal code context graph).
 *
 * All failure paths go through one cleanup label so the path buffer and any
 * open file handle are always released.
 *
 * @param self        Parser to serialize.
 * @param output_dir  Directory the files are written into.
 * @return            true when every file was written and closed
 *                    successfully; false on NULL arguments, an unknown model
 *                    type, a missing component, or any write failure.
 */
bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    char *model_filename = NULL;
    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME;
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME_CRF;
    } else {
        return false;
    }

    /* Everything the cleanup label touches is declared before the first goto. */
    bool saved = false;
    FILE *phrases_file = NULL;
    FILE *postal_codes_file = NULL;
    char *file_path = NULL;
    size_t num_phrase_types = 0;

    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;

    /* Model */
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, model_filename);
    file_path = char_array_get_string(path);

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        if (!averaged_perceptron_save(self->model.ap, file_path)) {
            log_info("Error in averaged_perceptron_save\n");
            goto exit_address_parser_save;
        }
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        if (!crf_save(self->model.crf, file_path)) {
            log_info("Error in crf_save\n");
            goto exit_address_parser_save;
        }
    }

    /* Vocabulary */
    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    file_path = char_array_get_string(path);

    if (!trie_save(self->vocab, file_path)) {
        goto exit_address_parser_save;
    }

    /* Phrases + phrase types */
    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_PHRASE_FILENAME);
    file_path = char_array_get_string(path);

    /* Binary mode: the loader reads this file with "rb". */
    phrases_file = fopen(file_path, "wb+");
    if (phrases_file == NULL || self->phrases == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->phrases, phrases_file)) {
        goto exit_address_parser_save;
    }

    if (self->phrase_types == NULL) {
        goto exit_address_parser_save;
    }

    num_phrase_types = self->phrase_types->n;
    if (!file_write_uint64(phrases_file, (uint64_t)num_phrase_types)) {
        goto exit_address_parser_save;
    }

    for (size_t i = 0; i < num_phrase_types; i++) {
        address_parser_types_t phrase_type_value = self->phrase_types->a[i];
        if (!file_write_uint32(phrases_file, phrase_type_value.value)) {
            goto exit_address_parser_save;
        }
    }

    /* fclose flushes buffered data, so a failure here means a bad file. */
    int phrases_close_status = fclose(phrases_file);
    phrases_file = NULL;
    if (phrases_close_status != 0) {
        goto exit_address_parser_save;
    }

    /* Postal codes + postal code contexts */
    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
    file_path = char_array_get_string(path);

    postal_codes_file = fopen(file_path, "wb+");
    if (postal_codes_file == NULL || self->postal_codes == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->postal_codes, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    if (self->postal_code_contexts == NULL) {
        goto exit_address_parser_save;
    }

    if (!graph_write(self->postal_code_contexts, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    int postal_codes_close_status = fclose(postal_codes_file);
    postal_codes_file = NULL;
    if (postal_codes_close_status != 0) {
        goto exit_address_parser_save;
    }

    saved = true;

exit_address_parser_save:
    if (phrases_file != NULL) {
        fclose(phrases_file);
    }
    if (postal_codes_file != NULL) {
        fclose(postal_codes_file);
    }
    char_array_destroy(path);
    return saved;
}
|
|
|
|
/**
 * Report whether the parser's postal code context graph has an edge from
 * `postal_code_id` to `admin_id`.
 *
 * @return The result of graph_has_edge() on self->postal_code_contexts.
 */
static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) {
    return graph_has_edge(self->postal_code_contexts, postal_code_id, admin_id);
}
|
|
|
|
bool address_parser_load(char *dir) {
|
|
if (parser != NULL) return false;
|
|
if (dir == NULL) {
|
|
dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
|
|
}
|
|
|
|
char_array *path = char_array_new_size(strlen(dir));
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
|
|
char *model_path = char_array_get_string(path);
|
|
|
|
if (file_exists(model_path)) {
|
|
averaged_perceptron_t *ap_model = averaged_perceptron_load(model_path);
|
|
if (ap_model != NULL) {
|
|
parser = address_parser_new();
|
|
parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
|
|
parser->model.ap = ap_model;
|
|
} else {
|
|
char_array_destroy(path);
|
|
log_error("Averaged perceptron model could not be loaded\n");
|
|
return false;
|
|
}
|
|
} else {
|
|
model_path = NULL;
|
|
}
|
|
|
|
if (model_path == NULL) {
|
|
char_array_clear(path);
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME_CRF);
|
|
model_path = char_array_get_string(path);
|
|
|
|
if (file_exists(model_path)) {
|
|
crf_t *crf_model = crf_load(model_path);
|
|
if (crf_model != NULL) {
|
|
parser = address_parser_new();
|
|
parser->model_type = ADDRESS_PARSER_TYPE_CRF;
|
|
parser->model.crf = crf_model;
|
|
} else {
|
|
char_array_destroy(path);
|
|
log_error("Averaged perceptron model could not be loaded\n");
|
|
return false;
|
|
}
|
|
} else {
|
|
model_path = NULL;
|
|
}
|
|
}
|
|
|
|
if (parser == NULL) {
|
|
char_array_destroy(path);
|
|
log_error("Could not find parser model file of known type\n");
|
|
return false;
|
|
}
|
|
|
|
char_array_clear(path);
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);
|
|
|
|
char *vocab_path = char_array_get_string(path);
|
|
|
|
trie_t *vocab = trie_load(vocab_path);
|
|
|
|
if (vocab == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
parser->vocab = vocab;
|
|
|
|
char_array_clear(path);
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_PHRASE_FILENAME);
|
|
char *phrases_path = char_array_get_string(path);
|
|
|
|
FILE *phrases_file = fopen(phrases_path, "rb");
|
|
if (phrases_file == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
parser->phrases = trie_read(phrases_file);
|
|
if (parser->phrases == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
uint64_t num_phrase_types;
|
|
|
|
if (!file_read_uint64(phrases_file, &num_phrase_types)) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
parser->phrase_types = address_parser_types_array_new_size(num_phrase_types);
|
|
|
|
uint32_array *phrase_type_values = uint32_array_new_size(num_phrase_types);
|
|
if (!file_read_uint32_array(phrases_file, phrase_type_values->a, num_phrase_types)) {
|
|
uint32_array_destroy(phrase_type_values);
|
|
goto exit_address_parser_created;
|
|
}
|
|
phrase_type_values->n = num_phrase_types;
|
|
|
|
for (size_t i = 0; i < phrase_type_values->n; i++) {
|
|
uint32_t phrase_type_value = phrase_type_values->a[i];
|
|
address_parser_types_t phrase_type = {.value = phrase_type_value};
|
|
address_parser_types_array_push(parser->phrase_types, phrase_type);
|
|
}
|
|
|
|
uint32_array_destroy(phrase_type_values);
|
|
|
|
fclose(phrases_file);
|
|
|
|
char_array_clear(path);
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
|
|
|
|
char *postal_codes_path = char_array_get_string(path);
|
|
|
|
FILE *postal_codes_file = fopen(postal_codes_path, "rb");
|
|
if (postal_codes_file == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
parser->postal_codes = trie_read(postal_codes_file);
|
|
if (parser->postal_codes == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
parser->postal_code_contexts = graph_read(postal_codes_file);
|
|
|
|
if (parser->postal_code_contexts == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
fclose(postal_codes_file);
|
|
|
|
parser->context = address_parser_context_new();
|
|
if (parser->context == NULL) {
|
|
goto exit_address_parser_created;
|
|
}
|
|
|
|
char_array_destroy(path);
|
|
return true;
|
|
|
|
exit_address_parser_created:
|
|
address_parser_destroy(parser);
|
|
char_array_destroy(path);
|
|
return false;
|
|
}
|
|
|
|
/**
 * Release a parser and every component it holds: the model that matches
 * model_type, the context, the vocab / phrase / postal code tries, the
 * phrase type array and the postal code context graph. Unset (NULL)
 * components are skipped. NULL `self` is a no-op.
 */
void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;

    /* Only the model selected by model_type is released. */
    switch (self->model_type) {
    case ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON:
        if (self->model.ap != NULL) {
            averaged_perceptron_destroy(self->model.ap);
        }
        break;
    case ADDRESS_PARSER_TYPE_CRF:
        if (self->model.crf != NULL) {
            crf_destroy(self->model.crf);
        }
        break;
    default:
        break;
    }

    if (self->context != NULL) address_parser_context_destroy(self->context);
    if (self->vocab != NULL) trie_destroy(self->vocab);
    if (self->phrases != NULL) trie_destroy(self->phrases);
    if (self->phrase_types != NULL) address_parser_types_array_destroy(self->phrase_types);
    if (self->postal_codes != NULL) trie_destroy(self->postal_codes);
    if (self->postal_code_contexts != NULL) graph_destroy(self->postal_code_contexts);

    free(self);
}
|
|
|
|
/**
 * Look up `word` in the parser's vocabulary trie.
 *
 * @return The value trie_get_data() stored for the word, or 0 if the lookup
 *         left the output untouched (presumably the "not found" case --
 *         depends on trie_get_data(), TODO confirm).
 */
static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t frequency = 0;
    /* The found/not-found flag is not needed: a miss keeps the 0 default. */
    (void)trie_get_data(parser->vocab, word, &frequency);
    return frequency;
}
|
|
|
|
/**
 * Normalize one token of `str` with ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS
 * and hand the result to normalize_token() together with `array`.
 */
inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
    normalize_token(
        array,
        str,
        token,
        ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS
    );
}
|
|
|
|
/**
 * Same as address_parser_normalize_token() but with
 * ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS; used to build the
 * `normalized_admin` strings in address_parser_context_fill().
 */
static inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) {
    normalize_token(
        array,
        str,
        token,
        ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS
    );
}
|
|
|
|
inline char *address_parser_normalize_string(char *str) {
|
|
return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
|
|
}
|
|
|
|
|
|
/**
 * Release a parsing context and every buffer it holds. Members that are
 * NULL are skipped; the others are destroyed in declaration order with the
 * destructor matching their type. `language` and `country` are not freed
 * here. NULL `self` is a no-op.
 */
void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;

    /* Destroy one member with its destructor unless the member is unset. */
    #define CONTEXT_DESTROY_FIELD(field, destroy_fn) \
        do { \
            if (self->field != NULL) { \
                destroy_fn(self->field); \
            } \
        } while (0)

    /* Scratch strings for phrases */
    CONTEXT_DESTROY_FIELD(phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(context_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(long_context_phrase, char_array_destroy);

    CONTEXT_DESTROY_FIELD(component_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(context_component_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(long_context_component_phrase, char_array_destroy);

    CONTEXT_DESTROY_FIELD(prefix_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(context_prefix_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(long_context_prefix_phrase, char_array_destroy);

    CONTEXT_DESTROY_FIELD(suffix_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(context_suffix_phrase, char_array_destroy);
    CONTEXT_DESTROY_FIELD(long_context_suffix_phrase, char_array_destroy);

    /* N-gram and sub-token scratch space */
    CONTEXT_DESTROY_FIELD(ngrams, cstring_array_destroy);
    CONTEXT_DESTROY_FIELD(sub_token, char_array_destroy);
    CONTEXT_DESTROY_FIELD(sub_tokens, token_array_destroy);
    CONTEXT_DESTROY_FIELD(separators, uint32_array_destroy);

    /* Normalized strings and tokens */
    CONTEXT_DESTROY_FIELD(normalized, cstring_array_destroy);
    CONTEXT_DESTROY_FIELD(normalized_tokens, token_array_destroy);
    CONTEXT_DESTROY_FIELD(normalized_admin, cstring_array_destroy);
    CONTEXT_DESTROY_FIELD(normalized_admin_tokens, token_array_destroy);

    /* Feature buffers */
    CONTEXT_DESTROY_FIELD(features, cstring_array_destroy);
    CONTEXT_DESTROY_FIELD(prev_tag_features, cstring_array_destroy);
    CONTEXT_DESTROY_FIELD(prev2_tag_features, cstring_array_destroy);

    CONTEXT_DESTROY_FIELD(tokenized_str, tokenized_string_destroy);

    /* Phrase arrays and per-token memberships */
    CONTEXT_DESTROY_FIELD(address_dictionary_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_FIELD(address_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_FIELD(component_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_FIELD(component_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_FIELD(postal_code_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_FIELD(postal_code_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_FIELD(prefix_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_FIELD(suffix_phrases, phrase_array_destroy);

    #undef CONTEXT_DESTROY_FIELD

    free(self);
}
|
|
|
|
address_parser_context_t *address_parser_context_new(void) {
|
|
address_parser_context_t *context = malloc(sizeof(address_parser_context_t));
|
|
|
|
if (context == NULL) return NULL;
|
|
|
|
context->language = NULL;
|
|
context->country = NULL;
|
|
|
|
context->phrase = char_array_new();
|
|
if (context->phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->context_phrase = char_array_new();
|
|
if (context->context_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->long_context_phrase = char_array_new();
|
|
if (context->long_context_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->component_phrase = char_array_new();
|
|
if (context->component_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->context_component_phrase = char_array_new();
|
|
if (context->context_component_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->long_context_component_phrase = char_array_new();
|
|
if (context->long_context_component_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->prefix_phrase = char_array_new();
|
|
if (context->prefix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->context_prefix_phrase = char_array_new();
|
|
if (context->context_prefix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->long_context_prefix_phrase = char_array_new();
|
|
if (context->long_context_prefix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->suffix_phrase = char_array_new();
|
|
if (context->suffix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->context_suffix_phrase = char_array_new();
|
|
if (context->context_suffix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->long_context_suffix_phrase = char_array_new();
|
|
if (context->long_context_suffix_phrase == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->ngrams = cstring_array_new();
|
|
if (context->ngrams == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->sub_token = char_array_new();
|
|
if (context->sub_token == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->sub_tokens = token_array_new();
|
|
if (context->sub_tokens == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->separators = uint32_array_new();
|
|
if (context->separators == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->normalized = cstring_array_new();
|
|
if (context->normalized == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->normalized_tokens = token_array_new();
|
|
if (context->normalized_tokens == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->normalized_admin = cstring_array_new();
|
|
if (context->normalized_admin == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->normalized_admin_tokens = token_array_new();
|
|
if (context->normalized_admin_tokens == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->features = cstring_array_new();
|
|
if (context->features == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->prev_tag_features = cstring_array_new();
|
|
if (context->prev_tag_features == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->prev2_tag_features = cstring_array_new();
|
|
if (context->prev2_tag_features == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->tokenized_str = tokenized_string_new();
|
|
if (context->tokenized_str == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->address_dictionary_phrases = phrase_array_new();
|
|
if (context->address_dictionary_phrases == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->address_phrase_memberships = int64_array_new();
|
|
if (context->address_phrase_memberships == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->component_phrases = phrase_array_new();
|
|
if (context->component_phrases == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->component_phrase_memberships = int64_array_new();
|
|
if (context->component_phrase_memberships == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->postal_code_phrases = phrase_array_new();
|
|
if (context->postal_code_phrases == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->postal_code_phrase_memberships = int64_array_new();
|
|
if (context->postal_code_phrase_memberships == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->prefix_phrases = phrase_array_new();
|
|
if (context->prefix_phrases == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
context->suffix_phrases = phrase_array_new();
|
|
if (context->suffix_phrases == NULL) {
|
|
goto exit_address_parser_context_allocated;
|
|
}
|
|
|
|
return context;
|
|
|
|
exit_address_parser_context_allocated:
|
|
address_parser_context_destroy(context);
|
|
return NULL;
|
|
}
|
|
|
|
/**
 * A component phrase is valid when at least one of its tokens is not made
 * up purely of digits (as judged by string_is_digit()).
 *
 * @param strings  Token strings indexed by token position.
 * @param phrase   Token span [phrase.start, phrase.start + phrase.len).
 * @return         true on the first non-digit token; false if every token
 *                 is digits or the span is empty.
 */
bool is_valid_component_phrase(cstring_array *strings, phrase_t phrase) {
    for (uint32_t pos = phrase.start; pos < phrase.start + phrase.len; pos++) {
        char *token_str = cstring_array_get_string(strings, pos);
        size_t token_str_len = strlen(token_str);

        if (!string_is_digit(token_str, token_str_len)) {
            return true;
        }
    }

    return false;
}
|
|
|
|
/**
 * Populate `context` for one tokenized input.
 *
 * Steps, in order:
 *   1. store language / country;
 *   2. rebuild the normalized and admin-normalized token strings plus the
 *      token arrays describing their offsets;
 *   3. find address dictionary phrases and per-token memberships;
 *   4. record a prefix and a suffix dictionary match for every raw token;
 *   5. find component phrases in parser->phrases, dropping memberships of
 *      phrases that consist only of digit tokens;
 *   6. find postal code phrases in parser->postal_codes.
 *
 * @param context        Context whose buffers are cleared and refilled.
 * @param parser         Parser supplying the phrase and postal code tries.
 * @param tokenized_str  Tokenized input (string, tokens, token strings).
 * @param language       Stored as-is in context->language.
 * @param country        Stored as-is in context->country.
 */
void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) {
    uint32_t token_index;
    char *word;

    context->language = language;
    context->country = country;

    cstring_array *norm = context->normalized;
    token_array *norm_tokens = context->normalized_tokens;
    cstring_array_clear(norm);
    token_array_clear(norm_tokens);

    cstring_array *admin_norm = context->normalized_admin;
    token_array *admin_norm_tokens = context->normalized_admin_tokens;
    cstring_array_clear(admin_norm);
    token_array_clear(admin_norm_tokens);

    char *raw_str = tokenized_str->str;
    token_array *tokens = tokenized_str->tokens;

    /* NOTE: the loop body is a macro argument, so it must not contain any
       comma outside of parentheses. */
    cstring_array_foreach(tokenized_str->strings, token_index, word, {
        token_t source_token = tokens->a[token_index];

        /* Regular normalization: the new token spans whatever was appended.
           The "- 1" presumably excludes the terminating NUL -- TODO confirm. */
        size_t norm_start = norm->str->n;
        address_parser_normalize_token(norm, raw_str, source_token);
        size_t norm_end = norm->str->n;

        token_t norm_token;
        norm_token.offset = norm_start;
        norm_token.len = (norm_end > norm_start) ? (norm_end - 1 - norm_start) : 0;
        norm_token.type = source_token.type;
        token_array_push(norm_tokens, norm_token);

        /* Admin normalization: same bookkeeping on the admin buffers. */
        size_t admin_start = admin_norm->str->n;
        address_parser_normalize_phrase_token(admin_norm, raw_str, source_token);
        size_t admin_end = admin_norm->str->n;

        token_t admin_token;
        admin_token.offset = admin_start;
        admin_token.len = (admin_end > admin_start) ? (admin_end - 1 - admin_start) : 0;
        admin_token.type = source_token.type;
        token_array_push(admin_norm_tokens, admin_token);
    })

    /* Fetched only after the loop, once both buffers are completely built. */
    char *norm_str = norm->str->a;
    char *admin_norm_str = admin_norm->str->a;

    /*
    Address dictionary phrases
    --------------------------
    Recognizing phrases that occur in libpostal's dictionaries.

    Note: if the dictionaries are updated to try to improve the parser,
    we'll need to retrain. This can be done without rebuilding the
    training data (a long-running process which can take up to a week),
    but will require running address_parser_train, the main training script.
    */

    phrase_array *dictionary_phrases = context->address_dictionary_phrases;
    int64_array *dictionary_memberships = context->address_phrase_memberships;
    phrase_array_clear(dictionary_phrases);
    int64_array_clear(dictionary_memberships);

    size_t num_tokens = tokens->n;

    (void)search_address_dictionaries_tokens_with_phrases(norm_str, norm_tokens, NULL, &dictionary_phrases);
    token_phrase_memberships(dictionary_phrases, dictionary_memberships, num_tokens);

    /* One prefix and one suffix entry per raw (pre-normalization) token. */
    phrase_array_clear(context->prefix_phrases);
    phrase_array_clear(context->suffix_phrases);

    for (size_t i = 0; i < num_tokens; i++) {
        token_t raw_token = tokens->a[i];
        char *raw_word = tokenized_string_get_token(tokenized_str, i);

        phrase_t prefix_match = search_address_dictionaries_prefix(raw_word, raw_token.len, NULL);
        phrase_array_push(context->prefix_phrases, prefix_match);

        phrase_t suffix_match = search_address_dictionaries_suffix(raw_word, raw_token.len, NULL);
        phrase_array_push(context->suffix_phrases, suffix_match);
    }

    /*
    Component phrases
    -----------------
    Precomputed phrases for cities, states, countries, etc. from the training data

    Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city
    instead of a city_district), this may cause the parser to get confused. It will
    penalize itself for getting the wrong answer when really the underlying data
    is simply ambiguous. In the OSM training data a lot of work has been done to
    ensure that there's little or no systematic mislabeling. As such, other data
    sets shouldn't be added willy-nilly unless the labels are consistent.
    */

    phrase_array *component_phrases = context->component_phrases;
    int64_array *component_memberships = context->component_phrase_memberships;
    phrase_array_clear(component_phrases);
    int64_array_clear(component_memberships);

    (void)trie_search_tokens_with_phrases(parser->phrases, admin_norm_str, admin_norm_tokens, &component_phrases);
    token_phrase_memberships(component_phrases, component_memberships, num_tokens);

    /* Un-mark the tokens of phrases that are made of digits only. */
    for (size_t i = 0; i < component_phrases->n; i++) {
        phrase_t candidate = component_phrases->a[i];
        if (is_valid_component_phrase(context->normalized_admin, candidate)) {
            continue;
        }

        for (size_t j = candidate.start; j < candidate.start + candidate.len; j++) {
            component_memberships->a[j] = NULL_PHRASE_MEMBERSHIP;
        }
    }

    /* Postal code phrases */
    phrase_array *postal_code_phrases = context->postal_code_phrases;
    int64_array *postal_code_memberships = context->postal_code_phrase_memberships;
    phrase_array_clear(postal_code_phrases);
    int64_array_clear(postal_code_memberships);

    (void)trie_search_tokens_with_phrases(parser->postal_codes, admin_norm_str, admin_norm_tokens, &postal_code_phrases);
    token_phrase_memberships(postal_code_phrases, postal_code_memberships, num_tokens);
}
|
|
|
|
/**
 * Return the phrase that token `i` belongs to.
 *
 * @param phrases             Phrase array the memberships index into.
 * @param phrase_memberships  Per-token phrase index, or
 *                            NULL_PHRASE_MEMBERSHIP for "no phrase".
 * @param i                   Token index.
 * @return                    The phrase for token `i`, or NULL_PHRASE when
 *                            either array is NULL, `i` is out of range, the
 *                            token has no phrase, or the stored phrase index
 *                            does not fit the phrase array.
 */
static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) {
    /* `i >= n` instead of `i > n - 1`: the latter wraps around when n == 0
       and lets every index through. */
    if (phrases == NULL || phrase_memberships == NULL || i >= phrase_memberships->n) {
        return NULL_PHRASE;
    }

    int64_t phrase_index = phrase_memberships->a[i];
    if (phrase_index == NULL_PHRASE_MEMBERSHIP) {
        return NULL_PHRASE;
    }

    /* Defensive: never index outside the phrase array. */
    if (phrase_index < 0 || (size_t)phrase_index >= phrases->n) {
        return NULL_PHRASE;
    }

    return phrases->a[phrase_index];
}
|
|
|
|
/**
 * Copy the first `prefix_phrase.len` bytes of `word` into
 * `prefix_phrase_array` (which is cleared first) and return its string.
 *
 * @param word                 Source word.
 * @param len                  Not used by this function.
 * @param prefix_phrase        Phrase whose `len` gives the prefix length.
 * @param prefix_phrase_array  Output buffer; the returned pointer is the
 *                             one char_array_get_string() yields for it.
 */
char *phrase_prefix(char *word, size_t len, phrase_t prefix_phrase, char_array *prefix_phrase_array) {
    size_t num_prefix_bytes = prefix_phrase.len;

    char_array_clear(prefix_phrase_array);
    char_array_add_len(prefix_phrase_array, word, num_prefix_bytes);

    return char_array_get_string(prefix_phrase_array);
}
|
|
|
|
/**
 * Copy the last `suffix_phrase.len` bytes of `word` into
 * `suffix_phrase_array` (which is cleared first) and return its string.
 *
 * @param word                 Source word.
 * @param len                  Length of `word` in bytes.
 * @param suffix_phrase        Phrase whose `len` gives the suffix length.
 * @param suffix_phrase_array  Output buffer; the returned pointer is the
 *                             one char_array_get_string() yields for it.
 */
char *phrase_suffix(char *word, size_t len, phrase_t suffix_phrase, char_array *suffix_phrase_array) {
    char_array_clear(suffix_phrase_array);

    size_t suffix_len = suffix_phrase.len;
    /* A suffix longer than the word would make `len - suffix_len` wrap
       around and read far outside `word`; cap it at the whole word. */
    if (suffix_len > len) {
        suffix_len = len;
    }

    char_array_add_len(suffix_phrase_array, word + (len - suffix_len), suffix_len);

    return char_array_get_string(suffix_phrase_array);
}
|
|
|
|
/**
 * Decide whether a dictionary phrase match should be used.
 *
 * The phrase's expansion entry (looked up via phrase.data) must list at
 * least one of the street / house number / name / category / near / unit /
 * level / entrance / staircase / postal code / PO box components, and at
 * least one of its expansions must not be in DICTIONARY_TOPONYM.
 *
 * @return true when both conditions hold; false otherwise, including when
 *         no expansion entry exists for the index (a warning is logged).
 */
bool is_valid_dictionary_phrase(phrase_t phrase) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);

    if (value == NULL) {
        log_warn("expansion_value is NULL for index %u\n", expansion_index);
        return false;
    }

    uint32_t components = value->components;

    bool has_accepted_component = (components & (LIBPOSTAL_ADDRESS_STREET
                                                | LIBPOSTAL_ADDRESS_HOUSE_NUMBER
                                                | LIBPOSTAL_ADDRESS_NAME
                                                | LIBPOSTAL_ADDRESS_CATEGORY
                                                | LIBPOSTAL_ADDRESS_NEAR
                                                | LIBPOSTAL_ADDRESS_UNIT
                                                | LIBPOSTAL_ADDRESS_LEVEL
                                                | LIBPOSTAL_ADDRESS_ENTRANCE
                                                | LIBPOSTAL_ADDRESS_STAIRCASE
                                                | LIBPOSTAL_ADDRESS_POSTAL_CODE
                                                | LIBPOSTAL_ADDRESS_PO_BOX)) != 0;
    if (!has_accepted_component) {
        return false;
    }

    /* Any expansion outside the toponym dictionary makes the phrase valid. */
    for (size_t k = 0; k < value->expansions->n; k++) {
        address_expansion_t candidate = value->expansions->a[k];
        if (!address_expansion_in_dictionary(candidate, DICTIONARY_TOPONYM)) {
            return true;
        }
    }

    return false;
}
|
|
|
|
typedef struct address_parser_phrase {
|
|
char *str;
|
|
address_parser_phrase_type_t type;
|
|
phrase_t phrase;
|
|
} address_parser_phrase_t;
|
|
|
|
/**
 * True for the phrase types whose string is a single word: no phrase,
 * suffix phrase, or prefix phrase. False for every other type.
 */
static inline bool is_plain_word_phrase_type(address_parser_phrase_type_t type) {
    switch (type) {
    case ADDRESS_PARSER_NULL_PHRASE:
    case ADDRESS_PARSER_SUFFIX_PHRASE:
    case ADDRESS_PARSER_PREFIX_PHRASE:
        return true;
    default:
        return false;
    }
}
|
|
|
|
/**
 * Describe token `i` as either a phrase or a plain word.
 *
 * Precedence:
 *   1. an address dictionary phrase, if it is valid per
 *      is_valid_dictionary_phrase() and at least as long as the component
 *      phrase covering the same token;
 *   2. a component phrase;
 *   3. the normalized word tagged as a suffix phrase, if the suffix's
 *      expansion entry includes the street component;
 *   4. the normalized word tagged as a prefix phrase, if the prefix's
 *      expansion components differ from LIBPOSTAL_ADDRESS_ANY;
 *   5. the normalized word with ADDRESS_PARSER_NULL_PHRASE.
 *
 * A missing expansion entry for a suffix/prefix is treated as "no match"
 * instead of being dereferenced.
 *
 * @param parser        Unused here; kept for the existing call signature.
 * @param tokenized     Unused here; kept for the existing call signature.
 * @param context       Filled context (see address_parser_context_fill()).
 * @param i             Token index.
 * @param long_context  Selects the long_context_* scratch buffers instead
 *                      of the context_* ones for the phrase string.
 * @return              String, match type and phrase for the token.
 */
static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) {
    (void)parser;
    (void)tokenized;

    phrase_t dictionary_phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i);
    phrase_t component_phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i);

    if (dictionary_phrase.len > 0 && is_valid_dictionary_phrase(dictionary_phrase) && component_phrase.len <= dictionary_phrase.len) {
        char *phrase_string = cstring_array_get_phrase(context->normalized, long_context ? context->long_context_phrase : context->context_phrase, dictionary_phrase);

        return (address_parser_phrase_t){
            .str = phrase_string,
            .type = ADDRESS_PARSER_DICTIONARY_PHRASE,
            .phrase = dictionary_phrase
        };
    }

    if (component_phrase.len > 0) {
        char *phrase_string = cstring_array_get_phrase(context->normalized_admin, long_context ? context->long_context_component_phrase : context->context_component_phrase, component_phrase);

        return (address_parser_phrase_t){
            .str = phrase_string,
            .type = ADDRESS_PARSER_COMPONENT_PHRASE,
            .phrase = component_phrase
        };
    }

    phrase_t prefix_phrase = context->prefix_phrases->a[i];
    phrase_t suffix_phrase = context->suffix_phrases->a[i];

    char *word = cstring_array_get_string(context->normalized, i);

    // Suffixes like straße, etc.
    if (suffix_phrase.len > 0) {
        uint32_t expansion_index = suffix_phrase.data;
        address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

        if (expansion_value != NULL && (expansion_value->components & LIBPOSTAL_ADDRESS_STREET)) {
            return (address_parser_phrase_t){
                .str = word,
                .type = ADDRESS_PARSER_SUFFIX_PHRASE,
                .phrase = suffix_phrase
            };
        }
    }

    // Prefixes like hinter, etc.
    if (prefix_phrase.len > 0) {
        uint32_t expansion_index = prefix_phrase.data;
        address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

        // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
        // (the original `components ^ ANY` test is exactly "components != ANY")
        if (expansion_value != NULL && expansion_value->components != LIBPOSTAL_ADDRESS_ANY) {
            return (address_parser_phrase_t){
                .str = word,
                .type = ADDRESS_PARSER_PREFIX_PHRASE,
                .phrase = prefix_phrase
            };
        }
    }

    return (address_parser_phrase_t){
        .str = word,
        .type = ADDRESS_PARSER_NULL_PHRASE,
        .phrase = NULL_PHRASE
    };
}
|
|
|
|
/**
 * Scan the membership array from `start` for the nearest token that
 * belongs to a phrase.
 *
 * @param phrase_memberships  Per-token phrase index array.
 * @param start               Token index to start from (inclusive).
 * @param direction           -1 scans toward index 0, 1 scans toward the
 *                            end; any other value finds nothing.
 * @return                    Index of the first token whose membership is
 *                            not NULL_PHRASE_MEMBERSHIP, or -1 if none is
 *                            found or the array is NULL.
 */
static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start, int8_t direction) {
    if (phrase_memberships == NULL) {
        return -1;
    }

    const int64_t *slots = phrase_memberships->a;

    switch (direction) {
    case -1: {
        /* Signed cursor so the scan can step below zero and stop. */
        ssize_t pos = start;
        while (pos >= 0) {
            if (slots[pos] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)pos;
            }
            pos--;
        }
        break;
    }
    case 1: {
        size_t count = phrase_memberships->n;
        size_t pos = start;
        while (pos < count) {
            if (slots[pos] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)pos;
            }
            pos++;
        }
        break;
    }
    default:
        break;
    }

    return -1;
}
|
|
|
|
|
|
/*
next_numeric_token_index
------------------------

Scans the tokens from start (inclusive) to the end of the string and returns
the index of the first token that
    - is not a member of an address dictionary phrase,
    - is not a member of a component phrase, and
    - has a type other than NUMERIC and IDEOGRAPHIC_NUMBER.

Returns -1 if tokenized, context or the token array is NULL, if start is
past the last token, or if no token satisfies the conditions above.

NOTE(review): despite the function's name, the type test selects tokens that
are NOT numeric. That behavior is preserved here because no caller is visible
from this part of the file; confirm which of the two is intended.
*/
static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) {
    if (tokenized == NULL || context == NULL) return -1;

    token_array *tokens = tokenized->tokens;

    // start >= n also covers an empty array, where n - 1 would wrap around
    if (tokens == NULL || start >= tokens->n) return -1;

    for (size_t i = start; i < tokens->n; i++) {
        bool in_address_phrase = context->address_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP;
        bool in_component_phrase = context->component_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP;

        if (in_address_phrase || in_component_phrase) {
            continue;
        }

        token_t token = tokens->a[i];
        if (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) {
            return (int64_t)i;
        }
    }

    return -1;
}
|
|
|
|
|
|
/*
add_phrase_features
-------------------

Adds phrase-type features for one candidate component type.

Parameters:

cstring_array *features: the feature array to append to
uint32_t phrase_types: bit set of the component types the phrase can have
uint32_t component: the single component type being tested
char *phrase_type: the name of that component type, used in the feature string
char *phrase_string: the text of the phrase

If phrase_types is exactly equal to component, two "unambiguous phrase type"
features are added. Otherwise, if phrase_types merely shares a bit with
component, a single "phrase type+phrase" feature is added. If neither holds,
nothing is added.
*/
static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) {
    if (phrase_types != component) {
        // The phrase has other possible types; only record it when this one is among them
        if ((phrase_types & component) != 0) {
            feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
        }
        return;
    }

    // This component is the only type the phrase can have
    log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types);
    feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
    feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
}
|
|
|
|
/*
add_ngram_features
------------------

Generates character n-grams of size n for a string and adds one feature per
n-gram to the feature array.

Parameters:

cstring_array *features: the feature array to append to
char *feature_prefix: optional leading feature component (e.g. "word");
                      when NULL the features start with "ngrams"
cstring_array *ngrams: scratch array, cleared and refilled by this call
char *str: the NUL-terminated string to take n-grams from
size_t n: the n-gram size; must be non-zero and smaller than strlen(str)
size_t prefix_len: number of leading bytes of str to skip (0 if none)
size_t suffix_len: number of trailing bytes of str to skip (0 if none)

The n-grams are produced by add_ngrams() over the part of str that lies
between the skipped prefix and suffix. The last two arguments passed to
add_ngrams() are true when no prefix / no suffix was skipped, respectively.

Returns false, without adding features, if an argument is NULL, n is out of
range, the prefix and suffix together are longer than the string, or
add_ngrams() fails. Returns true otherwise.
*/
static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) {
    if (features == NULL || ngrams == NULL || str == NULL) return false;

    size_t len = strlen(str);

    // n >= len also rejects the empty string, where len - 1 would wrap around
    if (n == 0 || n >= len) return false;

    // The skipped prefix and suffix must fit inside the string, otherwise
    // the remaining length computed below would underflow
    if (prefix_len > len || suffix_len > len - prefix_len) return false;

    char ngram_num_chars[INT64_MAX_STRING_SIZE];
    snprintf(ngram_num_chars, sizeof(ngram_num_chars), "%zu", n);

    bool known_prefix = prefix_len > 0;
    bool known_suffix = suffix_len > 0;

    cstring_array_clear(ngrams);
    if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) {
        return false;
    }

    uint32_t idx;
    char *ngram;

    if (feature_prefix != NULL) {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram);
        })
    } else {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram);
        })
    }

    return true;
}
|
|
|
|
/*
|
|
address_parser_features
|
|
-----------------------
|
|
|
|
This is a feature function similar to those found in MEMM and CRF models.
|
|
|
|
Follows the signature of a tagger_feature_function so it can be called
|
|
as a function pointer by the averaged perceptron or CRF model.
|
|
|
|
Parameters:
|
|
|
|
address_parser_t *self: a pointer to the address_parser struct, which contains
|
|
word frequencies and perhaps other useful corpus-wide statistics.
|
|
|
|
address_parser_context_t *context: The context struct containing:
|
|
- phrase dictionary memberships for all the tokens
|
|
- country (if knkown)
|
|
- language (if known)
|
|
- features array
|
|
|
|
tokenized_string_t *tokenized: the sequence of tokens for parsing
|
|
uint32_t i: the current token index
|
|
char *prev: the predicted tag at index i - 1
|
|
char *prev2: the predicted tag at index i - 2
|
|
|
|
*/
|
|
|
|
bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
|
|
if (self == NULL || ctx == NULL) return false;
|
|
|
|
address_parser_t *parser = (address_parser_t *)self;
|
|
address_parser_context_t *context = (address_parser_context_t *)ctx;
|
|
|
|
cstring_array *features = context->features;
|
|
cstring_array *prev_tag_features = context->prev_tag_features;
|
|
cstring_array *prev2_tag_features = context->prev2_tag_features;
|
|
char *language = context->language;
|
|
char *country = context->country;
|
|
|
|
phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
|
|
int64_array *address_phrase_memberships = context->address_phrase_memberships;
|
|
phrase_array *component_phrases = context->component_phrases;
|
|
int64_array *component_phrase_memberships = context->component_phrase_memberships;
|
|
phrase_array *postal_code_phrases = context->postal_code_phrases;
|
|
int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;
|
|
cstring_array *normalized = context->normalized;
|
|
|
|
uint32_array *separators = context->separators;
|
|
|
|
cstring_array_clear(features);
|
|
cstring_array_clear(prev_tag_features);
|
|
cstring_array_clear(prev2_tag_features);
|
|
|
|
token_array *tokens = tokenized->tokens;
|
|
|
|
token_t token = tokens->a[idx];
|
|
|
|
ssize_t last_index = (ssize_t)idx - 1;
|
|
ssize_t next_index = (ssize_t)idx + 1;
|
|
|
|
char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
|
|
|
|
char *word = cstring_array_get_string(normalized, idx);
|
|
if (word == NULL) {
|
|
log_error("got NULL word at %d\n", idx);
|
|
return false;
|
|
}
|
|
|
|
size_t word_len = strlen(word);
|
|
|
|
log_debug("word=%s\n", word);
|
|
|
|
phrase_t phrase = NULL_PHRASE;
|
|
phrase_t component_phrase = NULL_PHRASE;
|
|
|
|
char *phrase_string = NULL;
|
|
char *component_phrase_string = NULL;
|
|
|
|
int64_t address_phrase_index = address_phrase_memberships->a[idx];
|
|
int64_t component_phrase_index = component_phrase_memberships->a[idx];
|
|
|
|
if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
|
|
phrase = address_dictionary_phrases->a[address_phrase_index];
|
|
}
|
|
|
|
if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
|
|
component_phrase = component_phrases->a[component_phrase_index];
|
|
}
|
|
|
|
char_array *phrase_tokens = context->phrase;
|
|
char_array *component_phrase_tokens = context->component_phrase;
|
|
|
|
uint32_t expansion_index;
|
|
address_expansion_value_t *expansion_value;
|
|
|
|
bool add_word_feature = true;
|
|
|
|
size_t num_tokens = tokenized->tokens->n;
|
|
|
|
// Address dictionary phrases
|
|
if (phrase.len > 0 && phrase.len >= component_phrase.len) {
|
|
log_debug("phrase\n");
|
|
|
|
last_index = (ssize_t)phrase.start - 1;
|
|
next_index = (ssize_t)phrase.start + phrase.len;
|
|
|
|
if(is_valid_dictionary_phrase(phrase)) {
|
|
uint32_t expansion_index = phrase.data;
|
|
address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);
|
|
|
|
if (expansion_value == NULL) {
|
|
log_warn("expansion_value is NULL for index %u\n", expansion_index);
|
|
return false;
|
|
}
|
|
uint32_t address_phrase_types = expansion_value->components;
|
|
|
|
phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase);
|
|
|
|
add_word_feature = false;
|
|
log_debug("phrase_string=%s\n", phrase_string);
|
|
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
|
|
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
|
|
}
|
|
}
|
|
|
|
|
|
address_parser_types_t types;
|
|
|
|
// Component phrases
|
|
if (component_phrase.len > 0 && component_phrase.len >= phrase.len) {
|
|
component_phrase = component_phrases->a[component_phrase_index];
|
|
|
|
component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, component_phrase);
|
|
|
|
uint32_t component_phrase_index = component_phrase.data;
|
|
if (component_phrase_index > parser->phrase_types->n) {
|
|
log_error("Invalid component_phrase_index: %u (parser->phrase_types->n=%zu)\n", component_phrase_index, parser->phrase_types->n);
|
|
return false;
|
|
}
|
|
|
|
types = parser->phrase_types->a[component_phrase_index];
|
|
|
|
uint32_t component_phrase_types = types.components;
|
|
uint32_t most_common = types.most_common;
|
|
|
|
if (last_index >= (ssize_t)component_phrase.start - 1) {
|
|
last_index = (ssize_t)component_phrase.start - 1;
|
|
}
|
|
|
|
if (next_index < (ssize_t)component_phrase.start + component_phrase.len) {
|
|
next_index = (ssize_t)component_phrase.start + component_phrase.len;
|
|
}
|
|
|
|
if (component_phrase_string != NULL && component_phrase_types > 0) {
|
|
feature_array_add(features, 2, "phrase", component_phrase_string);
|
|
add_word_feature = false;
|
|
}
|
|
|
|
if (component_phrase_types > 0) {
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string);
|
|
add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string);
|
|
}
|
|
|
|
if (component_phrase_types != most_common) {
|
|
if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
|
|
feature_array_add(features, 2, "commonly city", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
|
|
feature_array_add(features, 2, "commonly country", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
|
|
feature_array_add(features, 2, "commonly suburb", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
|
|
feature_array_add(features, 2, "commonly city_district", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
|
|
feature_array_add(features, 2, "commonly state", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
|
|
feature_array_add(features, 2, "commonly country_region", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
|
|
feature_array_add(features, 2, "commonly state_district", component_phrase_string);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) {
|
|
feature_array_add(features, 2, "commonly island", component_phrase_string);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool possible_postal_code = false;
|
|
bool postal_code_have_admin = false;
|
|
int64_t postal_code_phrase_index = postal_code_phrase_memberships->a[idx];
|
|
phrase_t postal_code_phrase = NULL_PHRASE;
|
|
|
|
if (postal_code_phrase_index != NULL_PHRASE_MEMBERSHIP) {
|
|
postal_code_phrase = postal_code_phrases->a[postal_code_phrase_index];
|
|
|
|
uint32_t postal_code_id = postal_code_phrase.data;
|
|
|
|
possible_postal_code = true;
|
|
|
|
if (last_index >= (ssize_t)postal_code_phrase.start - 1) {
|
|
last_index = (ssize_t)postal_code_phrase.start - 1;
|
|
}
|
|
|
|
if (next_index < (ssize_t)postal_code_phrase.start + postal_code_phrase.len) {
|
|
next_index = (ssize_t)postal_code_phrase.start + postal_code_phrase.len;
|
|
}
|
|
|
|
uint32_t admin_id;
|
|
uint64_t postal_code_context;
|
|
|
|
khiter_t k;
|
|
|
|
if (last_index >= 0) {
|
|
int64_t last_component_phrase_index = component_phrase_memberships->a[last_index];
|
|
if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
|
|
phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index];
|
|
admin_id = last_component_phrase.data;
|
|
|
|
if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
|
|
postal_code_have_admin = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!postal_code_have_admin && next_index < num_tokens) {
|
|
int64_t next_component_phrase_index = component_phrase_memberships->a[next_index];
|
|
if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
|
|
phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index];
|
|
admin_id = next_component_phrase.data;
|
|
if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
|
|
postal_code_have_admin = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if (possible_postal_code) {
|
|
if (postal_code_have_admin) {
|
|
feature_array_add(features, 1, "postcode have context");
|
|
feature_array_add(features, 2, "postcode have context", word);
|
|
} else {
|
|
feature_array_add(features, 2, "postcode no context", word);
|
|
}
|
|
}
|
|
|
|
uint32_t word_freq = word_vocab_frequency(parser, word);
|
|
|
|
bool is_word = is_word_token(token.type);
|
|
|
|
bool is_unknown_word = false;
|
|
bool is_unknown = false;
|
|
|
|
bool known_prefix = false;
|
|
bool known_suffix = false;
|
|
|
|
size_t prefix_len = 0;
|
|
size_t suffix_len = 0;
|
|
|
|
char *prefix = NULL;
|
|
char *suffix = NULL;
|
|
|
|
if (add_word_feature) {
|
|
// Bias unit, acts as an intercept
|
|
feature_array_add(features, 1, "bias");
|
|
|
|
phrase_t prefix_phrase = context->prefix_phrases->a[idx];
|
|
phrase_t suffix_phrase = context->suffix_phrases->a[idx];
|
|
|
|
// Prefixes like hinter, etc.
|
|
if (prefix_phrase.len > 0) {
|
|
expansion_index = prefix_phrase.data;
|
|
expansion_value = address_dictionary_get_expansions(expansion_index);
|
|
|
|
// Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
|
|
if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
|
|
known_prefix = true;
|
|
char_array_clear(phrase_tokens);
|
|
prefix_len = prefix_phrase.len;
|
|
char_array_add_len(phrase_tokens, word_pre_norm, prefix_len);
|
|
prefix = char_array_get_string(phrase_tokens);
|
|
log_debug("got prefix: %s\n", prefix);
|
|
feature_array_add(features, 2, "prefix", prefix);
|
|
}
|
|
}
|
|
|
|
// Suffixes like straße, etc.
|
|
if (suffix_phrase.len > 0) {
|
|
expansion_index = suffix_phrase.data;
|
|
expansion_value = address_dictionary_get_expansions(expansion_index);
|
|
|
|
if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
|
|
known_suffix = true;
|
|
char_array_clear(context->suffix_phrase);
|
|
suffix_len = suffix_phrase.len;
|
|
size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
|
|
size_t suffix_offset = word_pre_norm_len - suffix_len;
|
|
char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
|
|
suffix = char_array_get_string(context->suffix_phrase);
|
|
log_debug("got suffix: %s\n", suffix);
|
|
feature_array_add(features, 2, "suffix", suffix);
|
|
}
|
|
}
|
|
|
|
bool is_hyphenated = false;
|
|
|
|
// For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
|
|
if (word_freq <= parser->options.rare_word_threshold && is_word) {
|
|
log_debug("rare word: %s\n", word);
|
|
bool ngrams_added = false;
|
|
size_t hyphenated_word_offset = 0;
|
|
bool first_sub_token = true;
|
|
bool last_sub_token = true;
|
|
|
|
ssize_t next_hyphen_index;
|
|
|
|
token_array_clear(context->sub_tokens);
|
|
|
|
do {
|
|
next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
|
|
char *sub_word = word;
|
|
size_t sub_word_len = word_len;
|
|
|
|
if (next_hyphen_index >= 0) {
|
|
is_hyphenated = true;
|
|
char_array_clear(context->sub_token);
|
|
char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index);
|
|
token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type});
|
|
sub_word = char_array_get_string(context->sub_token);
|
|
sub_word_len = context->sub_token->n;
|
|
last_sub_token = false;
|
|
} else if (is_hyphenated) {
|
|
char_array_clear(context->sub_token);
|
|
char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset);
|
|
sub_word = char_array_get_string(context->sub_token);
|
|
sub_word_len = context->sub_token->n;
|
|
last_sub_token = true;
|
|
}
|
|
|
|
bool add_prefix = first_sub_token && prefix_len < sub_word_len;
|
|
bool add_suffix = last_sub_token && suffix_len < sub_word_len;
|
|
|
|
uint32_t sub_word_freq = word_freq;
|
|
if (is_hyphenated) {
|
|
sub_word_freq = word_vocab_frequency(parser, sub_word);
|
|
if (sub_word_freq > 0) {
|
|
feature_array_add(features, 2, "sub_word", sub_word);
|
|
}
|
|
|
|
}
|
|
|
|
if (sub_word_freq <= parser->options.rare_word_threshold) {
|
|
// prefix/suffix features from 3-6 characters
|
|
for (size_t ng = 3; ng <= 6; ng++) {
|
|
ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
|
|
}
|
|
}
|
|
|
|
hyphenated_word_offset += next_hyphen_index + 1;
|
|
first_sub_token = false;
|
|
|
|
log_debug("next_hyphen_index=%zd\n", next_hyphen_index);
|
|
} while(next_hyphen_index >= 0);
|
|
|
|
}
|
|
|
|
if (word_freq > 0) {
|
|
// The individual word
|
|
feature_array_add(features, 2, "word", word);
|
|
} else {
|
|
log_debug("word not in vocab: %s\n", word);
|
|
|
|
is_unknown = true;
|
|
word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
|
|
|
|
if (is_word_token(token.type)) {
|
|
is_unknown_word = true;
|
|
}
|
|
}
|
|
|
|
if (idx == 0 && !is_unknown_word) {
|
|
feature_array_add(features, 2, "first word", word);
|
|
//feature_array_add(features, 3, "first word+next word", word, next_word);
|
|
}
|
|
|
|
} else if (component_phrase_string != NULL) {
|
|
word = component_phrase_string;
|
|
} else if (phrase_string != NULL) {
|
|
word = phrase_string;
|
|
}
|
|
|
|
if (last_index == idx - 1) {
|
|
// Previous tag and current word
|
|
feature_array_add(prev_tag_features, 2, "word", word);
|
|
|
|
// Previous two tags and current word
|
|
if (parser->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
|
|
// In the CRF this is accounted for by the transition weights
|
|
// so only need it for the averaged perceptron
|
|
feature_array_add(prev_tag_features, 1, "trans");
|
|
|
|
// Averaged perceptron uses two tags of history, CRF uses one
|
|
feature_array_add(prev2_tag_features, 2, "word", word);
|
|
feature_array_add(prev2_tag_features, 1, "trans");
|
|
}
|
|
}
|
|
|
|
if (last_index >= 0) {
|
|
address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index, false);
|
|
char *prev_word = prev_word_or_phrase.str;
|
|
|
|
if (is_plain_word_phrase_type(prev_word_or_phrase.type)) {
|
|
uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
|
|
token_t prev_token = tokenized->tokens->a[last_index];
|
|
bool prev_token_numeric = is_numeric_token(prev_token.type);
|
|
if (prev_word_freq == 0) {
|
|
prev_word = !prev_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
|
|
}
|
|
}
|
|
|
|
// Previous word
|
|
feature_array_add(features, 2, "prev word", prev_word);
|
|
|
|
|
|
if (last_index == idx - 1) {
|
|
feature_array_add(prev_tag_features, 2, "prev word", prev_word);
|
|
}
|
|
|
|
// Previous word and current word
|
|
feature_array_add(features, 3, "prev word+word", prev_word, word);
|
|
}
|
|
|
|
if (next_index < num_tokens) {
|
|
address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index, false);
|
|
char *next_word = next_word_or_phrase.str;
|
|
size_t next_word_len = 1;
|
|
|
|
if (is_plain_word_phrase_type(next_word_or_phrase.type)) {
|
|
uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
|
|
token_t next_token = tokenized->tokens->a[next_index];
|
|
bool next_token_numeric = is_numeric_token(next_token.type);
|
|
if (next_word_freq == 0) {
|
|
next_word = !next_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
|
|
}
|
|
} else {
|
|
next_word_len = next_word_or_phrase.phrase.len;
|
|
}
|
|
|
|
// Next word e.g. if the current word is unknown and the next word is "street"
|
|
feature_array_add(features, 2, "next word", next_word);
|
|
|
|
// Current word and next word
|
|
feature_array_add(features, 3, "word+next word", word, next_word);
|
|
|
|
// Prev tag, current word and next word
|
|
//feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
|
|
|
|
// Venue names ("house") are almost always at the beginning of the string
|
|
// and often contain out-of-vocabulary words. Consider a case like "Barboncino 781 Franklin Ave".
|
|
// The features available to classify "Barboncino" are going to be unknown word featuers (n-grams),
|
|
// next word features (unknown word where next word=DDD is just as likely to be a street)
|
|
// and no previous tags of history since it's the first word. If the parser predicts the
|
|
// first token correctly, it's going to have an easier time getting the rest of the sequence
|
|
// correct (unknown word + prev tag was "house" is probably still part of the venue, etc.) so
|
|
// we're only really worried about that first token. This group of features, called
|
|
// "long-context features" finds the relative position of the next numeric token as well
|
|
// as the next street-level phrase (words like "ave", "street", etc.) in the right context.
|
|
// In an English or French address, if we know there's a number somewhere to our right,
|
|
// and that a word like "Ave" appears to the right of the number, it's very likely that
|
|
// the current unknown word is part of a venue name. Similarly, if a venue-word like "Pizzeria"
|
|
// occurred prior to the number, that would also be strong evidence that we're in a venue name.
|
|
// Conversely, if we're in a Spanish address and a word like "Calle" comes before the first number
|
|
// to our right, it's also likely that we're in a venue name, but we'd need to note that the
|
|
// phrase we saw was "Calle" and not an English thoroughfare type.
|
|
|
|
if (idx == 0 && add_word_feature && is_unknown_word) {
|
|
bool seen_number = false;
|
|
bool seen_phrase = false;
|
|
for (uint32_t right_idx = idx + 1; right_idx < num_tokens; right_idx++) {
|
|
token_t right_token = tokens->a[right_idx];
|
|
|
|
/* Check */
|
|
address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, right_idx, true);
|
|
address_parser_phrase_type_t right_context_phrase_type = right_context_word_or_phrase.type;
|
|
if (right_context_phrase_type != ADDRESS_PARSER_NULL_PHRASE &&
|
|
right_context_phrase_type != ADDRESS_PARSER_DICTIONARY_PHRASE &&
|
|
right_context_phrase_type != ADDRESS_PARSER_SUFFIX_PHRASE &&
|
|
right_context_phrase_type != ADDRESS_PARSER_PREFIX_PHRASE) {
|
|
continue;
|
|
}
|
|
char *right_context_word = right_context_word_or_phrase.str;
|
|
phrase_t right_context_phrase = right_context_word_or_phrase.phrase;
|
|
|
|
phrase_t suffix_phrase = context->suffix_phrases->a[right_idx];
|
|
|
|
uint32_t right_context_expansion_index;
|
|
address_expansion_value_t *right_context_expansion_value;
|
|
|
|
uint32_t right_context_components = 0;
|
|
bool right_context_name = false;
|
|
bool right_context_street = false;
|
|
|
|
if (right_context_phrase.len > 0) {
|
|
right_context_expansion_index = right_context_phrase.data;
|
|
right_context_expansion_value = address_dictionary_get_expansions(right_context_expansion_index);
|
|
right_context_components = right_context_expansion_value->components;
|
|
|
|
char *right_affix_type = NULL;
|
|
char *right_context_affix = NULL;
|
|
|
|
char *relation_to_number = seen_number ? "after number" : "before number";
|
|
|
|
seen_phrase = true;
|
|
|
|
char *right_context_word_pre_norm;
|
|
|
|
if (right_context_phrase_type == ADDRESS_PARSER_SUFFIX_PHRASE) {
|
|
right_affix_type = "suffix";
|
|
right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
|
|
right_context_affix = phrase_suffix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
|
|
} else if (right_context_word_or_phrase.type == ADDRESS_PARSER_PREFIX_PHRASE) {
|
|
right_affix_type = "prefix";
|
|
right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
|
|
right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
|
|
}
|
|
|
|
if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) {
|
|
feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number);
|
|
feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word);
|
|
if (right_context_affix != NULL && right_affix_type != NULL) {
|
|
feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix);
|
|
}
|
|
break;
|
|
} else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) {
|
|
feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number);
|
|
feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word);
|
|
if (right_context_affix != NULL && right_affix_type != NULL) {
|
|
feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix);
|
|
}
|
|
} else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) {
|
|
if (seen_number) {
|
|
feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right");
|
|
feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word);
|
|
if (right_context_affix != NULL && right_affix_type != NULL) {
|
|
feature_array_add(features, 3, "first word unknown+number+ambiguous affix right", right_affix_type, right_context_affix);
|
|
}
|
|
break;
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (seen_number) break;
|
|
}
|
|
|
|
if (is_numeric_token(right_token.type)) {
|
|
seen_number = true;
|
|
char *relation_to_phrase = seen_phrase ? "after phrase" : "before phrase";
|
|
feature_array_add(features, 2, "first word unknown+number right", relation_to_phrase);
|
|
feature_array_add(features, 3, "first word unknown+number right", relation_to_phrase, right_context_word);
|
|
if (seen_phrase) break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
/*
address_parser_predict
----------------------

Runs sequence prediction over a tokenized string with the model that the
parser was loaded with.

Parameters:

address_parser_t *self: the parser; its model_type selects the tagger
address_parser_context_t *context: supplies the feature arrays handed to the tagger
cstring_array *token_labels: passed through to the tagger (presumably
                             receives one label per token)
tagger_feature_function feature_function: the feature function the tagger calls
tokenized_string_t *tokenized_str: the tokens to label

The averaged perceptron tagger is given features, prev_tag_features and
prev2_tag_features; the CRF tagger is given features and prev_tag_features.

Returns the result of the selected tagger, or false (after logging an error)
if the parser's model type is neither of the two known types.
*/
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) {
    bool print_features = self->options.print_features;

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        return averaged_perceptron_tagger_predict(self->model.ap, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, feature_function, tokenized_str, print_features);
    }

    if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        return crf_tagger_predict(self->model.crf, self, context, context->features, context->prev_tag_features, token_labels, feature_function, tokenized_str, print_features);
    }

    log_error("Parser has unknown model type\n");
    return false;
}
|
|
|
|
/*
address_parser_response_new
---------------------------

Allocates an empty parser response.

The struct is zero-initialized, so every field starts out as 0/NULL and a
response that is released before being completely filled in does not carry
indeterminate pointers or counts.

Returns the new response, or NULL if the allocation fails. The caller owns
the returned memory.
*/
libpostal_address_parser_response_t *address_parser_response_new(void) {
    libpostal_address_parser_response_t *response = calloc(1, sizeof(*response));
    return response;
}
|
|
|
|
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country) {
|
|
if (address == NULL) return NULL;
|
|
|
|
address_parser_t *parser = get_address_parser();
|
|
if (parser == NULL || parser->context == NULL) {
|
|
log_error("parser is not setup, call libpostal_setup_address_parser()\n");
|
|
return NULL;
|
|
}
|
|
|
|
address_parser_context_t *context = parser->context;
|
|
|
|
char *normalized = address_parser_normalize_string(address);
|
|
bool is_normalized = normalized != NULL;
|
|
if (!is_normalized) {
|
|
normalized = address;
|
|
}
|
|
|
|
token_array *tokens = tokenize(normalized);
|
|
|
|
tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);
|
|
|
|
for (size_t i = 0; i < tokens->n; i++) {
|
|
token_t token = tokens->a[i];
|
|
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
|
|
uint32_array_pop(context->separators);
|
|
uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
|
|
continue;
|
|
} else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
|
|
continue;
|
|
}
|
|
|
|
tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
|
|
uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
|
|
}
|
|
|
|
// This parser was trained without knowing language/country.
|
|
// If at some point we build country-specific/language-specific
|
|
// parsers, these parameters could be used to select a model.
|
|
// The language parameter does technically control which dictionaries
|
|
// are searched at the street level. It's possible with e.g. a phrase
|
|
// like "de", which can be either the German country code or a stopword
|
|
// in Spanish, that even in the case where it's being used as a country code,
|
|
// it's possible that both the street-level and admin-level phrase features
|
|
// may be working together as a kind of intercept. Depriving the model
|
|
// of the street-level phrase features by passing in a known language
|
|
// may change the decision threshold so explicitly ignore these
|
|
// options until there's a use for them (country-specific or language-specific
|
|
// parser models).
|
|
|
|
language = NULL;
|
|
country = NULL;
|
|
address_parser_context_fill(context, parser, tokenized_str, language, country);
|
|
|
|
libpostal_address_parser_response_t *response = NULL;
|
|
|
|
// If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether
|
|
phrase_t only_phrase = NULL_PHRASE;
|
|
token_t token, prev_token;
|
|
bool is_postal = false;
|
|
if (context->component_phrases->n == 1) {
|
|
only_phrase = context->component_phrases->a[0];
|
|
} else if (context->postal_code_phrases->n == 1) {
|
|
only_phrase = context->postal_code_phrases->a[0];
|
|
is_postal = true;
|
|
}
|
|
|
|
if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n && only_phrase.len > 0) {
|
|
uint32_t most_common = 0;
|
|
|
|
char *label = NULL;
|
|
|
|
if (!is_postal) {
|
|
uint32_t component_phrase_index = only_phrase.data;
|
|
address_parser_types_t types = parser->phrase_types->a[component_phrase_index];
|
|
most_common = types.most_common;
|
|
|
|
if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_CITY);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_STATE);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_COUNTRY);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_SUBURB);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT);
|
|
} else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) {
|
|
label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION);
|
|
}
|
|
} else {
|
|
label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE);
|
|
}
|
|
|
|
response = address_parser_response_new();
|
|
|
|
// Implicit: if most_common is not one of the above, ignore and parse regularly
|
|
if (label != NULL) {
|
|
char **single_label = malloc(sizeof(char *));
|
|
single_label[0] = label;
|
|
char **single_component = malloc(sizeof(char *));
|
|
single_component[0] = strdup(normalized);
|
|
|
|
response->num_components = 1;
|
|
response->labels = single_label;
|
|
response->components = single_component;
|
|
|
|
token_array_destroy(tokens);
|
|
tokenized_string_destroy(tokenized_str);
|
|
|
|
if (is_normalized) {
|
|
free(normalized);
|
|
}
|
|
return response;
|
|
}
|
|
}
|
|
|
|
cstring_array *token_labels = cstring_array_new_size(tokens->n);
|
|
|
|
char *prev_label = NULL;
|
|
|
|
bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, tokenized_str);
|
|
|
|
if (prediction_success) {
|
|
response = address_parser_response_new();
|
|
|
|
size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
|
|
|
|
cstring_array *labels = cstring_array_new_size(num_strings);
|
|
cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);
|
|
|
|
token_t *tokens = tokenized_str->tokens->a;
|
|
|
|
for (size_t i = 0; i < num_strings; i++) {
|
|
char *str = tokenized_string_get_token(tokenized_str, i);
|
|
|
|
char *label = cstring_array_get_string(token_labels, i);
|
|
|
|
if (prev_label == NULL || strcmp(label, prev_label) != 0) {
|
|
cstring_array_add_string(labels, label);
|
|
cstring_array_start_token(components);
|
|
|
|
}
|
|
|
|
if (prev_label != NULL && strcmp(label, prev_label) == 0) {
|
|
token = tokens[i];
|
|
prev_token = tokens[i - 1];
|
|
if (token.offset > prev_token.offset + prev_token.len) {
|
|
cstring_array_cat_string(components, " ");
|
|
}
|
|
cstring_array_cat_string(components, str);
|
|
} else {
|
|
cstring_array_append_string(components, str);
|
|
cstring_array_terminate(components);
|
|
}
|
|
|
|
prev_label = label;
|
|
}
|
|
response->num_components = cstring_array_num_strings(components);
|
|
response->components = cstring_array_to_strings(components);
|
|
response->labels = cstring_array_to_strings(labels);
|
|
|
|
} else {
|
|
log_error("Error in prediction\n");
|
|
}
|
|
|
|
token_array_destroy(tokens);
|
|
tokenized_string_destroy(tokenized_str);
|
|
cstring_array_destroy(token_labels);
|
|
|
|
if (is_normalized) {
|
|
free(normalized);
|
|
}
|
|
|
|
return response;
|
|
}
|
|
|
|
|
|
|
|
bool address_parser_module_setup(char *dir) {
|
|
if (parser == NULL) {
|
|
return address_parser_load(dir);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void address_parser_module_teardown(void) {
|
|
if (parser != NULL) {
|
|
address_parser_destroy(parser);
|
|
}
|
|
parser = NULL;
|
|
}
|