[api] Adding parse_address implementation to the libpostal API. GeoDB and address parser are now required. Stripping punctuation from the normalized output
This commit is contained in:
@@ -8,6 +8,7 @@
|
|||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
#include "file_utils.h"
|
#include "file_utils.h"
|
||||||
#include "geodb.h"
|
#include "geodb.h"
|
||||||
|
#include "libpostal.h"
|
||||||
#include "normalize.h"
|
#include "normalize.h"
|
||||||
#include "scanner.h"
|
#include "scanner.h"
|
||||||
#include "shuffle.h"
|
#include "shuffle.h"
|
||||||
@@ -16,6 +17,7 @@
|
|||||||
#include "linenoise/linenoise.h"
|
#include "linenoise/linenoise.h"
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
|
||||||
|
|
||||||
bool load_address_parser_dependencies(void) {
|
bool load_address_parser_dependencies(void) {
|
||||||
if (!address_dictionary_module_setup(NULL)) {
|
if (!address_dictionary_module_setup(NULL)) {
|
||||||
log_error("Could not load address dictionaries\n");
|
log_error("Could not load address dictionaries\n");
|
||||||
@@ -34,22 +36,6 @@ bool load_address_parser_dependencies(void) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
address_parser_response_t *parse_address(char *address, char *country, char *language) {
|
|
||||||
address_parser_context_t *context = address_parser_context_new();
|
|
||||||
address_parser_response_t *parsed = address_parser_parse(address, language, country, context);
|
|
||||||
|
|
||||||
if (parsed == NULL) {
|
|
||||||
log_error("Parser returned NULL\n");
|
|
||||||
address_parser_context_destroy(context);
|
|
||||||
address_parser_response_destroy(parsed);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
address_parser_context_destroy(context);
|
|
||||||
|
|
||||||
return parsed;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
char *address_parser_dir = "./ap_test";
|
char *address_parser_dir = "./ap_test";
|
||||||
char *history_file = "address_parser.history";
|
char *history_file = "address_parser.history";
|
||||||
@@ -58,12 +44,7 @@ int main(int argc, char **argv) {
|
|||||||
address_parser_dir = argv[1];
|
address_parser_dir = argv[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!load_address_parser_dependencies()) {
|
if (!libpostal_setup()) {
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!address_parser_load(address_parser_dir)) {
|
|
||||||
log_error("Error loading address parser\n");
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -123,8 +104,9 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
address_parser_response_t *parsed;
|
address_parser_response_t *parsed;
|
||||||
|
address_parser_options_t options = LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
|
||||||
|
|
||||||
if ((parsed = parse_address(input, country, language))) {
|
if ((parsed = parse_address(input, options))) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Result:\n\n");
|
printf("Result:\n\n");
|
||||||
printf("{\n");
|
printf("{\n");
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
|
||||||
#include "address_dictionary.h"
|
#include "address_dictionary.h"
|
||||||
|
#include "address_parser.h"
|
||||||
#include "collections.h"
|
#include "collections.h"
|
||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
#include "geodb.h"
|
#include "geodb.h"
|
||||||
@@ -41,6 +42,10 @@ inline bool is_numeric_token(uint16_t type) {
|
|||||||
return type == NUMERIC;
|
return type == NUMERIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool is_punctuation(uint16_t type) {
|
||||||
|
return type >= PERIOD && type < OTHER;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
inline uint64_t get_normalize_token_options(normalize_options_t options) {
|
inline uint64_t get_normalize_token_options(normalize_options_t options) {
|
||||||
uint64_t normalize_token_options = 0;
|
uint64_t normalize_token_options = 0;
|
||||||
@@ -119,6 +124,8 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
|
|
||||||
log_debug("tokenized, num tokens=%zu\n", tokens->n);
|
log_debug("tokenized, num tokens=%zu\n", tokens->n);
|
||||||
|
|
||||||
|
bool last_was_punctuation = false;
|
||||||
|
|
||||||
phrase_language_array *phrases = NULL;
|
phrase_language_array *phrases = NULL;
|
||||||
phrase_array *lang_phrases = NULL;
|
phrase_array *lang_phrases = NULL;
|
||||||
|
|
||||||
@@ -154,6 +161,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
|
phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
|
||||||
}
|
}
|
||||||
phrase_array_destroy(lang_phrases);
|
phrase_array_destroy(lang_phrases);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string_tree_t *tree = string_tree_new_size(len);
|
string_tree_t *tree = string_tree_new_size(len);
|
||||||
@@ -189,8 +197,17 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
end = phrase.start;
|
end = phrase.start;
|
||||||
|
|
||||||
for (int j = start; j < end; j++) {
|
for (int j = start; j < end; j++) {
|
||||||
token_t token = tokens->a[j];
|
token_t token = tokens->a[j];
|
||||||
|
if (is_punctuation(token.type)) {
|
||||||
|
last_was_punctuation = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (token.type != WHITESPACE) {
|
if (token.type != WHITESPACE) {
|
||||||
|
if (last_was_punctuation) {
|
||||||
|
string_tree_add_string(tree, " ");
|
||||||
|
string_tree_finalize_token(tree);
|
||||||
|
}
|
||||||
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
||||||
|
|
||||||
string_tree_add_string_len(tree, str + token.offset, token.len);
|
string_tree_add_string_len(tree, str + token.offset, token.len);
|
||||||
@@ -198,6 +215,8 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
log_debug("Adding space\n");
|
log_debug("Adding space\n");
|
||||||
string_tree_add_string(tree, " ");
|
string_tree_add_string(tree, " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
last_was_punctuation = false;
|
||||||
string_tree_finalize_token(tree);
|
string_tree_finalize_token(tree);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,7 +288,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
} else {
|
} else {
|
||||||
for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
|
for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
|
||||||
token = tokens->a[j];
|
token = tokens->a[j];
|
||||||
|
|
||||||
if (token.type != WHITESPACE) {
|
if (token.type != WHITESPACE) {
|
||||||
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
||||||
string_tree_add_string_len(tree, str + token.offset, token.len);
|
string_tree_add_string_len(tree, str + token.offset, token.len);
|
||||||
@@ -308,8 +327,17 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
|
|
||||||
|
|
||||||
for (int j = start; j < end; j++) {
|
for (int j = start; j < end; j++) {
|
||||||
token_t token = tokens->a[j];
|
token_t token = tokens->a[j];
|
||||||
|
if (is_punctuation(token.type)) {
|
||||||
|
last_was_punctuation = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (token.type != WHITESPACE) {
|
if (token.type != WHITESPACE) {
|
||||||
|
if (last_was_punctuation) {
|
||||||
|
string_tree_add_string(tree, " ");
|
||||||
|
string_tree_finalize_token(tree);
|
||||||
|
}
|
||||||
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
|
||||||
|
|
||||||
string_tree_add_string_len(tree, str + token.offset, token.len);
|
string_tree_add_string_len(tree, str + token.offset, token.len);
|
||||||
@@ -317,6 +345,8 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
|||||||
log_debug("Adding space\n");
|
log_debug("Adding space\n");
|
||||||
string_tree_add_string(tree, " ");
|
string_tree_add_string(tree, " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
last_was_punctuation = false;
|
||||||
string_tree_finalize_token(tree);
|
string_tree_finalize_token(tree);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -792,6 +822,22 @@ char **expand_address(char *input, normalize_options_t options, uint64_t *n) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse a free-text address with the loaded address parser.
 *
 * Creates a temporary parser context, runs address_parser_parse() with the
 * language and country taken from `options`, and destroys the context again
 * before returning.
 *
 * @param address  Address string handed to address_parser_parse().
 * @param options  Parser options; only `language` and `country` are read here.
 * @return The response produced by address_parser_parse(), or NULL when the
 *         context could not be created or the parser returned NULL.
 *         NOTE(review): a non-NULL result is handed to the caller, who
 *         presumably releases it with address_parser_response_destroy() --
 *         confirm against the public header.
 */
address_parser_response_t *parse_address(char *address, address_parser_options_t options) {
    address_parser_context_t *context = address_parser_context_new();
    if (context == NULL) {
        /* NOTE(review): assumes address_parser_context_new() signals failure with NULL. */
        log_error("Error creating address parser context\n");
        return NULL;
    }

    address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country, context);

    /* The context is only needed while parsing, so release it on every path. */
    address_parser_context_destroy(context);

    if (parsed == NULL) {
        /* No response object exists here, so there is nothing to destroy. */
        log_error("Parser returned NULL\n");
        return NULL;
    }

    return parsed;
}
|
||||||
|
|
||||||
bool libpostal_setup(void) {
|
bool libpostal_setup(void) {
|
||||||
if (!transliteration_module_setup(NULL)) {
|
if (!transliteration_module_setup(NULL)) {
|
||||||
log_error("Error loading transliteration module\n");
|
log_error("Error loading transliteration module\n");
|
||||||
@@ -808,6 +854,16 @@ bool libpostal_setup(void) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!geodb_module_setup(NULL)) {
|
||||||
|
log_error("Error loading geodb module\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!address_parser_module_setup(NULL)) {
|
||||||
|
log_error("Error loading address parser module\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -818,4 +874,8 @@ void libpostal_teardown(void) {
|
|||||||
numex_module_teardown();
|
numex_module_teardown();
|
||||||
|
|
||||||
address_dictionary_module_teardown();
|
address_dictionary_module_teardown();
|
||||||
|
|
||||||
|
geodb_module_teardown();
|
||||||
|
|
||||||
|
address_parser_module_teardown();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user