From 42d169feeebc573b44f44b01272c95698201de32 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Jan 2016 03:20:55 -0500 Subject: [PATCH] [api] Libpostal expand API will now detect language automatically using a high accuracy language classifier trained on OSM streets/addresses/toponyms. Hooray batch geocoding! --- src/libpostal.c | 27 +++++++++++++++++++++++++++ src/libpostal.h | 3 +++ 2 files changed, 30 insertions(+) diff --git a/src/libpostal.c b/src/libpostal.c index 1d38a738..bb408c0a 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -11,6 +11,7 @@ #include "collections.h" #include "constants.h" #include "geodb.h" +#include "language_classifier.h" #include "numex.h" #include "normalize.h" #include "scanner.h" @@ -811,6 +812,16 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) { size_t len = strlen(input); + language_classifier_response_t *lang_response = NULL; + + if (options.num_languages == 0) { + lang_response = classify_languages(input); + if (lang_response != NULL) { + options.num_languages = lang_response->num_languages; + options.languages = lang_response->languages; + } + } + string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); cstring_array *strings = cstring_array_new_size(len * 2); @@ -860,6 +871,10 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) { kh_destroy(str_set, unique_strings); + if (lang_response != NULL) { + language_classifier_response_destroy(lang_response); + } + char_array_destroy(temp_string); string_tree_destroy(tree); @@ -930,6 +945,14 @@ bool libpostal_setup(void) { return true; } +bool libpostal_setup_language_classifier(void) { + if (!language_classifier_module_setup(NULL)) { + log_error("Error loading language classifier\n"); + return false; + } + return true; +} + bool libpostal_setup_parser(void) { if (!geodb_module_setup(NULL)) { log_error("Error loading geodb module\n"); @@ -952,6 +975,10 @@ void libpostal_teardown(void) { address_dictionary_module_teardown(); } +void libpostal_teardown_language_classifier(void) { + language_classifier_module_teardown(); +} + void libpostal_teardown_parser(void) { geodb_module_teardown(); address_parser_module_teardown(); diff --git a/src/libpostal.h b/src/libpostal.h index 1f672186..f48838a1 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -123,6 +123,9 @@ void libpostal_teardown(void); bool libpostal_setup_parser(void); void libpostal_teardown_parser(void); +bool libpostal_setup_language_classifier(void); +void libpostal_teardown_language_classifier(void); + #ifdef __cplusplus } #endif