[api] Libpostal expand API will now detect language automatically using a high-accuracy language classifier trained on OSM streets/addresses/toponyms. Hooray batch geocoding!
This commit is contained in:
@@ -11,6 +11,7 @@
|
|||||||
#include "collections.h"
|
#include "collections.h"
|
||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
#include "geodb.h"
|
#include "geodb.h"
|
||||||
|
#include "language_classifier.h"
|
||||||
#include "numex.h"
|
#include "numex.h"
|
||||||
#include "normalize.h"
|
#include "normalize.h"
|
||||||
#include "scanner.h"
|
#include "scanner.h"
|
||||||
@@ -811,6 +812,16 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
|||||||
|
|
||||||
size_t len = strlen(input);
|
size_t len = strlen(input);
|
||||||
|
|
||||||
|
language_classifier_response_t *lang_response = NULL;
|
||||||
|
|
||||||
|
if (options.num_languages == 0) {
|
||||||
|
lang_response = classify_languages(input);
|
||||||
|
if (lang_response != NULL) {
|
||||||
|
options.num_languages = lang_response->num_languages;
|
||||||
|
options.languages = lang_response->languages;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages);
|
string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages);
|
||||||
|
|
||||||
cstring_array *strings = cstring_array_new_size(len * 2);
|
cstring_array *strings = cstring_array_new_size(len * 2);
|
||||||
@@ -860,6 +871,10 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
|||||||
|
|
||||||
kh_destroy(str_set, unique_strings);
|
kh_destroy(str_set, unique_strings);
|
||||||
|
|
||||||
|
if (lang_response != NULL) {
|
||||||
|
language_classifier_response_destroy(lang_response);
|
||||||
|
}
|
||||||
|
|
||||||
char_array_destroy(temp_string);
|
char_array_destroy(temp_string);
|
||||||
string_tree_destroy(tree);
|
string_tree_destroy(tree);
|
||||||
|
|
||||||
@@ -930,6 +945,14 @@ bool libpostal_setup(void) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Set up the language classifier module.
 *
 * Calls language_classifier_module_setup(NULL). If that call reports
 * failure, logs "Error loading language classifier" and returns false;
 * otherwise returns true.
 *
 * NOTE(review): the NULL argument presumably selects a default model
 * location -- confirm against language_classifier_module_setup().
 */
bool libpostal_setup_language_classifier(void) {
|
||||||
|
if (!language_classifier_module_setup(NULL)) {
|
||||||
|
log_error("Error loading language classifier\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool libpostal_setup_parser(void) {
|
bool libpostal_setup_parser(void) {
|
||||||
if (!geodb_module_setup(NULL)) {
|
if (!geodb_module_setup(NULL)) {
|
||||||
log_error("Error loading geodb module\n");
|
log_error("Error loading geodb module\n");
|
||||||
@@ -952,6 +975,10 @@ void libpostal_teardown(void) {
|
|||||||
address_dictionary_module_teardown();
|
address_dictionary_module_teardown();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Tear down the language classifier module by delegating to
 * language_classifier_module_teardown(). Takes no arguments and returns
 * nothing.
 *
 * NOTE(review): presumably the counterpart of
 * libpostal_setup_language_classifier() -- confirm whether it is safe to
 * call when setup was never run or failed.
 */
void libpostal_teardown_language_classifier(void) {
|
||||||
|
language_classifier_module_teardown();
|
||||||
|
}
|
||||||
|
|
||||||
void libpostal_teardown_parser(void) {
|
void libpostal_teardown_parser(void) {
|
||||||
geodb_module_teardown();
|
geodb_module_teardown();
|
||||||
address_parser_module_teardown();
|
address_parser_module_teardown();
|
||||||
|
|||||||
@@ -123,6 +123,9 @@ void libpostal_teardown(void);
|
|||||||
bool libpostal_setup_parser(void);
|
bool libpostal_setup_parser(void);
|
||||||
void libpostal_teardown_parser(void);
|
void libpostal_teardown_parser(void);
|
||||||
|
|
||||||
|
/* Set up the language classifier module. Returns true on success, false if the module failed to load. */
bool libpostal_setup_language_classifier(void);
|
||||||
|
/* Tear down the language classifier module. */
void libpostal_teardown_language_classifier(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user