diff --git a/src/address_dictionary.h b/src/address_dictionary.h
index c174b002..ccd99b05 100644
--- a/src/address_dictionary.h
+++ b/src/address_dictionary.h
@@ -21,7 +21,8 @@
 
 #define ALL_LANGUAGES "all"
 
-#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_expansions" PATH_SEPARATOR "address_dictionary.dat"
+#define ADDRESS_DICTIONARY_DATA_FILE "address_dictionary.dat"
+#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR PATH_SEPARATOR ADDRESS_DICTIONARY_DATA_FILE
 
 #define NULL_CANONICAL_INDEX -1
 
diff --git a/src/file_utils.c b/src/file_utils.c
index 1747bbd8..4a320d83 100644
--- a/src/file_utils.c
+++ b/src/file_utils.c
@@ -36,6 +36,31 @@ bool is_relative_path(struct dirent *ent) {
     return strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0;
 }
 
+/*
+ * Join n path components with PATH_SEPARATOR into a newly allocated string.
+ *
+ * args supplies the n components (the callers in libpostal.c pass char *).
+ * Returns NULL if the scratch char_array cannot be allocated; callers
+ * free() the returned string.
+ */
+char *path_vjoin(int n, va_list args) {
+    char_array *path = char_array_new();
+    if (path == NULL) return NULL;
+    char_array_add_vjoined(path, PATH_SEPARATOR, true, n, args);
+    return char_array_to_string(path);
+}
+
+/*
+ * Variadic front end to path_vjoin(): joins the n arguments that follow n.
+ * Returns whatever path_vjoin() returns, including NULL on failure.
+ */
+char *path_join(int n, ...) {
+    va_list args;
+    va_start(args, n);
+    char *path = path_vjoin(n, args);
+    va_end(args);
+    return path;
+}
+
 inline uint64_t file_deserialize_uint64(unsigned char *buf) {
     return ((uint64_t)buf[0] << 56) |
            ((uint64_t)buf[1] << 48) |
diff --git a/src/file_utils.h b/src/file_utils.h
index 13fa1e39..b6648500 100644
--- a/src/file_utils.h
+++ b/src/file_utils.h
@@ -9,6 +9,7 @@
 #include
 
 #include "libpostal_config.h"
+#include "string_utils.h"
 
 #ifdef HAVE_DIRENT_H
 #include
@@ -55,6 +56,9 @@ char *file_getline(FILE * f);
 
 bool is_relative_path(struct dirent *ent);
 
+char *path_join(int n, ...);
+char *path_vjoin(int n, va_list args);
+
 uint64_t file_deserialize_uint64(unsigned char *buf);
 bool file_read_uint64(FILE *file, uint64_t *value);
 bool file_write_uint64(FILE *file, uint64_t value);
diff --git a/src/libpostal.c b/src/libpostal.c
index 978f9b70..06401146 100644
--- a/src/libpostal.c
+++ b/src/libpostal.c
@@ -1054,45 +1054,150 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
     return parsed;
 }
 
+/*
+ * Directory to report when a module fails to load: the caller-supplied path,
+ * or the compiled-in LIBPOSTAL_DATA_DIR when setup ran with a NULL path.
+ * Keeps NULL out of the "%s" arguments passed to log_error().
+ */
+static const char *setup_log_dir(const char *path) {
+    return path != NULL ? path : LIBPOSTAL_DATA_DIR;
+}
+
+/*
+ * Load the transliteration, numex and address dictionary modules.
+ *
+ * datadir: root data directory the three data file paths are built under,
+ *          or NULL to pass NULL to each module's setup (what
+ *          libpostal_setup() does).
+ *
+ * Returns true when all three modules load. The joined paths are freed on
+ * every exit, including the error returns.
+ */
+bool libpostal_setup_datadir(char *datadir) {
+    char *transliteration_path = NULL;
+    char *numex_path = NULL;
+    char *address_dictionary_path = NULL;
+    bool loaded = false;
+
+    if (datadir != NULL) {
+        transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE);
+        numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
+        address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
+
+        // A failed join must not fall through to the NULL-path setup.
+        if (transliteration_path == NULL || numex_path == NULL || address_dictionary_path == NULL) {
+            log_error("Error building data file paths, dir=%s\n", datadir);
+            goto exit_free_paths;
+        }
+    }
+
+    if (!transliteration_module_setup(transliteration_path)) {
+        log_error("Error loading transliteration module, dir=%s\n", setup_log_dir(transliteration_path));
+        goto exit_free_paths;
+    }
+
+    if (!numex_module_setup(numex_path)) {
+        log_error("Error loading numex module, dir=%s\n", setup_log_dir(numex_path));
+        goto exit_free_paths;
+    }
+
+    if (!address_dictionary_module_setup(address_dictionary_path)) {
+        log_error("Error loading dictionary module, dir=%s\n", setup_log_dir(address_dictionary_path));
+        goto exit_free_paths;
+    }
+
+    loaded = true;
+
+exit_free_paths:
+    // free(NULL) is a no-op, so the paths need no NULL guards.
+    free(transliteration_path);
+    free(numex_path);
+    free(address_dictionary_path);
+    return loaded;
+}
+
 bool libpostal_setup(void) {
-    if (!transliteration_module_setup(NULL)) {
-        log_error("Error loading transliteration module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-
-    if (!numex_module_setup(NULL)) {
-        log_error("Error loading numex module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-
-    if (!address_dictionary_module_setup(NULL)) {
-        log_error("Error loading dictionary module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-
-    return true;
+    return libpostal_setup_datadir(NULL);
 }
 
+/*
+ * Load the language classifier.
+ *
+ * datadir: root data directory containing LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR,
+ *          or NULL to pass NULL to language_classifier_module_setup().
+ *
+ * Returns true on success. The joined directory path is freed on every exit.
+ */
+bool libpostal_setup_language_classifier_datadir(char *datadir) {
+    char *language_classifier_dir = NULL;
+
+    if (datadir != NULL) {
+        language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR);
+        if (language_classifier_dir == NULL) {
+            log_error("Error building language classifier path, dir=%s\n", datadir);
+            return false;
+        }
+    }
+
+    bool loaded = language_classifier_module_setup(language_classifier_dir);
+    if (!loaded) {
+        log_error("Error loading language classifier, dir=%s\n", setup_log_dir(language_classifier_dir));
+    }
+
+    free(language_classifier_dir);
+    return loaded;
+}
+
 bool libpostal_setup_language_classifier(void) {
-    if (!language_classifier_module_setup(NULL)) {
-        log_error("Error loading language classifier, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-    return true;
+    return libpostal_setup_language_classifier_datadir(NULL);
 }
 
+/*
+ * Load the geodb and address parser modules, in that order.
+ *
+ * datadir: root data directory containing LIBPOSTAL_GEODB_SUBDIR and
+ *          LIBPOSTAL_ADDRESS_PARSER_SUBDIR, or NULL to pass NULL to each
+ *          module's setup.
+ *
+ * Returns true when both modules load. The joined directory paths are freed
+ * on every exit, including the error returns.
+ */
+bool libpostal_setup_parser_datadir(char *datadir) {
+    char *parser_dir = NULL;
+    char *geodb_dir = NULL;
+    bool loaded = false;
+
+    if (datadir != NULL) {
+        parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR);
+        geodb_dir = path_join(2, datadir, LIBPOSTAL_GEODB_SUBDIR);
+
+        // A failed join must not fall through to the NULL-path setup.
+        if (parser_dir == NULL || geodb_dir == NULL) {
+            log_error("Error building parser data paths, dir=%s\n", datadir);
+            goto exit_free_dirs;
+        }
+    }
+
+    if (!geodb_module_setup(geodb_dir)) {
+        log_error("Error loading geodb module, dir=%s\n", setup_log_dir(geodb_dir));
+        goto exit_free_dirs;
+    }
+
+    if (!address_parser_module_setup(parser_dir)) {
+        log_error("Error loading address parser module, dir=%s\n", setup_log_dir(parser_dir));
+        goto exit_free_dirs;
+    }
+
+    loaded = true;
+
+exit_free_dirs:
+    free(parser_dir);
+    free(geodb_dir);
+    return loaded;
+}
+
 bool libpostal_setup_parser(void) {
-    if (!geodb_module_setup(NULL)) {
-        log_error("Error loading geodb module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-
-    if (!address_parser_module_setup(NULL)) {
-        log_error("Error loading address parser module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
-        return false;
-    }
-
-    return true;
+    return libpostal_setup_parser_datadir(NULL);
 }
 
 void libpostal_teardown(void) {
diff --git a/src/libpostal.h b/src/libpostal.h
index 30646e59..178d6225 100644
--- a/src/libpostal.h
+++ b/src/libpostal.h
@@ -92,12 +92,15 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
 
 // Setup/teardown methods
 bool libpostal_setup(void);
+bool libpostal_setup_datadir(char *datadir);
 void libpostal_teardown(void);
 
 bool libpostal_setup_parser(void);
+bool libpostal_setup_parser_datadir(char *datadir);
 void libpostal_teardown_parser(void);
 
 bool libpostal_setup_language_classifier(void);
+bool libpostal_setup_language_classifier_datadir(char *datadir);
 void libpostal_teardown_language_classifier(void);
 
 #ifdef __cplusplus
diff --git a/src/libpostal_config.h b/src/libpostal_config.h
index 6b07a9b8..4d935665 100644
--- a/src/libpostal_config.h
+++ b/src/libpostal_config.h
@@ -12,12 +12,20 @@
 #error LIBPOSTAL_DATA_DIR not defined!
#endif -#define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_parser" -#define LIBPOSTAL_DICTIONARIES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "dictionaries" -#define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geonames" -#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geodb" -#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "language_classifier" -#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "transliteration" +#define LIBPOSTAL_ADDRESS_PARSER_SUBDIR "address_parser" +#define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_PARSER_SUBDIR +#define LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR "address_expansions" +#define LIBPOSTAL_ADDRESS_EXPANSIONS_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR +#define LIBPOSTAL_GEONAMES_SUBDIR "geonames" +#define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEONAMES_SUBDIR +#define LIBPOSTAL_GEODB_SUBDIR "geodb" +#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEODB_SUBDIR +#define LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR "language_classifier" +#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR +#define LIBPOSTAL_NUMEX_SUBDIR "numex" +#define LIBPOSTAL_NUMEX_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_NUMEX_SUBDIR +#define LIBPOSTAL_TRANSLITERATION_SUBDIR "transliteration" +#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_TRANSLITERATION_SUBDIR #define GEODB_BLOOM_FILTER_SIZE 100000000 #define GEODB_BLOOM_FILTER_ERROR 0.001 diff --git a/src/numex.h b/src/numex.h index 421435ec..9d8d9f4a 100644 --- a/src/numex.h +++ b/src/numex.h @@ -20,7 +20,8 @@ #include "trie.h" #include "trie_search.h" -#define DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR "numex.dat" +#define NUMEX_DATA_FILE "numex.dat" +#define 
DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR NUMEX_DATA_FILE #define LATIN_LANGUAGE_CODE "la" diff --git a/src/transliterate.h b/src/transliterate.h index 885f9989..79230a56 100644 --- a/src/transliterate.h +++ b/src/transliterate.h @@ -15,7 +15,8 @@ #define LATIN_ASCII "latin-ascii" -#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR "transliteration.dat" +#define TRANSLITERATION_DATA_FILE "transliteration.dat" +#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR TRANSLITERATION_DATA_FILE #define MAX_TRANS_NAME_LEN 100