Merge pull request #154 from openvenues/setup_datadir_functions

Setup datadir functions
This commit is contained in:
Al Barrentine
2017-01-09 16:52:07 -05:00
committed by GitHub
12 changed files with 132 additions and 38 deletions

View File

@@ -1,7 +1,7 @@
# -*- Autoconf -*- # -*- Autoconf -*-
# Process this file with autoconf to produce a configure script. # Process this file with autoconf to produce a configure script.
AC_INIT([libpostal], [0.3]) AC_INIT([libpostal], [0.3.3])
AM_INIT_AUTOMAKE([foreign subdir-objects]) AM_INIT_AUTOMAKE([foreign subdir-objects])
AC_CONFIG_SRCDIR([src]) AC_CONFIG_SRCDIR([src])

View File

@@ -21,7 +21,8 @@
#define ALL_LANGUAGES "all" #define ALL_LANGUAGES "all"
#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_expansions" PATH_SEPARATOR "address_dictionary.dat" #define ADDRESS_DICTIONARY_DATA_FILE "address_dictionary.dat"
#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR PATH_SEPARATOR ADDRESS_DICTIONARY_DATA_FILE
#define NULL_CANONICAL_INDEX -1 #define NULL_CANONICAL_INDEX -1

View File

@@ -12,7 +12,7 @@ void feature_array_add(cstring_array *features, size_t count, ...) {
cstring_array_start_token(features); cstring_array_start_token(features);
bool strip_separator = true; bool strip_separator = true;
char_array_append_vjoined(features->str, FEATURE_SEPARATOR_CHAR, strip_separator, count, args); char_array_add_vjoined(features->str, FEATURE_SEPARATOR_CHAR, strip_separator, count, args);
va_end(args); va_end(args);
} }

View File

@@ -36,6 +36,21 @@ bool is_relative_path(struct dirent *ent) {
return strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0; return strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0;
} }
/*
 * Join n path components taken from a va_list into a single string.
 *
 * n:    number of components to read from args
 * args: variadic argument list; it is forwarded unchanged to
 *       char_array_add_vjoined (presumably each item is a char * -- confirm
 *       against that function's definition)
 *
 * Components are joined with PATH_SEPARATOR, with the strip-separator option
 * of char_array_add_vjoined turned on.
 *
 * Returns the result of char_array_to_string on the joined buffer, or NULL if
 * the temporary char_array could not be created.
 * NOTE(review): the returned string looks heap-allocated and owned by the
 * caller -- confirm against char_array_to_string.
 */
char *path_vjoin(int n, va_list args) {
    char_array *joined = char_array_new();

    if (joined != NULL) {
        bool strip_separator = true;
        char_array_add_vjoined(joined, PATH_SEPARATOR, strip_separator, n, args);
        return char_array_to_string(joined);
    }

    return NULL;
}
/*
 * Variadic front end for path_vjoin.
 *
 * n:   number of variadic arguments that follow
 * ...: the path components, handed to path_vjoin as a va_list
 *
 * Returns whatever path_vjoin returns for the collected arguments.
 */
char *path_join(int n, ...) {
    char *joined = NULL;
    va_list components;

    va_start(components, n);
    joined = path_vjoin(n, components);
    va_end(components);

    return joined;
}
inline uint64_t file_deserialize_uint64(unsigned char *buf) { inline uint64_t file_deserialize_uint64(unsigned char *buf) {
return ((uint64_t)buf[0] << 56) | return ((uint64_t)buf[0] << 56) |
((uint64_t)buf[1] << 48) | ((uint64_t)buf[1] << 48) |

View File

@@ -9,6 +9,7 @@
#include <sys/types.h> #include <sys/types.h>
#include "libpostal_config.h" #include "libpostal_config.h"
#include "string_utils.h"
#ifdef HAVE_DIRENT_H #ifdef HAVE_DIRENT_H
#include <dirent.h> #include <dirent.h>
@@ -55,6 +56,9 @@ char *file_getline(FILE * f);
bool is_relative_path(struct dirent *ent); bool is_relative_path(struct dirent *ent);
char *path_join(int n, ...);
char *path_vjoin(int n, va_list args);
uint64_t file_deserialize_uint64(unsigned char *buf); uint64_t file_deserialize_uint64(unsigned char *buf);
bool file_read_uint64(FILE *file, uint64_t *value); bool file_read_uint64(FILE *file, uint64_t *value);
bool file_write_uint64(FILE *file, uint64_t value); bool file_write_uint64(FILE *file, uint64_t value);

View File

@@ -1054,45 +1054,106 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
return parsed; return parsed;
} }
bool libpostal_setup_datadir(char *datadir) {
char *transliteration_path = NULL;
char *numex_path = NULL;
char *address_dictionary_path = NULL;
if (datadir != NULL) {
transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE);
numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
}
if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false;
}
if (!numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path);
return false;
}
if (!address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false;
}
if (transliteration_path != NULL) {
free(transliteration_path);
}
if (numex_path != NULL) {
free(numex_path);
}
if (address_dictionary_path != NULL) {
free(address_dictionary_path);
}
return true;
}
bool libpostal_setup(void) { bool libpostal_setup(void) {
if (!transliteration_module_setup(NULL)) { return libpostal_setup_datadir(NULL);
log_error("Error loading transliteration module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR); }
bool libpostal_setup_language_classifier_datadir(char *datadir) {
char *language_classifier_dir = NULL;
if (datadir != NULL) {
language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR);
}
if (!language_classifier_module_setup(language_classifier_dir)) {
log_error("Error loading language classifier, dir=%s\n", language_classifier_dir);
return false; return false;
} }
if (!numex_module_setup(NULL)) { if (language_classifier_dir != NULL) {
log_error("Error loading numex module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR); free(language_classifier_dir);
return false;
}
if (!address_dictionary_module_setup(NULL)) {
log_error("Error loading dictionary module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
return false;
} }
return true; return true;
} }
bool libpostal_setup_language_classifier(void) { bool libpostal_setup_language_classifier(void) {
if (!language_classifier_module_setup(NULL)) { return libpostal_setup_language_classifier_datadir(NULL);
log_error("Error loading language classifier, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR); }
bool libpostal_setup_parser_datadir(char *datadir) {
char *parser_dir = NULL;
char *geodb_dir = NULL;
if (datadir != NULL) {
parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR);
geodb_dir = path_join(2, datadir, LIBPOSTAL_GEODB_SUBDIR);
}
if (!geodb_module_setup(geodb_dir)) {
log_error("Error loading geodb module, dir=%s\n", geodb_dir);
return false; return false;
} }
if (!address_parser_module_setup(parser_dir)) {
log_error("Error loading address parser module, dir=%s\n", parser_dir);
return false;
}
if (parser_dir != NULL) {
free(parser_dir);
}
if (geodb_dir != NULL) {
free(geodb_dir);
}
return true; return true;
} }
bool libpostal_setup_parser(void) { bool libpostal_setup_parser(void) {
if (!geodb_module_setup(NULL)) { return libpostal_setup_parser_datadir(NULL);
log_error("Error loading geodb module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
return false;
}
if (!address_parser_module_setup(NULL)) {
log_error("Error loading address parser module, LIBPOSTAL_DATA_DIR=%s\n", LIBPOSTAL_DATA_DIR);
return false;
}
return true;
} }
void libpostal_teardown(void) { void libpostal_teardown(void) {

View File

@@ -92,12 +92,15 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
// Setup/teardown methods // Setup/teardown methods
bool libpostal_setup(void); bool libpostal_setup(void);
bool libpostal_setup_datadir(char *datadir);
void libpostal_teardown(void); void libpostal_teardown(void);
bool libpostal_setup_parser(void); bool libpostal_setup_parser(void);
bool libpostal_setup_parser_datadir(char *datadir);
void libpostal_teardown_parser(void); void libpostal_teardown_parser(void);
bool libpostal_setup_language_classifier(void); bool libpostal_setup_language_classifier(void);
bool libpostal_setup_language_classifier_datadir(char *datadir);
void libpostal_teardown_language_classifier(void); void libpostal_teardown_language_classifier(void);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -12,12 +12,20 @@
#error LIBPOSTAL_DATA_DIR not defined! #error LIBPOSTAL_DATA_DIR not defined!
#endif #endif
#define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_parser" #define LIBPOSTAL_ADDRESS_PARSER_SUBDIR "address_parser"
#define LIBPOSTAL_DICTIONARIES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "dictionaries" #define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_PARSER_SUBDIR
#define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geonames" #define LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR "address_expansions"
#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geodb" #define LIBPOSTAL_ADDRESS_EXPANSIONS_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "language_classifier" #define LIBPOSTAL_GEONAMES_SUBDIR "geonames"
#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "transliteration" #define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEONAMES_SUBDIR
#define LIBPOSTAL_GEODB_SUBDIR "geodb"
#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEODB_SUBDIR
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR "language_classifier"
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR
#define LIBPOSTAL_NUMEX_SUBDIR "numex"
#define LIBPOSTAL_NUMEX_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_NUMEX_SUBDIR
#define LIBPOSTAL_TRANSLITERATION_SUBDIR "transliteration"
#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_TRANSLITERATION_SUBDIR
#define GEODB_BLOOM_FILTER_SIZE 100000000 #define GEODB_BLOOM_FILTER_SIZE 100000000
#define GEODB_BLOOM_FILTER_ERROR 0.001 #define GEODB_BLOOM_FILTER_ERROR 0.001

View File

@@ -20,7 +20,8 @@
#include "trie.h" #include "trie.h"
#include "trie_search.h" #include "trie_search.h"
#define DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR "numex.dat" #define NUMEX_DATA_FILE "numex.dat"
#define DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR NUMEX_DATA_FILE
#define LATIN_LANGUAGE_CODE "la" #define LATIN_LANGUAGE_CODE "la"

View File

@@ -595,7 +595,7 @@ inline void char_array_add_len(char_array *array, char *str, size_t len) {
} }
void char_array_append_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args) { void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args) {
if (count <= 0) { if (count <= 0) {
return; return;
} }
@@ -625,7 +625,7 @@ void char_array_append_vjoined(char_array *array, char *separator, bool strip_se
inline void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...) { inline void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...) {
va_list args; va_list args;
va_start(args, count); va_start(args, count);
char_array_append_vjoined(array, separator, strip_separator, count, args); char_array_add_vjoined(array, separator, strip_separator, count, args);
va_end(args); va_end(args);
} }
@@ -633,7 +633,7 @@ inline void char_array_cat_joined(char_array *array, char *separator, bool strip
char_array_strip_nul_byte(array); char_array_strip_nul_byte(array);
va_list args; va_list args;
va_start(args, count); va_start(args, count);
char_array_append_vjoined(array, separator, strip_separator, count, args); char_array_add_vjoined(array, separator, strip_separator, count, args);
va_end(args); va_end(args);
} }

View File

@@ -136,7 +136,7 @@ void char_array_cat_vprintf(char_array *array, char *format, va_list args);
void char_array_cat_printf(char_array *array, char *format, ...); void char_array_cat_printf(char_array *array, char *format, ...);
// Mainly for paths or delimited strings // Mainly for paths or delimited strings
void char_array_append_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args); void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args);
void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...); void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...);
void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...); void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...);

View File

@@ -15,7 +15,8 @@
#define LATIN_ASCII "latin-ascii" #define LATIN_ASCII "latin-ascii"
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR "transliteration.dat" #define TRANSLITERATION_DATA_FILE "transliteration.dat"
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR TRANSLITERATION_DATA_FILE
#define MAX_TRANS_NAME_LEN 100 #define MAX_TRANS_NAME_LEN 100