[merge] merging master

This commit is contained in:
Al
2017-01-13 19:58:49 -05:00
27 changed files with 181 additions and 55 deletions

View File

@@ -148,6 +148,20 @@ int main(int argc, char **argv) {
}
```
Parser labels
-------------
The address parser can use any string labels that are defined in the training data, but these are the default labels, based on the fields defined in [OpenCage's address-formatting library](https://github.com/OpenCageData/address-formatting):
- **house**: venue name e.g. "Brooklyn Academy of Music", and building names e.g. "Empire State Building"
- **house_number**: usually refers to the external (street-facing) building number. In some countries this may be a compound, hyphenated number which also includes an apartment number, or a block number (a la Japan), but libpostal will just call it the house_number for simplicity.
- **road**: street name(s)
- **suburb**: usually an unofficial neighborhood name like "Harlem", "South Bronx", or "Crown Heights"
- **city_district**: these are usually boroughs or districts within a city that serve some official purpose e.g. "Brooklyn" or "Hackney" or "Bratislava IV"
- **city**: any human settlement including cities, towns, villages, hamlets, localities, etc.
- **state_district**: usually a second-level administrative division or county.
- **state**: a first-level administrative division. Scotland, Northern Ireland, Wales, and England in the UK are mapped to "state" as well (convention used in OSM, GeoPlanet, etc.)
- **country**: sovereign nations and their dependent territories, anything with an [ISO-3166 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2).
Examples of normalization
-------------------------

View File

@@ -1,7 +1,7 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
AC_INIT([libpostal], [0.3])
AC_INIT([libpostal], [0.3.3])
AC_CONFIG_MACRO_DIRS([m4])
@@ -47,10 +47,8 @@ AC_TYPE_UINT8_T
AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_FUNC_MALLOC
AC_FUNC_MMAP
AC_FUNC_REALLOC
AC_CHECK_FUNCS([getcwd gettimeofday memmove memset munmap regcomp setlocale sqrt strdup strndup])
AC_CHECK_FUNCS([malloc realloc getcwd gettimeofday memmove memset munmap regcomp setlocale sqrt strdup strndup])
AC_CONFIG_FILES([Makefile
libpostal.pc
@@ -88,4 +86,18 @@ AC_ARG_ENABLE([data-download],
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
[
if test "x$withval" = "xno"; then
CFLAGS_SCANNER_EXTRA=""
else
CFLAGS_SCANNER_EXTRA="$withval"
fi
],
[ CFLAGS_SCANNER_EXTRA="" ]
)
AC_MSG_NOTICE([extra cflags for scanner.c: $CFLAGS_SCANNER_EXTRA])
AC_SUBST(CFLAGS_SCANNER_EXTRA)
AC_OUTPUT

View File

@@ -21,9 +21,11 @@ libpostal_la_CFLAGS = $(CFLAGS_O2)
dist_bin_SCRIPTS = libpostal_data
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
# When cross-compiling for ARM with gcc-4.7, compilation can fail with "out of range" errors, which adding the -marm option fixes.
# CFLAGS_SCANNER_EXTRA is provided for such cases and can be set at configure time (see ./configure --help).
noinst_LTLIBRARIES = libscanner.la
libscanner_la_SOURCES = scanner.c
libscanner_la_CFLAGS = $(CFLAGS_O0)
libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA)
noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table address_parser_train address_parser_test address_parser language_classifier_train language_classifier language_classifier_test
libpostal_SOURCES = main.c json_encode.c

View File

@@ -288,7 +288,7 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) {
bool address_dictionary_init(void) {
if (address_dict != NULL) return false;
address_dict = malloc(sizeof(address_dictionary_t));
address_dict = calloc(1, sizeof(address_dictionary_t));
if (address_dict == NULL) return false;
address_dict->canonical = cstring_array_new();

View File

@@ -21,7 +21,8 @@
#define ALL_LANGUAGES "all"
#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_expansions" PATH_SEPARATOR "address_dictionary.dat"
#define ADDRESS_DICTIONARY_DATA_FILE "address_dictionary.dat"
#define DEFAULT_ADDRESS_EXPANSION_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR PATH_SEPARATOR ADDRESS_DICTIONARY_DATA_FILE
#define NULL_CANONICAL_INDEX -1

View File

@@ -25,14 +25,13 @@ typedef enum {
ADDRESS_PARSER_SUFFIX_PHRASE
} address_parser_phrase_type_t;
static parser_options_t PARSER_DEFAULT_OPTIONS = {
.rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
.print_features = false
};
/*
 * Allocate a zero-initialized address parser and store the given options in it.
 *
 * options: parser options, copied by value into the new parser.
 *
 * Returns the new parser, or NULL if the allocation fails.
 */
address_parser_t *address_parser_new_options(parser_options_t options) {
    address_parser_t *parser = calloc(1, sizeof(address_parser_t));
    /* Do not write through a failed allocation. */
    if (parser == NULL) return NULL;

    parser->options = options;
    return parser;
}

View File

@@ -93,7 +93,7 @@ averaged_perceptron_t *averaged_perceptron_read(FILE *f) {
return NULL;
}
averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t));
averaged_perceptron_t *perceptron = calloc(1, sizeof(averaged_perceptron_t));
if (!file_read_uint32(f, &perceptron->num_features) ||
!file_read_uint32(f, &perceptron->num_classes) ||

View File

@@ -386,7 +386,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
}
averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) {
averaged_perceptron_trainer_t *self = malloc(sizeof(averaged_perceptron_trainer_t));
averaged_perceptron_trainer_t *self = calloc(1, sizeof(averaged_perceptron_trainer_t));
if (self == NULL) return NULL;

View File

@@ -56,7 +56,7 @@ int bloom_filter_add(bloom_filter_t *self, const char *key, size_t len) {
}
bloom_filter_t *bloom_filter_new(uint64_t capacity, double error) {
bloom_filter_t *bloom = malloc(sizeof(bloom_filter_t));
bloom_filter_t *bloom = calloc(1, sizeof(bloom_filter_t));
if (bloom == NULL) {
return NULL;

View File

@@ -36,6 +36,21 @@ bool is_relative_path(struct dirent *ent) {
return strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0;
}
/*
 * Join n path components read from args, separated by PATH_SEPARATOR.
 *
 * The components are accumulated in a temporary char_array (with the
 * strip_separator flag set) and the result of char_array_to_string is
 * returned. Returns NULL if the temporary array cannot be created.
 */
char *path_vjoin(int n, va_list args) {
    char_array *buffer = char_array_new();
    if (buffer != NULL) {
        char_array_add_vjoined(buffer, PATH_SEPARATOR, true, n, args);
        return char_array_to_string(buffer);
    }
    return NULL;
}
/*
 * Variadic front end for path_vjoin: joins the n string arguments that
 * follow n into a single path and returns whatever path_vjoin returns.
 */
char *path_join(int n, ...) {
    va_list components;
    char *joined;

    va_start(components, n);
    joined = path_vjoin(n, components);
    va_end(components);

    return joined;
}
inline uint64_t file_deserialize_uint64(unsigned char *buf) {
return ((uint64_t)buf[0] << 56) |
((uint64_t)buf[1] << 48) |

View File

@@ -9,6 +9,7 @@
#include <sys/types.h>
#include "libpostal_config.h"
#include "string_utils.h"
#ifdef HAVE_DIRENT_H
#include <dirent.h>
@@ -55,6 +56,9 @@ char *file_getline(FILE * f);
bool is_relative_path(struct dirent *ent);
char *path_join(int n, ...);
char *path_vjoin(int n, va_list args);
uint64_t file_deserialize_uint64(unsigned char *buf);
bool file_read_uint64(FILE *file, uint64_t *value);
bool file_write_uint64(FILE *file, uint64_t value);

View File

@@ -47,7 +47,7 @@ void geodb_destroy(geodb_t *self) {
geodb_t *geodb_init(char *dir) {
if (dir == NULL) return NULL;
geodb_t *gdb = malloc(sizeof(geodb_t));
geodb_t *gdb = calloc(1, sizeof(geodb_t));
if (gdb == NULL) return NULL;

View File

@@ -338,7 +338,7 @@ void geodb_builder_destroy(geodb_builder_t *self) {
}
geodb_builder_t *geodb_builder_new(char *log_filename) {
geodb_builder_t *builder = malloc(sizeof(geodb_builder_t));
geodb_builder_t *builder = calloc(1, sizeof(geodb_builder_t));
if (builder == NULL) return NULL;

View File

@@ -1,7 +1,7 @@
#include "graph.h"
graph_t *graph_new_dims(graph_type_t type, uint32_t m, uint32_t n, size_t nnz, bool fixed_rows) {
graph_t *graph = malloc(sizeof(graph_t));
graph_t *graph = calloc(1, sizeof(graph_t));
graph->m = m;
graph->fixed_rows = fixed_rows;
graph->n = n;

View File

@@ -35,7 +35,7 @@ void language_classifier_destroy(language_classifier_t *self) {
}
/*
 * Allocate a zero-initialized language classifier.
 *
 * Returns the new classifier, or NULL if the allocation fails.
 */
language_classifier_t *language_classifier_new(void) {
    language_classifier_t *language_classifier = calloc(1, sizeof(language_classifier_t));
    return language_classifier;
}

View File

@@ -1036,40 +1036,106 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
return parsed;
}
bool libpostal_setup_datadir(char *datadir) {
char *transliteration_path = NULL;
char *numex_path = NULL;
char *address_dictionary_path = NULL;
if (datadir != NULL) {
transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE);
numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
}
if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false;
}
if (!numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path);
return false;
}
if (!address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false;
}
if (transliteration_path != NULL) {
free(transliteration_path);
}
if (numex_path != NULL) {
free(numex_path);
}
if (address_dictionary_path != NULL) {
free(address_dictionary_path);
}
return true;
}
/*
 * Set up the core modules by delegating to libpostal_setup_datadir with a
 * NULL data directory. Returns that call's result.
 */
bool libpostal_setup(void) {
    return libpostal_setup_datadir(NULL);
}
bool libpostal_setup_language_classifier_datadir(char *datadir) {
char *language_classifier_dir = NULL;
if (datadir != NULL) {
language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR);
}
if (!language_classifier_module_setup(language_classifier_dir)) {
log_error("Error loading language classifier, dir=%s\n", language_classifier_dir);
return false;
}
if (!numex_module_setup(NULL)) {
log_error("Error loading numex module\n");
return false;
}
if (!address_dictionary_module_setup(NULL)) {
log_error("Error loading dictionary module\n");
return false;
if (language_classifier_dir != NULL) {
free(language_classifier_dir);
}
return true;
}
/*
 * Set up the language classifier by delegating to
 * libpostal_setup_language_classifier_datadir with a NULL data directory.
 * Returns that call's result.
 */
bool libpostal_setup_language_classifier(void) {
    return libpostal_setup_language_classifier_datadir(NULL);
}
bool libpostal_setup_parser_datadir(char *datadir) {
char *parser_dir = NULL;
char *geodb_dir = NULL;
if (datadir != NULL) {
parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR);
geodb_dir = path_join(2, datadir, LIBPOSTAL_GEODB_SUBDIR);
}
if (!geodb_module_setup(geodb_dir)) {
log_error("Error loading geodb module, dir=%s\n", geodb_dir);
return false;
}
if (!address_parser_module_setup(parser_dir)) {
log_error("Error loading address parser module, dir=%s\n", parser_dir);
return false;
}
if (parser_dir != NULL) {
free(parser_dir);
}
if (geodb_dir != NULL) {
free(geodb_dir);
}
return true;
}
/*
 * Set up the parser modules by delegating to libpostal_setup_parser_datadir
 * with a NULL data directory. Returns that call's result.
 */
bool libpostal_setup_parser(void) {
    return libpostal_setup_parser_datadir(NULL);
}
void libpostal_teardown(void) {

View File

@@ -92,12 +92,15 @@ address_parser_response_t *parse_address(char *address, address_parser_options_t
// Setup/teardown methods
bool libpostal_setup(void);
bool libpostal_setup_datadir(char *datadir);
void libpostal_teardown(void);
bool libpostal_setup_parser(void);
bool libpostal_setup_parser_datadir(char *datadir);
void libpostal_teardown_parser(void);
bool libpostal_setup_language_classifier(void);
bool libpostal_setup_language_classifier_datadir(char *datadir);
void libpostal_teardown_language_classifier(void);
#ifdef __cplusplus

View File

@@ -12,12 +12,20 @@
#error LIBPOSTAL_DATA_DIR not defined!
#endif
#define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "address_parser"
#define LIBPOSTAL_DICTIONARIES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "dictionaries"
#define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geonames"
#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "geodb"
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "language_classifier"
#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR "transliteration"
#define LIBPOSTAL_ADDRESS_PARSER_SUBDIR "address_parser"
#define LIBPOSTAL_ADDRESS_PARSER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_PARSER_SUBDIR
#define LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR "address_expansions"
#define LIBPOSTAL_ADDRESS_EXPANSIONS_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR
#define LIBPOSTAL_GEONAMES_SUBDIR "geonames"
#define LIBPOSTAL_GEONAMES_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEONAMES_SUBDIR
#define LIBPOSTAL_GEODB_SUBDIR "geodb"
#define LIBPOSTAL_GEODB_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_GEODB_SUBDIR
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR "language_classifier"
#define LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR
#define LIBPOSTAL_NUMEX_SUBDIR "numex"
#define LIBPOSTAL_NUMEX_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_NUMEX_SUBDIR
#define LIBPOSTAL_TRANSLITERATION_SUBDIR "transliteration"
#define LIBPOSTAL_TRANSLITERATION_DIR LIBPOSTAL_DATA_DIR PATH_SEPARATOR LIBPOSTAL_TRANSLITERATION_SUBDIR
#define GEODB_BLOOM_FILTER_SIZE 100000000
#define GEODB_BLOOM_FILTER_ERROR 0.001

View File

@@ -51,7 +51,7 @@ numex_table_t *numex_table_init(void) {
numex_table_t *numex_table = get_numex_table();
if (numex_table == NULL) {
numex_table = malloc(sizeof(numex_table_t));
numex_table = calloc(1, sizeof(numex_table_t));
if (numex_table == NULL) return NULL;

View File

@@ -20,7 +20,8 @@
#include "trie.h"
#include "trie_search.h"
#define DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR "numex.dat"
#define NUMEX_DATA_FILE "numex.dat"
#define DEFAULT_NUMEX_PATH LIBPOSTAL_DATA_DIR PATH_SEPARATOR "numex" PATH_SEPARATOR NUMEX_DATA_FILE
#define LATIN_LANGUAGE_CODE "la"

View File

@@ -2,7 +2,7 @@
#include "klib/ksort.h"
sparse_matrix_t *sparse_matrix_new_shape(size_t m, size_t n) {
sparse_matrix_t *matrix = malloc(sizeof(sparse_matrix_t));
sparse_matrix_t *matrix = calloc(1, sizeof(sparse_matrix_t));
if (matrix == NULL) return NULL;
matrix->m = m;
matrix->n = n;

View File

@@ -782,18 +782,19 @@ cstring_array *cstring_array_new_size(size_t size) {
/*
 * Wrap an existing char_array of NUL-separated strings in a cstring_array.
 *
 * The new array stores the str pointer itself (no copy is made) and builds
 * an index of string start offsets: offset 0, plus the offset following
 * every '\0' found before the last byte of str.
 *
 * Returns the new array, or NULL if str is NULL or an allocation fails.
 */
cstring_array *cstring_array_from_char_array(char_array *str) {
    /* Validate the argument before allocating so nothing leaks. */
    if (str == NULL) return NULL;

    cstring_array *array = malloc(sizeof(cstring_array));
    if (array == NULL) return NULL;

    array->str = str;
    array->indices = uint32_array_new_size(1);
    if (array->indices == NULL) {
        /* Release only the wrapper; str still belongs to the caller. */
        free(array);
        return NULL;
    }
    uint32_array_push(array->indices, 0);

    /* Guard the n - 1 bound: with n == 0 an unsigned n - 1 would wrap. */
    if (str->n > 0) {
        char *ptr = str->a;
        for (uint32_t i = 0; i < str->n - 1; i++, ptr++) {
            if (*ptr == '\0') {
                uint32_array_push(array->indices, i + 1);
            }
        }
    }

    return array;
}

View File

@@ -141,7 +141,6 @@ void char_array_cat_vprintf(char_array *array, char *format, va_list args);
void char_array_cat_printf(char_array *array, char *format, ...);
// Mainly for paths or delimited strings
void char_array_append_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args);
void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args);
void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...);
void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...);

View File

@@ -1087,7 +1087,7 @@ transliteration_table_t *transliteration_table_init(void) {
transliteration_table_t *trans_table = get_transliteration_table();
if (trans_table == NULL) {
trans_table = malloc(sizeof(transliteration_table_t));
trans_table = calloc(1, sizeof(transliteration_table_t));
trans_table->trie = trie_new();
if (trans_table->trie == NULL) {

View File

@@ -17,7 +17,8 @@
#define LATIN_ASCII_SIMPLE "latin-ascii-simple"
#define HTML_ESCAPE "html-escape"
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR "transliteration.dat"
#define TRANSLITERATION_DATA_FILE "transliteration.dat"
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR TRANSLITERATION_DATA_FILE
#define MAX_TRANS_NAME_LEN 100

View File

@@ -32,7 +32,7 @@ Constructors
*/
static trie_t *trie_new_empty(uint8_t *alphabet, uint32_t alphabet_size) {
trie_t *self = malloc(sizeof(trie_t));
trie_t *self = calloc(1, sizeof(trie_t));
if (!self)
goto exit_no_malloc;

View File

@@ -33,7 +33,7 @@ static inline void _aligned_free(void *p)
name *array = malloc(sizeof(name)); \
if (array == NULL) return NULL; \
array->n = array->m = 0; \
array->a = malloc(size * sizeof(type)); \
array->a = malloc((size > 0 ? size : 1) * sizeof(type)); \
if (array->a == NULL) return NULL; \
array->m = size; \
return array; \