[api] doing this now since we're bumping a major version. Using libpostal prefixes for all public header functions and definitions
This commit is contained in:
12
README.md
12
README.md
@@ -132,15 +132,15 @@ int main(int argc, char **argv) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
address_parser_response_t *parsed = parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
libpostal_address_parser_response_t *parsed = libpostal_parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);
|
||||
|
||||
for (size_t i = 0; i < parsed->num_components; i++) {
|
||||
printf("%s: %s\n", parsed->labels[i], parsed->components[i]);
|
||||
}
|
||||
|
||||
// Free parse result
|
||||
address_parser_response_destroy(parsed);
|
||||
libpostal_address_parser_response_destroy(parsed);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
@@ -220,15 +220,15 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
size_t num_expansions;
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
char **expansions = libpostal_expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
|
||||
|
||||
for (size_t i = 0; i < num_expansions; i++) {
|
||||
printf("%s\n", expansions[i]);
|
||||
}
|
||||
|
||||
// Free expansions
|
||||
expansion_array_destroy(expansions, num_expansions);
|
||||
libpostal_expansion_array_destroy(expansions, num_expansions);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
SUBDIRS = sparkey
|
||||
|
||||
# Inherited from autoconf / user-specified
|
||||
CFLAGS_CONF = @CFLAGS@
|
||||
CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wdeclaration-after-statement -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF)
|
||||
@@ -14,9 +12,10 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include
|
||||
CFLAGS =
|
||||
|
||||
lib_LTLIBRARIES = libpostal.la
|
||||
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c msgpack_utils.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
||||
libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la $(CBLAS_LIBS)
|
||||
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c normalize.c bloom.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
||||
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
|
||||
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
||||
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@
|
||||
|
||||
dist_bin_SCRIPTS = libpostal_data
|
||||
|
||||
@@ -27,7 +26,7 @@ noinst_LTLIBRARIES = libscanner.la
|
||||
libscanner_la_SOURCES = scanner.c
|
||||
libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA)
|
||||
|
||||
noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_geodb build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test
|
||||
noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test
|
||||
|
||||
libpostal_SOURCES = main.c json_encode.c
|
||||
libpostal_LDADD = libpostal.la
|
||||
@@ -41,9 +40,6 @@ address_parser_CFLAGS = $(CFLAGS_O3)
|
||||
|
||||
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
|
||||
build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c graph.c graph_builder.c normalize.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
|
||||
build_geodb_LDADD = sparkey/libsparkey.la
|
||||
build_geodb_CFLAGS = $(CFLAGS_O3)
|
||||
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
||||
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
||||
|
||||
@@ -842,7 +842,7 @@ bool is_valid_dictionary_phrase(phrase_t phrase) {
|
||||
}
|
||||
uint32_t address_phrase_types = expansion_value->components;
|
||||
|
||||
if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_CATEGORY | ADDRESS_NEAR | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_ENTRANCE | ADDRESS_STAIRCASE | ADDRESS_POSTAL_CODE | ADDRESS_PO_BOX)) {
|
||||
if (address_phrase_types & (LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_PO_BOX)) {
|
||||
for (size_t i = 0; i < expansion_value->expansions->n; i++) {
|
||||
address_expansion_t expansion = expansion_value->expansions->a[i];
|
||||
if (!address_expansion_in_dictionary(expansion, DICTIONARY_TOPONYM)) {
|
||||
@@ -913,7 +913,7 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser,
|
||||
expansion_index = suffix_phrase.data;
|
||||
expansion_value = address_dictionary_get_expansions(expansion_index);
|
||||
|
||||
if (expansion_value->components & ADDRESS_STREET) {
|
||||
if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
|
||||
response = (address_parser_phrase_t){
|
||||
word,
|
||||
ADDRESS_PARSER_SUFFIX_PHRASE,
|
||||
@@ -928,8 +928,8 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser,
|
||||
expansion_index = prefix_phrase.data;
|
||||
expansion_value = address_dictionary_get_expansions(expansion_index);
|
||||
|
||||
// Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
|
||||
if (expansion_value->components ^ ADDRESS_ANY) {
|
||||
// Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
|
||||
if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
|
||||
response = (address_parser_phrase_t){
|
||||
word,
|
||||
ADDRESS_PARSER_PREFIX_PHRASE,
|
||||
@@ -1164,16 +1164,16 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
||||
add_word_feature = false;
|
||||
log_debug("phrase_string=%s\n", phrase_string);
|
||||
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_CATEGORY, "category", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_UNIT, "unit", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_PO_BOX, "po_box", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_LEVEL, "level", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_ENTRANCE, "entrance", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_STAIRCASE, "staircase", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
|
||||
add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1330,8 +1330,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
||||
expansion_index = prefix_phrase.data;
|
||||
expansion_value = address_dictionary_get_expansions(expansion_index);
|
||||
|
||||
// Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
|
||||
if (expansion_value->components ^ ADDRESS_ANY) {
|
||||
// Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
|
||||
if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
|
||||
known_prefix = true;
|
||||
char_array_clear(phrase_tokens);
|
||||
prefix_len = prefix_phrase.len;
|
||||
@@ -1347,7 +1347,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
||||
expansion_index = suffix_phrase.data;
|
||||
expansion_value = address_dictionary_get_expansions(expansion_index);
|
||||
|
||||
if (expansion_value->components & ADDRESS_STREET) {
|
||||
if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
|
||||
known_suffix = true;
|
||||
char_array_clear(context->suffix_phrase);
|
||||
suffix_len = suffix_phrase.len;
|
||||
@@ -1582,20 +1582,20 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
||||
right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
|
||||
}
|
||||
|
||||
if (right_context_components & ADDRESS_STREET && !(right_context_components & ADDRESS_NAME)) {
|
||||
if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) {
|
||||
feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number);
|
||||
feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word);
|
||||
if (right_context_affix != NULL && right_affix_type != NULL) {
|
||||
feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix);
|
||||
}
|
||||
break;
|
||||
} else if (right_context_components & ADDRESS_NAME && !(right_context_components & ADDRESS_STREET)) {
|
||||
} else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) {
|
||||
feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number);
|
||||
feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word);
|
||||
if (right_context_affix != NULL && right_affix_type != NULL) {
|
||||
feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix);
|
||||
}
|
||||
} else if (right_context_components & (ADDRESS_NAME | ADDRESS_STREET)) {
|
||||
} else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) {
|
||||
if (seen_number) {
|
||||
feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right");
|
||||
feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word);
|
||||
@@ -1637,12 +1637,12 @@ bool address_parser_predict(address_parser_t *self, address_parser_context_t *co
|
||||
return false;
|
||||
}
|
||||
|
||||
address_parser_response_t *address_parser_response_new(void) {
|
||||
address_parser_response_t *response = malloc(sizeof(address_parser_response_t));
|
||||
libpostal_address_parser_response_t *address_parser_response_new(void) {
|
||||
libpostal_address_parser_response_t *response = malloc(sizeof(libpostal_address_parser_response_t));
|
||||
return response;
|
||||
}
|
||||
|
||||
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) {
|
||||
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) {
|
||||
if (address == NULL || context == NULL) return NULL;
|
||||
|
||||
address_parser_t *parser = get_address_parser();
|
||||
@@ -1693,7 +1693,7 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c
|
||||
country = NULL;
|
||||
address_parser_context_fill(context, parser, tokenized_str, language, country);
|
||||
|
||||
address_parser_response_t *response = NULL;
|
||||
libpostal_address_parser_response_t *response = NULL;
|
||||
|
||||
// If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether
|
||||
phrase_t only_phrase = NULL_PHRASE;
|
||||
|
||||
@@ -214,7 +214,7 @@ address_parser_t *address_parser_new_options(parser_options_t options);
|
||||
address_parser_t *get_address_parser(void);
|
||||
bool address_parser_load(char *dir);
|
||||
|
||||
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context);
|
||||
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context);
|
||||
void address_parser_destroy(address_parser_t *self);
|
||||
|
||||
char *address_parser_normalize_string(char *str);
|
||||
|
||||
@@ -117,10 +117,10 @@ int main(int argc, char **argv) {
|
||||
goto next_input;
|
||||
}
|
||||
|
||||
address_parser_response_t *parsed;
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_response_t *parsed;
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
if ((parsed = parse_address(input, options))) {
|
||||
if ((parsed = libpostal_parse_address(input, options))) {
|
||||
printf("\n");
|
||||
printf("Result:\n\n");
|
||||
printf("{\n");
|
||||
@@ -134,7 +134,7 @@ int main(int argc, char **argv) {
|
||||
printf("}\n");
|
||||
printf("\n");
|
||||
|
||||
address_parser_response_destroy(parsed);
|
||||
libpostal_address_parser_response_destroy(parsed);
|
||||
} else {
|
||||
log_error("Error parsing address\n");
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
@@ -98,8 +98,6 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse
|
||||
|
||||
char *prev_label = NULL;
|
||||
|
||||
address_parser_response_t *response = NULL;
|
||||
|
||||
size_t starting_errors = result->num_errors;
|
||||
|
||||
bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, data_set->tokenized_str);
|
||||
|
||||
@@ -210,7 +210,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr
|
||||
if (sub_tokens->n > 1 && search_address_dictionaries_tokens_with_phrases(postal_code_normalized, sub_tokens, language, &postal_code_dictionary_phrases) && postal_code_dictionary_phrases->n > 0) {
|
||||
phrase_t first_postal_code_phrase = postal_code_dictionary_phrases->a[0];
|
||||
address_expansion_value_t *value = address_dictionary_get_expansions(first_postal_code_phrase.data);
|
||||
if (value != NULL && value->components & ADDRESS_POSTAL_CODE) {
|
||||
if (value != NULL && value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) {
|
||||
char_array_clear(token_builder);
|
||||
size_t first_real_token_index = first_postal_code_phrase.start + first_postal_code_phrase.len;
|
||||
token_t first_real_token = sub_tokens->a[first_real_token_index];
|
||||
@@ -255,7 +255,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr
|
||||
|
||||
address_expansion_value_t *phrase_value = address_dictionary_get_expansions(current_phrase.data);
|
||||
size_t current_phrase_end = current_phrase.start + current_phrase.len;
|
||||
if (phrase_value != NULL && phrase_value->components & ADDRESS_POSTAL_CODE) {
|
||||
if (phrase_value != NULL && phrase_value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) {
|
||||
current_phrase_end = current_phrase.start;
|
||||
}
|
||||
|
||||
|
||||
14
src/bench.c
14
src/bench.c
@@ -28,8 +28,8 @@ int main(int argc, char **argv) {
|
||||
char *languages[argc - 2];
|
||||
for (int i = 0; i < argc - 2; i++) {
|
||||
char *arg = argv[i + 2];
|
||||
if (strlen(arg) >= MAX_LANGUAGE_LEN) {
|
||||
printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1);
|
||||
if (strlen(arg) >= LIBPOSTAL_MAX_LANGUAGE_LEN) {
|
||||
printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, LIBPOSTAL_MAX_LANGUAGE_LEN - 1);
|
||||
}
|
||||
languages[i] = arg;
|
||||
}
|
||||
@@ -38,7 +38,7 @@ int main(int argc, char **argv) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
|
||||
options.num_languages = 1;
|
||||
options.languages = languages;
|
||||
@@ -56,12 +56,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
clock_t t1 = clock();
|
||||
for (int i = 0; i < num_loops; i++) {
|
||||
strings = expand_address(str, options, &num_expansions);
|
||||
for (uint64_t i = 0; i < num_expansions; i++) {
|
||||
normalized = strings[i];
|
||||
free(normalized);
|
||||
}
|
||||
free(strings);
|
||||
strings = libpostal_expand_address(str, options, &num_expansions);
|
||||
libpostal_expansion_array_destroy(strings, num_expansions);
|
||||
}
|
||||
clock_t t2 = clock();
|
||||
|
||||
|
||||
@@ -1,46 +1,46 @@
|
||||
// Only need these for the in-memory dictionaries
|
||||
gazetteer_t gazetteer_config[] = {
|
||||
{DICTIONARY_ACADEMIC_DEGREE, ADDRESS_NAME},
|
||||
{DICTIONARY_AMBIGUOUS_EXPANSION, ADDRESS_NONE},
|
||||
{DICTIONARY_BUILDING_TYPE, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT},
|
||||
{DICTIONARY_CATEGORY, ADDRESS_CATEGORY},
|
||||
{DICTIONARY_CHAIN, ADDRESS_NAME},
|
||||
{DICTIONARY_COMPANY_TYPE, ADDRESS_NAME},
|
||||
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_ANY},
|
||||
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_ANY},
|
||||
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_ANY},
|
||||
{DICTIONARY_CROSS_STREET, ADDRESS_STREET},
|
||||
{DICTIONARY_DIRECTIONAL, ADDRESS_ANY},
|
||||
{DICTIONARY_ELISION, ADDRESS_ANY},
|
||||
{DICTIONARY_ENTRANCE, ADDRESS_ENTRANCE},
|
||||
{DICTIONARY_GIVEN_NAME, ADDRESS_STREET | ADDRESS_NAME},
|
||||
{DICTIONARY_HOUSE_NUMBER, ADDRESS_HOUSE_NUMBER},
|
||||
{DICTIONARY_LEVEL_NUMBERED, ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_STANDALONE, ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_MEZZANINE, ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_BASEMENT, ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_SUB_BASEMENT, ADDRESS_LEVEL},
|
||||
{DICTIONARY_NEAR, ADDRESS_NEAR},
|
||||
{DICTIONARY_NULL, ADDRESS_ANY},
|
||||
{DICTIONARY_NAMED_ORGANIZATION, ADDRESS_NAME},
|
||||
{DICTIONARY_NAMED_PERSON, ADDRESS_NAME | ADDRESS_STREET},
|
||||
{DICTIONARY_NO_NUMBER, ADDRESS_HOUSE_NUMBER},
|
||||
{DICTIONARY_NUMBER, ADDRESS_HOUSE_NUMBER | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_STAIRCASE | ADDRESS_ENTRANCE},
|
||||
{DICTIONARY_PERSONAL_SUFFIX, ADDRESS_NAME | ADDRESS_STREET},
|
||||
{DICTIONARY_PERSONAL_TITLE, ADDRESS_NAME | ADDRESS_STREET},
|
||||
{DICTIONARY_PLACE_NAME, ADDRESS_NAME | ADDRESS_STREET},
|
||||
{DICTIONARY_POST_OFFICE, ADDRESS_PO_BOX},
|
||||
{DICTIONARY_POSTAL_CODE, ADDRESS_POSTAL_CODE},
|
||||
{DICTIONARY_QUALIFIER, ADDRESS_STREET},
|
||||
{DICTIONARY_STAIRCASE, ADDRESS_STAIRCASE},
|
||||
{DICTIONARY_STOPWORD, ADDRESS_ANY},
|
||||
{DICTIONARY_STREET_TYPE, ADDRESS_STREET},
|
||||
{DICTIONARY_SURNAME, ADDRESS_STREET | ADDRESS_NAME},
|
||||
{DICTIONARY_SYNONYM, ADDRESS_ANY},
|
||||
{DICTIONARY_TOPONYM, ADDRESS_NAME | ADDRESS_STREET | ADDRESS_TOPONYM},
|
||||
{DICTIONARY_UNIT_NUMBERED, ADDRESS_UNIT},
|
||||
{DICTIONARY_UNIT_STANDALONE, ADDRESS_UNIT},
|
||||
{DICTIONARY_UNIT_DIRECTION, ADDRESS_UNIT}
|
||||
{DICTIONARY_ACADEMIC_DEGREE, LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_AMBIGUOUS_EXPANSION, LIBPOSTAL_ADDRESS_NONE},
|
||||
{DICTIONARY_BUILDING_TYPE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_UNIT},
|
||||
{DICTIONARY_CATEGORY, LIBPOSTAL_ADDRESS_CATEGORY},
|
||||
{DICTIONARY_CHAIN, LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_COMPANY_TYPE, LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_CROSS_STREET, LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_ELISION, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_ENTRANCE, LIBPOSTAL_ADDRESS_ENTRANCE},
|
||||
{DICTIONARY_GIVEN_NAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_HOUSE_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER},
|
||||
{DICTIONARY_LEVEL_NUMBERED, LIBPOSTAL_ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_STANDALONE, LIBPOSTAL_ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_MEZZANINE, LIBPOSTAL_ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_BASEMENT, LIBPOSTAL_ADDRESS_LEVEL},
|
||||
{DICTIONARY_LEVEL_SUB_BASEMENT, LIBPOSTAL_ADDRESS_LEVEL},
|
||||
{DICTIONARY_NEAR, LIBPOSTAL_ADDRESS_NEAR},
|
||||
{DICTIONARY_NULL, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_NAMED_ORGANIZATION, LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_NAMED_PERSON, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_NO_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER},
|
||||
{DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE},
|
||||
{DICTIONARY_PERSONAL_SUFFIX, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_PERSONAL_TITLE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_PLACE_NAME, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_POST_OFFICE, LIBPOSTAL_ADDRESS_PO_BOX},
|
||||
{DICTIONARY_POSTAL_CODE, LIBPOSTAL_ADDRESS_POSTAL_CODE},
|
||||
{DICTIONARY_QUALIFIER, LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_STAIRCASE, LIBPOSTAL_ADDRESS_STAIRCASE},
|
||||
{DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET},
|
||||
{DICTIONARY_SURNAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME},
|
||||
{DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_ANY},
|
||||
{DICTIONARY_TOPONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_TOPONYM},
|
||||
{DICTIONARY_UNIT_NUMBERED, LIBPOSTAL_ADDRESS_UNIT},
|
||||
{DICTIONARY_UNIT_STANDALONE, LIBPOSTAL_ADDRESS_UNIT},
|
||||
{DICTIONARY_UNIT_DIRECTION, LIBPOSTAL_ADDRESS_UNIT}
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ static const size_t GAMMA_SCHEDULE_SIZE = sizeof(GAMMA_SCHEDULE) / sizeof(double
|
||||
#define DEFAULT_GAMMA_0 10.0
|
||||
|
||||
static double LAMBDA_SCHEDULE[] = {0.0, 1e-5, 1e-4, 0.001, 0.01, 0.1, \
|
||||
0.2, 0.5, 1.0, 2.0, 5.0, 10.0};
|
||||
0.2, 0.5, 1.0};
|
||||
static const size_t LAMBDA_SCHEDULE_SIZE = sizeof(LAMBDA_SCHEDULE) / sizeof(double);
|
||||
|
||||
#define DEFAULT_LAMBDA 0.0
|
||||
|
||||
@@ -32,10 +32,10 @@ KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language)
|
||||
#define DEFAULT_KEY_LEN 32
|
||||
|
||||
|
||||
static normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
|
||||
static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
|
||||
.languages = NULL,
|
||||
.num_languages = 0,
|
||||
.address_components = ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_PO_BOX | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_ENTRANCE | ADDRESS_STAIRCASE | ADDRESS_POSTAL_CODE,
|
||||
.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE,
|
||||
.latin_ascii = true,
|
||||
.transliterate = true,
|
||||
.strip_accents = true,
|
||||
@@ -56,11 +56,11 @@ static normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
|
||||
.roman_numerals = true
|
||||
};
|
||||
|
||||
normalize_options_t get_libpostal_default_options(void) {
|
||||
libpostal_normalize_options_t libpostal_get_default_options(void) {
|
||||
return LIBPOSTAL_DEFAULT_OPTIONS;
|
||||
}
|
||||
|
||||
static inline uint64_t get_normalize_token_options(normalize_options_t options) {
|
||||
static inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) {
|
||||
uint64_t normalize_token_options = 0;
|
||||
|
||||
normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0;
|
||||
@@ -71,7 +71,7 @@ static inline uint64_t get_normalize_token_options(normalize_options_t options)
|
||||
return normalize_token_options;
|
||||
}
|
||||
|
||||
static inline uint64_t get_normalize_string_options(normalize_options_t options) {
|
||||
static inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) {
|
||||
uint64_t normalize_string_options = 0;
|
||||
normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0;
|
||||
normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0;
|
||||
@@ -83,7 +83,7 @@ static inline uint64_t get_normalize_string_options(normalize_options_t options)
|
||||
return normalize_string_options;
|
||||
}
|
||||
|
||||
static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, normalize_options_t options) {
|
||||
static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) {
|
||||
|
||||
uint64_t normalize_token_options = get_normalize_token_options(options);
|
||||
|
||||
@@ -135,7 +135,7 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke
|
||||
}
|
||||
}
|
||||
|
||||
static string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
|
||||
static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) {
|
||||
char_array *key = NULL;
|
||||
|
||||
log_debug("input=%s\n", str);
|
||||
@@ -500,7 +500,7 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt
|
||||
return tree;
|
||||
}
|
||||
|
||||
static void add_postprocessed_string(cstring_array *strings, char *str, normalize_options_t options) {
|
||||
static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) {
|
||||
cstring_array_add_string(strings, str);
|
||||
|
||||
if (options.roman_numerals) {
|
||||
@@ -516,7 +516,7 @@ static void add_postprocessed_string(cstring_array *strings, char *str, normaliz
|
||||
|
||||
|
||||
|
||||
static address_expansion_array *get_affix_expansions(phrase_t phrase, normalize_options_t options) {
|
||||
static address_expansion_array *get_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) {
|
||||
uint32_t expansion_index = phrase.data;
|
||||
address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
|
||||
if (value != NULL && value->components & options.address_components) {
|
||||
@@ -526,7 +526,7 @@ static address_expansion_array *get_affix_expansions(phrase_t phrase, normalize_
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, normalize_options_t options) {
|
||||
static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) {
|
||||
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
|
||||
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
|
||||
uint64_t normalize_string_options = get_normalize_string_options(options);
|
||||
@@ -542,7 +542,7 @@ static inline void cat_affix_expansion(char_array *key, char *str, address_expan
|
||||
}
|
||||
}
|
||||
|
||||
static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, normalize_options_t options) {
|
||||
static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options) {
|
||||
cstring_array *strings = tree->strings;
|
||||
|
||||
bool have_suffix = suffix.len > 0 && suffix.len < token.len;
|
||||
@@ -753,7 +753,7 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok
|
||||
|
||||
}
|
||||
|
||||
static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, normalize_options_t options) {
|
||||
static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
|
||||
phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang);
|
||||
|
||||
phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang);
|
||||
@@ -764,7 +764,7 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
|
||||
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
|
||||
}
|
||||
|
||||
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, normalize_options_t options) {
|
||||
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
|
||||
cstring_array *strings = tree->strings;
|
||||
|
||||
for (size_t i = 0; i < tokens->n; i++) {
|
||||
@@ -795,7 +795,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
|
||||
}
|
||||
|
||||
|
||||
static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, normalize_options_t options) {
|
||||
static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) {
|
||||
size_t len = strlen(str);
|
||||
token_array *tokens = tokenize_keep_whitespace(str);
|
||||
string_tree_t *token_tree = string_tree_new_size(len);
|
||||
@@ -901,8 +901,8 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
|
||||
char_array_destroy(temp_string);
|
||||
}
|
||||
|
||||
char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
||||
options.address_components |= ADDRESS_ANY;
|
||||
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
|
||||
options.address_components |= LIBPOSTAL_ADDRESS_ANY;
|
||||
|
||||
uint64_t normalize_string_options = get_normalize_string_options(options);
|
||||
|
||||
@@ -980,14 +980,14 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
||||
|
||||
}
|
||||
|
||||
/*
Frees an array of expansion strings: each of the first n entries of
expansions is passed to free(), then the array itself is freed.

expansions: array of heap-allocated strings
n:          number of entries in expansions

There is no NULL check on expansions here, so with n > 0 it is
dereferenced unconditionally.

NOTE(review): this is a rendered diff; the first signature line appears
to be the pre-rename name and the second the libpostal_-prefixed
replacement. Both share the body below.
*/
void expansion_array_destroy(char **expansions, size_t n) {
|
||||
void libpostal_expansion_array_destroy(char **expansions, size_t n) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
free(expansions[i]);
|
||||
}
|
||||
// Entries are released first; the container goes last
|
||||
free(expansions);
|
||||
}
|
||||
|
||||
void address_parser_response_destroy(address_parser_response_t *self) {
|
||||
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
|
||||
if (self == NULL) return;
|
||||
|
||||
for (size_t i = 0; i < self->num_components; i++) {
|
||||
@@ -1011,23 +1011,23 @@ void address_parser_response_destroy(address_parser_response_t *self) {
|
||||
free(self);
|
||||
}
|
||||
|
||||
/*
File-scope default parser options: both language and country are NULL.

NOTE(review): rendered diff; the first declaration line appears to be the
old type name and the second the libpostal_-prefixed one.
*/
static address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS = {
|
||||
static libpostal_address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS = {
|
||||
.language = NULL,
|
||||
.country = NULL
|
||||
};
|
||||
|
||||
/*
Returns LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS by value, so the caller
receives its own copy of the options struct.

NOTE(review): rendered diff; the first signature line appears to be the
pre-rename function name and the second the new public name.
*/
inline address_parser_options_t get_libpostal_address_parser_default_options(void) {
|
||||
inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) {
|
||||
return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
|
||||
}
|
||||
|
||||
address_parser_response_t *parse_address(char *address, address_parser_options_t options) {
|
||||
libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) {
|
||||
address_parser_context_t *context = address_parser_context_new();
|
||||
address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country, context);
|
||||
libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country, context);
|
||||
|
||||
if (parsed == NULL) {
|
||||
log_error("Parser returned NULL\n");
|
||||
address_parser_context_destroy(context);
|
||||
address_parser_response_destroy(parsed);
|
||||
libpostal_address_parser_response_destroy(parsed);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -10,31 +10,31 @@ extern "C" {
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// Size limit for a language code. The expansion tests in this commit assert
// strlen(lang) < this value; presumably it is a buffer size that includes
// the terminating NUL (confirm against the users of this constant).
// NOTE(review): rendered diff; old name first, libpostal_-prefixed name second.
#define MAX_LANGUAGE_LEN 4
|
||||
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4
|
||||
|
||||
/*
|
||||
Address dictionaries
|
||||
*/
|
||||
// Bit set, should be able to keep it at a short (uint16_t)
// Each component flag below is a distinct single bit, so flags can be
// combined with bitwise OR (expand_address ORs the ANY flag into
// options.address_components). NONE is 0 and ALL is the low 16 bits set.
// Bits 10 through 12 are not assigned by any define visible here.
// NOTE(review): rendered diff; each unprefixed ADDRESS_* group appears to be
// the old spelling, followed by its LIBPOSTAL_ADDRESS_* replacement with the
// same value.
|
||||
#define ADDRESS_NONE 0
|
||||
#define ADDRESS_ANY (1 << 0)
|
||||
#define ADDRESS_NAME (1 << 1)
|
||||
#define ADDRESS_HOUSE_NUMBER (1 << 2)
|
||||
#define ADDRESS_STREET (1 << 3)
|
||||
#define ADDRESS_UNIT (1 << 4)
|
||||
#define ADDRESS_LEVEL (1 << 5)
|
||||
#define ADDRESS_STAIRCASE (1 << 6)
|
||||
#define ADDRESS_ENTRANCE (1 << 7)
|
||||
#define LIBPOSTAL_ADDRESS_NONE 0
|
||||
#define LIBPOSTAL_ADDRESS_ANY (1 << 0)
|
||||
#define LIBPOSTAL_ADDRESS_NAME (1 << 1)
|
||||
#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER (1 << 2)
|
||||
#define LIBPOSTAL_ADDRESS_STREET (1 << 3)
|
||||
#define LIBPOSTAL_ADDRESS_UNIT (1 << 4)
|
||||
#define LIBPOSTAL_ADDRESS_LEVEL (1 << 5)
|
||||
#define LIBPOSTAL_ADDRESS_STAIRCASE (1 << 6)
|
||||
#define LIBPOSTAL_ADDRESS_ENTRANCE (1 << 7)
|
||||
|
||||
#define ADDRESS_CATEGORY (1 << 8)
|
||||
#define ADDRESS_NEAR (1 << 9)
|
||||
#define LIBPOSTAL_ADDRESS_CATEGORY (1 << 8)
|
||||
#define LIBPOSTAL_ADDRESS_NEAR (1 << 9)
|
||||
|
||||
// The numbering jumps from bit 9 to bit 13 here
|
||||
#define ADDRESS_TOPONYM (1 << 13)
|
||||
#define ADDRESS_POSTAL_CODE (1 << 14)
|
||||
#define ADDRESS_PO_BOX (1 << 15)
|
||||
#define ADDRESS_ALL ((1 << 16) - 1)
|
||||
#define LIBPOSTAL_ADDRESS_TOPONYM (1 << 13)
|
||||
#define LIBPOSTAL_ADDRESS_POSTAL_CODE (1 << 14)
|
||||
#define LIBPOSTAL_ADDRESS_PO_BOX (1 << 15)
|
||||
#define LIBPOSTAL_ADDRESS_ALL ((1 << 16) - 1)
|
||||
|
||||
typedef struct normalize_options {
|
||||
typedef struct libpostal_normalize_options {
|
||||
// List of language codes
|
||||
char **languages;
|
||||
size_t num_languages;
|
||||
@@ -60,34 +60,34 @@ typedef struct normalize_options {
|
||||
bool expand_numex;
|
||||
bool roman_numerals;
|
||||
|
||||
} normalize_options_t;
|
||||
} libpostal_normalize_options_t;
|
||||
|
||||
// Public expansion API.
// NOTE(review): rendered diff; each prototype appears twice, old name first
// and the libpostal_-prefixed name second.

// Returns the default normalization options by value.
normalize_options_t get_libpostal_default_options(void);
|
||||
libpostal_normalize_options_t libpostal_get_default_options(void);
|
||||
|
||||
// Expands input into an array of strings and stores the number of entries
// in *n. The README example iterates the result up to *n and then releases
// it with the destroy function declared below.
char **expand_address(char *input, normalize_options_t options, size_t *n);
|
||||
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
|
||||
|
||||
// Frees the n strings in expansions and then the array itself.
void expansion_array_destroy(char **expansions, size_t n);
|
||||
void libpostal_expansion_array_destroy(char **expansions, size_t n);
|
||||
|
||||
/*
|
||||
Address parser
|
||||
*/
|
||||
|
||||
// Result of parsing one address. The README example prints
// labels[i] and components[i] together for i in [0, num_components),
// i.e. the two arrays are indexed in parallel.
// NOTE(review): rendered diff; the old and new struct/typedef names both
// appear, the libpostal_-prefixed ones being the replacement.
typedef struct address_parser_response {
|
||||
typedef struct libpostal_address_parser_response {
|
||||
size_t num_components;
|
||||
char **components;
|
||||
char **labels;
|
||||
} address_parser_response_t;
|
||||
} libpostal_address_parser_response_t;
|
||||
|
||||
// Options passed by value to the address parser. Both fields are NULL in
// the library's default options; parse_address forwards them unchanged to
// address_parser_parse.
// NOTE(review): rendered diff; the old and new struct/typedef names both
// appear, the libpostal_-prefixed ones being the replacement.
typedef struct address_parser_options {
|
||||
typedef struct libpostal_address_parser_options {
|
||||
char *language;
|
||||
char *country;
|
||||
} address_parser_options_t;
|
||||
} libpostal_address_parser_options_t;
|
||||
|
||||
// Public parser API.
// NOTE(review): rendered diff; each prototype appears twice, old name first
// and the libpostal_-prefixed name second.

// Releases a parser response, including the struct itself. A NULL self is
// accepted and ignored.
void address_parser_response_destroy(address_parser_response_t *self);
|
||||
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
|
||||
|
||||
// Returns the default parser options (language and country both NULL) by value.
address_parser_options_t get_libpostal_address_parser_default_options(void);
|
||||
libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
|
||||
|
||||
// Parses address using options.language and options.country. Returns NULL
// when the underlying parser returns NULL. The README example releases a
// successful result with the response destroy function above.
address_parser_response_t *parse_address(char *address, address_parser_options_t options);
|
||||
libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
|
||||
|
||||
// Setup/teardown methods
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
||||
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
||||
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
|
||||
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
||||
LIBPOSTAL_GEODB_FILE="geodb.tar.gz"
|
||||
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
||||
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||
|
||||
@@ -22,12 +21,10 @@ LIBPOSTAL_DATA_DIR=$3
|
||||
mkdir -p $LIBPOSTAL_DATA_DIR
|
||||
|
||||
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
|
||||
LIBPOSTAL_GEO_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_geo
|
||||
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
|
||||
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier
|
||||
|
||||
BASIC_MODULE_DIRS="address_expansions numex transliteration"
|
||||
GEODB_MODULE_DIR=geodb
|
||||
PARSER_MODULE_DIR=address_parser
|
||||
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier
|
||||
|
||||
@@ -133,11 +130,10 @@ if [ $COMMAND = "download" ]; then
|
||||
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
||||
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file"
|
||||
fi
|
||||
if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then
|
||||
download_file $LIBPOSTAL_GEO_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_GEODB_FILE "geodb data file"
|
||||
fi
|
||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
||||
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_FILE "parser data file"
|
||||
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest)
|
||||
parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE"
|
||||
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file"
|
||||
fi
|
||||
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
||||
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file"
|
||||
@@ -150,11 +146,6 @@ elif [ $COMMAND = "upload" ]; then
|
||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY
|
||||
fi
|
||||
|
||||
if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then
|
||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $GEODB_MODULE_DIR
|
||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $LIBPOSTAL_S3_KEY
|
||||
fi
|
||||
|
||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
|
||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY
|
||||
|
||||
@@ -13,10 +13,10 @@
|
||||
|
||||
#define LIBPOSTAL_USAGE "Usage: ./libpostal address [...languages] [--json]\n"
|
||||
|
||||
static inline void print_output(char *address, normalize_options_t options, bool use_json) {
|
||||
static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json) {
|
||||
size_t num_expansions;
|
||||
|
||||
char **strings = expand_address(address, options, &num_expansions);
|
||||
char **strings = libpostal_expand_address(address, options, &num_expansions);
|
||||
|
||||
char *normalized;
|
||||
|
||||
@@ -79,7 +79,7 @@ int main(int argc, char **argv) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
|
||||
if (languages != NULL) {
|
||||
options.languages = languages->a;
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
|
||||
SUITE(libpostal_expansion_tests);
|
||||
|
||||
static greatest_test_res test_expansion_contains(char *input, char *output, normalize_options_t options) {
|
||||
static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) {
|
||||
size_t num_expansions;
|
||||
char **expansions = expand_address(input, options, &num_expansions);
|
||||
char **expansions = libpostal_expand_address(input, options, &num_expansions);
|
||||
|
||||
bool contains_expansion = false;
|
||||
char *expansion;
|
||||
@@ -23,6 +23,8 @@ static greatest_test_res test_expansion_contains(char *input, char *output, norm
|
||||
|
||||
}
|
||||
|
||||
libpostal_expansion_array_destroy(expansions, num_expansions);
|
||||
|
||||
if (!contains_expansion) {
|
||||
printf("Expansions should contain %s, got {", output);
|
||||
for (size_t i = 0; i < num_expansions; i++) {
|
||||
@@ -36,7 +38,7 @@ static greatest_test_res test_expansion_contains(char *input, char *output, norm
|
||||
PASS();
|
||||
}
|
||||
|
||||
static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, normalize_options_t options, size_t num_languages, ...) {
|
||||
static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) {
|
||||
char **languages = NULL;
|
||||
|
||||
size_t i;
|
||||
@@ -50,7 +52,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha
|
||||
|
||||
for (i = 0; i < num_languages; i++) {
|
||||
lang = va_arg(args, char *);
|
||||
ASSERT(strlen(lang) < MAX_LANGUAGE_LEN);
|
||||
ASSERT(strlen(lang) < LIBPOSTAL_MAX_LANGUAGE_LEN);
|
||||
languages[i] = strdup(lang);
|
||||
}
|
||||
|
||||
@@ -75,7 +77,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha
|
||||
|
||||
|
||||
TEST test_expansions(void) {
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
|
||||
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
|
||||
@@ -86,7 +88,7 @@ TEST test_expansions(void) {
|
||||
}
|
||||
|
||||
TEST test_expansions_language_classifier(void) {
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
|
||||
CHECK_CALL(test_expansion_contains_with_languages("V XX Sett", "via 20 settembre", options, 0, NULL));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("C/ Ocho", "calle 8", options, 0, NULL));
|
||||
@@ -94,7 +96,7 @@ TEST test_expansions_language_classifier(void) {
|
||||
}
|
||||
|
||||
TEST test_expansions_no_options(void) {
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
options.lowercase = false;
|
||||
options.latin_ascii = false;
|
||||
options.transliterate = false;
|
||||
|
||||
@@ -14,8 +14,8 @@ typedef struct labeled_component {
|
||||
char *component;
|
||||
} labeled_component_t;
|
||||
|
||||
static greatest_test_res test_parse_result_equals(char *input, address_parser_options_t options, size_t output_len, ...) {
|
||||
address_parser_response_t *response = parse_address(input, options);
|
||||
static greatest_test_res test_parse_result_equals(char *input, libpostal_address_parser_options_t options, size_t output_len, ...) {
|
||||
libpostal_address_parser_response_t *response = libpostal_parse_address(input, options);
|
||||
|
||||
va_list args;
|
||||
|
||||
@@ -56,11 +56,11 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op
|
||||
printf("%s: %s\n", response->labels[i], response->components[i]);
|
||||
}
|
||||
va_end(args);
|
||||
address_parser_response_destroy(response);
|
||||
libpostal_address_parser_response_destroy(response);
|
||||
FAIL();
|
||||
}
|
||||
|
||||
address_parser_response_destroy(response);
|
||||
libpostal_address_parser_response_destroy(response);
|
||||
|
||||
PASS();
|
||||
}
|
||||
@@ -68,7 +68,7 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op
|
||||
|
||||
|
||||
TEST test_us_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"Black Alliance for Just Immigration 660 Nostrand Ave, Brooklyn, N.Y., 11216",
|
||||
@@ -631,7 +631,7 @@ TEST test_us_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_ca_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From: https://github.com/openvenues/libpostal/issues/55
|
||||
@@ -694,7 +694,7 @@ TEST test_ca_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_jm_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From https://github.com/openvenues/libpostal/issues/113
|
||||
@@ -730,7 +730,7 @@ TEST test_jm_parses(void) {
|
||||
|
||||
|
||||
TEST test_gb_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
@@ -1069,7 +1069,7 @@ TEST test_gb_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_im_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Multiple house names
|
||||
@@ -1089,7 +1089,7 @@ TEST test_im_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_nz_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"wellington new zealand",
|
||||
@@ -1103,7 +1103,7 @@ TEST test_nz_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_fr_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From https://github.com/pelias/pelias/issues/426
|
||||
"Chambéry",
|
||||
@@ -1169,7 +1169,7 @@ TEST test_fr_parses(void) {
|
||||
|
||||
|
||||
TEST test_es_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
// Use Spanish toponym
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
@@ -1214,7 +1214,7 @@ TEST test_es_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_co_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"Cra 18#63-64 B Chapinero Bogotá DC Colombia",
|
||||
@@ -1277,7 +1277,7 @@ TEST test_co_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_mx_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
// From: https://github.com/openvenues/libpostal/issues/126
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
@@ -1309,7 +1309,7 @@ TEST test_mx_parses(void) {
|
||||
|
||||
|
||||
TEST test_br_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Brazil address with sem número (s/n) and CEP used with postal code
|
||||
@@ -1328,7 +1328,7 @@ TEST test_br_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_cn_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From https://github.com/openvenues/libpostal/issues/71
|
||||
@@ -1351,7 +1351,7 @@ TEST test_cn_parses(void) {
|
||||
|
||||
|
||||
TEST test_jp_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Example of a Kanji address
|
||||
@@ -1410,7 +1410,7 @@ TEST test_jp_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_kr_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// English/Romanized Korean, ro + gil address, English unit
|
||||
@@ -1429,7 +1429,7 @@ TEST test_kr_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_my_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From https://github.com/openvenues/libpostal/issues/121
|
||||
@@ -1448,7 +1448,7 @@ TEST test_my_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_za_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Contains HTML entity which should be normalized
|
||||
@@ -1469,7 +1469,7 @@ TEST test_za_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_de_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
/* Contains German concatenated street suffix
|
||||
@@ -1518,7 +1518,7 @@ TEST test_de_parses(void) {
|
||||
|
||||
|
||||
TEST test_at_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"Eduard Sueß Gasse 9",
|
||||
@@ -1592,7 +1592,7 @@ TEST test_at_parses(void) {
|
||||
|
||||
|
||||
TEST test_nl_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From: https://github.com/openvenues/libpostal/issues/162
|
||||
"Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam NL",
|
||||
@@ -1637,7 +1637,7 @@ TEST test_nl_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_da_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"Valdemarsgade 42 4 t.v. København, 1665 Danmark",
|
||||
@@ -1655,7 +1655,7 @@ TEST test_da_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_fi_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
"1 Hämeenkatu, Tampere, Finland",
|
||||
@@ -1679,7 +1679,7 @@ TEST test_fi_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_no_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// From: https://github.com/openvenues/libpostal/issues/39#issuecomment-221027220
|
||||
@@ -1696,7 +1696,7 @@ TEST test_no_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_se_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Uses the "en trappa upp" (one floor up) form in Swedish addresses
|
||||
@@ -1714,7 +1714,7 @@ TEST test_se_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_hu_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Hungarian, 4-digit postal code
|
||||
@@ -1730,7 +1730,7 @@ TEST test_hu_parses(void) {
|
||||
}
|
||||
|
||||
TEST test_ro_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Romanian address with staircase
|
||||
@@ -1751,7 +1751,7 @@ TEST test_ro_parses(void) {
|
||||
|
||||
|
||||
TEST test_ru_parses(void) {
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
|
||||
CHECK_CALL(test_parse_result_equals(
|
||||
// Contains Cyrillic with abbreviations
|
||||
|
||||
Reference in New Issue
Block a user