diff --git a/README.md b/README.md index 8f9792d4..90686c81 100644 --- a/README.md +++ b/README.md @@ -132,15 +132,15 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - address_parser_options_t options = get_libpostal_address_parser_default_options(); - address_parser_response_t *parsed = parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); + libpostal_address_parser_response_t *parsed = libpostal_parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options); for (size_t i = 0; i < parsed->num_components; i++) { printf("%s: %s\n", parsed->labels[i], parsed->components[i]); } // Free parse result - address_parser_response_destroy(parsed); + libpostal_address_parser_response_destroy(parsed); // Teardown (only called once at the end of your program) libpostal_teardown(); @@ -220,15 +220,15 @@ int main(int argc, char **argv) { } size_t num_expansions; - normalize_options_t options = get_libpostal_default_options(); - char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); + libpostal_normalize_options_t options = libpostal_get_default_options(); + char **expansions = libpostal_expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); for (size_t i = 0; i < num_expansions; i++) { printf("%s\n", expansions[i]); } // Free expansions - expansion_array_destroy(expansions, num_expansions); + libpostal_expansion_array_destroy(expansions, num_expansions); // Teardown (only called once at the end of your program) libpostal_teardown(); diff --git a/src/Makefile.am b/src/Makefile.am index 62a51453..9982b30b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,3 @@ -SUBDIRS = sparkey - # Inherited from autoconf / user-specified CFLAGS_CONF = @CFLAGS@ CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wdeclaration-after-statement -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF) @@ -14,9 +12,10 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c msgpack_utils.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c -libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la $(CBLAS_LIBS) +libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c normalize.c bloom.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) +libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ dist_bin_SCRIPTS = libpostal_data @@ -27,7 +26,7 @@ noinst_LTLIBRARIES = libscanner.la libscanner_la_SOURCES = scanner.c libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA) -noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_geodb build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test +noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test libpostal_SOURCES = main.c json_encode.c libpostal_LDADD = libpostal.la @@ -41,9 +40,6 @@ address_parser_CFLAGS = $(CFLAGS_O3) build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_CFLAGS = $(CFLAGS_O3) -build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c graph.c graph_builder.c normalize.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c -build_geodb_LDADD = sparkey/libsparkey.la -build_geodb_CFLAGS = $(CFLAGS_O3) build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c build_numex_table_CFLAGS = $(CFLAGS_O3) build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c diff --git a/src/address_parser.c b/src/address_parser.c index d499b328..14d6afaf 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -842,7 +842,7 @@ bool is_valid_dictionary_phrase(phrase_t phrase) { } uint32_t address_phrase_types = expansion_value->components; - if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_CATEGORY | ADDRESS_NEAR | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_ENTRANCE | ADDRESS_STAIRCASE | ADDRESS_POSTAL_CODE | ADDRESS_PO_BOX)) { + if (address_phrase_types & (LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_PO_BOX)) { for (size_t i = 0; i < expansion_value->expansions->n; i++) { address_expansion_t expansion = expansion_value->expansions->a[i]; if (!address_expansion_in_dictionary(expansion, DICTIONARY_TOPONYM)) { @@ -913,7 +913,7 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, expansion_index = suffix_phrase.data; expansion_value = address_dictionary_get_expansions(expansion_index); - if (expansion_value->components & ADDRESS_STREET) { + if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) { response = (address_parser_phrase_t){ word, ADDRESS_PARSER_SUFFIX_PHRASE, @@ -928,8 +928,8 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, expansion_index = prefix_phrase.data; expansion_value = address_dictionary_get_expansions(expansion_index); - // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category - if (expansion_value->components ^ ADDRESS_ANY) { + // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category + if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) { response = (address_parser_phrase_t){ word, ADDRESS_PARSER_PREFIX_PHRASE, @@ -1164,16 +1164,16 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize add_word_feature = false; log_debug("phrase_string=%s\n", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_CATEGORY, "category", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_UNIT, "unit", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_PO_BOX, "po_box", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_LEVEL, "level", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_ENTRANCE, "entrance", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_STAIRCASE, "staircase", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_POSTAL_CODE, "postal_code", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string); + add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string); } } @@ -1330,8 +1330,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize expansion_index = prefix_phrase.data; expansion_value = address_dictionary_get_expansions(expansion_index); - // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category - if (expansion_value->components ^ ADDRESS_ANY) { + // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category + if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) { known_prefix = true; char_array_clear(phrase_tokens); prefix_len = prefix_phrase.len; @@ -1347,7 +1347,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize expansion_index = suffix_phrase.data; expansion_value = address_dictionary_get_expansions(expansion_index); - if (expansion_value->components & ADDRESS_STREET) { + if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) { known_suffix = true; char_array_clear(context->suffix_phrase); suffix_len = suffix_phrase.len; @@ -1582,20 +1582,20 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase); } - if (right_context_components & ADDRESS_STREET && !(right_context_components & ADDRESS_NAME)) { + if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) { feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number); feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word); if (right_context_affix != NULL && right_affix_type != NULL) { feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix); } break; - } else if (right_context_components & ADDRESS_NAME && !(right_context_components & ADDRESS_STREET)) { + } else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) { feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number); feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word); if (right_context_affix != NULL && right_affix_type != NULL) { feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix); } - } else if (right_context_components & (ADDRESS_NAME | ADDRESS_STREET)) { + } else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) { if (seen_number) { feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right"); feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word); @@ -1637,12 +1637,12 @@ bool address_parser_predict(address_parser_t *self, address_parser_context_t *co return false; } -address_parser_response_t *address_parser_response_new(void) { - address_parser_response_t *response = malloc(sizeof(address_parser_response_t)); +libpostal_address_parser_response_t *address_parser_response_new(void) { + libpostal_address_parser_response_t *response = malloc(sizeof(libpostal_address_parser_response_t)); return response; } -address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) { +libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) { if (address == NULL || context == NULL) return NULL; address_parser_t *parser = get_address_parser(); @@ -1693,7 +1693,7 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c country = NULL; address_parser_context_fill(context, parser, tokenized_str, language, country); - address_parser_response_t *response = NULL; + libpostal_address_parser_response_t *response = NULL; // If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether phrase_t only_phrase = NULL_PHRASE; diff --git a/src/address_parser.h b/src/address_parser.h index f77986d0..3504cba6 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -214,7 +214,7 @@ address_parser_t *address_parser_new_options(parser_options_t options); address_parser_t *get_address_parser(void); bool address_parser_load(char *dir); -address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context); +libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context); void address_parser_destroy(address_parser_t *self); char *address_parser_normalize_string(char *str); diff --git a/src/address_parser_cli.c b/src/address_parser_cli.c index eb02eb84..2ae40b82 100644 --- a/src/address_parser_cli.c +++ b/src/address_parser_cli.c @@ -117,10 +117,10 @@ int main(int argc, char **argv) { goto next_input; } - address_parser_response_t *parsed; - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_response_t *parsed; + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); - if ((parsed = parse_address(input, options))) { + if ((parsed = libpostal_parse_address(input, options))) { printf("\n"); printf("Result:\n\n"); printf("{\n"); @@ -134,7 +134,7 @@ int main(int argc, char **argv) { printf("}\n"); printf("\n"); - address_parser_response_destroy(parsed); + libpostal_address_parser_response_destroy(parsed); } else { log_error("Error parsing address\n"); exit(EXIT_FAILURE); diff --git a/src/address_parser_test.c b/src/address_parser_test.c index 966e60fc..276a6a5f 100644 --- a/src/address_parser_test.c +++ b/src/address_parser_test.c @@ -98,8 +98,6 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse char *prev_label = NULL; - address_parser_response_t *response = NULL; - size_t starting_errors = result->num_errors; bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, data_set->tokenized_str); diff --git a/src/address_parser_train.c b/src/address_parser_train.c index 0a254cb7..cdb038da 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -210,7 +210,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr if (sub_tokens->n > 1 && search_address_dictionaries_tokens_with_phrases(postal_code_normalized, sub_tokens, language, &postal_code_dictionary_phrases) && postal_code_dictionary_phrases->n > 0) { phrase_t first_postal_code_phrase = postal_code_dictionary_phrases->a[0]; address_expansion_value_t *value = address_dictionary_get_expansions(first_postal_code_phrase.data); - if (value != NULL && value->components & ADDRESS_POSTAL_CODE) { + if (value != NULL && value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) { char_array_clear(token_builder); size_t first_real_token_index = first_postal_code_phrase.start + first_postal_code_phrase.len; token_t first_real_token = sub_tokens->a[first_real_token_index]; @@ -255,7 +255,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr address_expansion_value_t *phrase_value = address_dictionary_get_expansions(current_phrase.data); size_t current_phrase_end = current_phrase.start + current_phrase.len; - if (phrase_value != NULL && phrase_value->components & ADDRESS_POSTAL_CODE) { + if (phrase_value != NULL && phrase_value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) { current_phrase_end = current_phrase.start; } diff --git a/src/bench.c b/src/bench.c index a3bda385..5a91e4b0 100644 --- a/src/bench.c +++ b/src/bench.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) { char *languages[argc - 2]; for (int i = 0; i < argc - 2; i++) { char *arg = argv[i + 2]; - if (strlen(arg) >= MAX_LANGUAGE_LEN) { - printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1); + if (strlen(arg) >= LIBPOSTAL_MAX_LANGUAGE_LEN) { + printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, LIBPOSTAL_MAX_LANGUAGE_LEN - 1); } languages[i] = arg; } @@ -38,7 +38,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - normalize_options_t options = get_libpostal_default_options(); + libpostal_normalize_options_t options = libpostal_get_default_options(); options.num_languages = 1; options.languages = languages; @@ -56,12 +56,8 @@ int main(int argc, char **argv) { clock_t t1 = clock(); for (int i = 0; i < num_loops; i++) { - strings = expand_address(str, options, &num_expansions); - for (uint64_t i = 0; i < num_expansions; i++) { - normalized = strings[i]; - free(normalized); - } - free(strings); + strings = libpostal_expand_address(str, options, &num_expansions); + libpostal_expansion_array_destroy(strings, num_expansions); } clock_t t2 = clock(); diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 492536f2..0c23759a 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -1,46 +1,46 @@ // Only need these for the in-memory dictionaries gazetteer_t gazetteer_config[] = { - {DICTIONARY_ACADEMIC_DEGREE, ADDRESS_NAME}, - {DICTIONARY_AMBIGUOUS_EXPANSION, ADDRESS_NONE}, - {DICTIONARY_BUILDING_TYPE, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}, - {DICTIONARY_CATEGORY, ADDRESS_CATEGORY}, - {DICTIONARY_CHAIN, ADDRESS_NAME}, - {DICTIONARY_COMPANY_TYPE, ADDRESS_NAME}, - {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_ANY}, - {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_ANY}, - {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_ANY}, - {DICTIONARY_CROSS_STREET, ADDRESS_STREET}, - {DICTIONARY_DIRECTIONAL, ADDRESS_ANY}, - {DICTIONARY_ELISION, ADDRESS_ANY}, - {DICTIONARY_ENTRANCE, ADDRESS_ENTRANCE}, - {DICTIONARY_GIVEN_NAME, ADDRESS_STREET | ADDRESS_NAME}, - {DICTIONARY_HOUSE_NUMBER, ADDRESS_HOUSE_NUMBER}, - {DICTIONARY_LEVEL_NUMBERED, ADDRESS_LEVEL}, - {DICTIONARY_LEVEL_STANDALONE, ADDRESS_LEVEL}, - {DICTIONARY_LEVEL_MEZZANINE, ADDRESS_LEVEL}, - {DICTIONARY_LEVEL_BASEMENT, ADDRESS_LEVEL}, - {DICTIONARY_LEVEL_SUB_BASEMENT, ADDRESS_LEVEL}, - {DICTIONARY_NEAR, ADDRESS_NEAR}, - {DICTIONARY_NULL, ADDRESS_ANY}, - {DICTIONARY_NAMED_ORGANIZATION, ADDRESS_NAME}, - {DICTIONARY_NAMED_PERSON, ADDRESS_NAME | ADDRESS_STREET}, - {DICTIONARY_NO_NUMBER, ADDRESS_HOUSE_NUMBER}, - {DICTIONARY_NUMBER, ADDRESS_HOUSE_NUMBER | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_STAIRCASE | ADDRESS_ENTRANCE}, - {DICTIONARY_PERSONAL_SUFFIX, ADDRESS_NAME | ADDRESS_STREET}, - {DICTIONARY_PERSONAL_TITLE, ADDRESS_NAME | ADDRESS_STREET}, - {DICTIONARY_PLACE_NAME, ADDRESS_NAME | ADDRESS_STREET}, - {DICTIONARY_POST_OFFICE, ADDRESS_PO_BOX}, - {DICTIONARY_POSTAL_CODE, ADDRESS_POSTAL_CODE}, - {DICTIONARY_QUALIFIER, ADDRESS_STREET}, - {DICTIONARY_STAIRCASE, ADDRESS_STAIRCASE}, - {DICTIONARY_STOPWORD, ADDRESS_ANY}, - {DICTIONARY_STREET_TYPE, ADDRESS_STREET}, - {DICTIONARY_SURNAME, ADDRESS_STREET | ADDRESS_NAME}, - {DICTIONARY_SYNONYM, ADDRESS_ANY}, - {DICTIONARY_TOPONYM, ADDRESS_NAME | ADDRESS_STREET | ADDRESS_TOPONYM}, - {DICTIONARY_UNIT_NUMBERED, ADDRESS_UNIT}, - {DICTIONARY_UNIT_STANDALONE, ADDRESS_UNIT}, - {DICTIONARY_UNIT_DIRECTION, ADDRESS_UNIT} + {DICTIONARY_ACADEMIC_DEGREE, LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_AMBIGUOUS_EXPANSION, LIBPOSTAL_ADDRESS_NONE}, + {DICTIONARY_BUILDING_TYPE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_UNIT}, + {DICTIONARY_CATEGORY, LIBPOSTAL_ADDRESS_CATEGORY}, + {DICTIONARY_CHAIN, LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_COMPANY_TYPE, LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_CROSS_STREET, LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_ELISION, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_ENTRANCE, LIBPOSTAL_ADDRESS_ENTRANCE}, + {DICTIONARY_GIVEN_NAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_HOUSE_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER}, + {DICTIONARY_LEVEL_NUMBERED, LIBPOSTAL_ADDRESS_LEVEL}, + {DICTIONARY_LEVEL_STANDALONE, LIBPOSTAL_ADDRESS_LEVEL}, + {DICTIONARY_LEVEL_MEZZANINE, LIBPOSTAL_ADDRESS_LEVEL}, + {DICTIONARY_LEVEL_BASEMENT, LIBPOSTAL_ADDRESS_LEVEL}, + {DICTIONARY_LEVEL_SUB_BASEMENT, LIBPOSTAL_ADDRESS_LEVEL}, + {DICTIONARY_NEAR, LIBPOSTAL_ADDRESS_NEAR}, + {DICTIONARY_NULL, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_NAMED_ORGANIZATION, LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_NAMED_PERSON, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_NO_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER}, + {DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, + {DICTIONARY_PERSONAL_SUFFIX, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_PERSONAL_TITLE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_PLACE_NAME, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_POST_OFFICE, LIBPOSTAL_ADDRESS_PO_BOX}, + {DICTIONARY_POSTAL_CODE, LIBPOSTAL_ADDRESS_POSTAL_CODE}, + {DICTIONARY_QUALIFIER, LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_STAIRCASE, LIBPOSTAL_ADDRESS_STAIRCASE}, + {DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_SURNAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_TOPONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_TOPONYM}, + {DICTIONARY_UNIT_NUMBERED, LIBPOSTAL_ADDRESS_UNIT}, + {DICTIONARY_UNIT_STANDALONE, LIBPOSTAL_ADDRESS_UNIT}, + {DICTIONARY_UNIT_DIRECTION, LIBPOSTAL_ADDRESS_UNIT} }; diff --git a/src/language_classifier_train.c b/src/language_classifier_train.c index 0fa33a99..ddf73719 100644 --- a/src/language_classifier_train.c +++ b/src/language_classifier_train.c @@ -27,7 +27,7 @@ static const size_t GAMMA_SCHEDULE_SIZE = sizeof(GAMMA_SCHEDULE) / sizeof(double #define DEFAULT_GAMMA_0 10.0 static double LAMBDA_SCHEDULE[] = {0.0, 1e-5, 1e-4, 0.001, 0.01, 0.1, \ - 0.2, 0.5, 1.0, 2.0, 5.0, 10.0}; + 0.2, 0.5, 1.0}; static const size_t LAMBDA_SCHEDULE_SIZE = sizeof(LAMBDA_SCHEDULE) / sizeof(double); #define DEFAULT_LAMBDA 0.0 diff --git a/src/libpostal.c b/src/libpostal.c index 8e0c90b5..11e10a2c 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -32,10 +32,10 @@ KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) #define DEFAULT_KEY_LEN 32 -static normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { +static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { .languages = NULL, .num_languages = 0, - .address_components = ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_PO_BOX | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_ENTRANCE | ADDRESS_STAIRCASE | ADDRESS_POSTAL_CODE, + .address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE, .latin_ascii = true, .transliterate = true, .strip_accents = true, @@ -56,11 +56,11 @@ static normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { .roman_numerals = true }; -normalize_options_t get_libpostal_default_options(void) { +libpostal_normalize_options_t libpostal_get_default_options(void) { return LIBPOSTAL_DEFAULT_OPTIONS; } -static inline uint64_t get_normalize_token_options(normalize_options_t options) { +static inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) { uint64_t normalize_token_options = 0; normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0; @@ -71,7 +71,7 @@ static inline uint64_t get_normalize_token_options(normalize_options_t options) return normalize_token_options; } -static inline uint64_t get_normalize_string_options(normalize_options_t options) { +static inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) { uint64_t normalize_string_options = 0; normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0; normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0; @@ -83,7 +83,7 @@ static inline uint64_t get_normalize_string_options(normalize_options_t options) return normalize_string_options; } -static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, normalize_options_t options) { +static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); @@ -135,7 +135,7 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke } } -static string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { +static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { char_array *key = NULL; log_debug("input=%s\n", str); @@ -500,7 +500,7 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt return tree; } -static void add_postprocessed_string(cstring_array *strings, char *str, normalize_options_t options) { +static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { cstring_array_add_string(strings, str); if (options.roman_numerals) { @@ -516,7 +516,7 @@ static void add_postprocessed_string(cstring_array *strings, char *str, normaliz -static address_expansion_array *get_affix_expansions(phrase_t phrase, normalize_options_t options) { +static address_expansion_array *get_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); if (value != NULL && value->components & options.address_components) { @@ -526,7 +526,7 @@ static address_expansion_array *get_affix_expansions(phrase_t phrase, normalize_ return NULL; } -static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, normalize_options_t options) { +static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { if (expansion.canonical_index != NULL_CANONICAL_INDEX) { char *canonical = address_dictionary_get_canonical(expansion.canonical_index); uint64_t normalize_string_options = get_normalize_string_options(options); @@ -542,7 +542,7 @@ static inline void cat_affix_expansion(char_array *key, char *str, address_expan } } -static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, normalize_options_t options) { +static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; bool have_suffix = suffix.len > 0 && suffix.len < token.len; @@ -753,7 +753,7 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok } -static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, normalize_options_t options) { +static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); @@ -764,7 +764,7 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); } -static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, normalize_options_t options) { +static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; for (size_t i = 0; i < tokens->n; i++) { @@ -795,7 +795,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s } -static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, normalize_options_t options) { +static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); @@ -901,8 +901,8 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_ char_array_destroy(temp_string); } -char **expand_address(char *input, normalize_options_t options, size_t *n) { - options.address_components |= ADDRESS_ANY; +char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { + options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -980,14 +980,14 @@ char **expand_address(char *input, normalize_options_t options, size_t *n) { } -void expansion_array_destroy(char **expansions, size_t n) { +void libpostal_expansion_array_destroy(char **expansions, size_t n) { for (size_t i = 0; i < n; i++) { free(expansions[i]); } free(expansions); } -void address_parser_response_destroy(address_parser_response_t *self) { +void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { if (self == NULL) return; for (size_t i = 0; i < self->num_components; i++) { @@ -1011,23 +1011,23 @@ void address_parser_response_destroy(address_parser_response_t *self) { free(self); } -static address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS = { +static libpostal_address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS = { .language = NULL, .country = NULL }; -inline address_parser_options_t get_libpostal_address_parser_default_options(void) { +inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) { return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS; } -address_parser_response_t *parse_address(char *address, address_parser_options_t options) { +libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) { address_parser_context_t *context = address_parser_context_new(); - address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country, context); + libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country, context); if (parsed == NULL) { log_error("Parser returned NULL\n"); address_parser_context_destroy(context); - address_parser_response_destroy(parsed); + libpostal_address_parser_response_destroy(parsed); return NULL; } diff --git a/src/libpostal.h b/src/libpostal.h index 487ee96e..3b86dea3 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -10,31 +10,31 @@ extern "C" { #include #include -#define MAX_LANGUAGE_LEN 4 +#define LIBPOSTAL_MAX_LANGUAGE_LEN 4 /* Address dictionaries */ // Bit set, should be able to keep it at a short (uint16_t) -#define ADDRESS_NONE 0 -#define ADDRESS_ANY (1 << 0) -#define ADDRESS_NAME (1 << 1) -#define ADDRESS_HOUSE_NUMBER (1 << 2) -#define ADDRESS_STREET (1 << 3) -#define ADDRESS_UNIT (1 << 4) -#define ADDRESS_LEVEL (1 << 5) -#define ADDRESS_STAIRCASE (1 << 6) -#define ADDRESS_ENTRANCE (1 << 7) +#define LIBPOSTAL_ADDRESS_NONE 0 +#define LIBPOSTAL_ADDRESS_ANY (1 << 0) +#define LIBPOSTAL_ADDRESS_NAME (1 << 1) +#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER (1 << 2) +#define LIBPOSTAL_ADDRESS_STREET (1 << 3) +#define LIBPOSTAL_ADDRESS_UNIT (1 << 4) +#define LIBPOSTAL_ADDRESS_LEVEL (1 << 5) +#define LIBPOSTAL_ADDRESS_STAIRCASE (1 << 6) +#define LIBPOSTAL_ADDRESS_ENTRANCE (1 << 7) -#define ADDRESS_CATEGORY (1 << 8) -#define ADDRESS_NEAR (1 << 9) +#define LIBPOSTAL_ADDRESS_CATEGORY (1 << 8) +#define LIBPOSTAL_ADDRESS_NEAR (1 << 9) -#define ADDRESS_TOPONYM (1 << 13) -#define ADDRESS_POSTAL_CODE (1 << 14) -#define ADDRESS_PO_BOX (1 << 15) -#define ADDRESS_ALL ((1 << 16) - 1) +#define LIBPOSTAL_ADDRESS_TOPONYM (1 << 13) +#define LIBPOSTAL_ADDRESS_POSTAL_CODE (1 << 14) +#define LIBPOSTAL_ADDRESS_PO_BOX (1 << 15) +#define LIBPOSTAL_ADDRESS_ALL ((1 << 16) - 1) -typedef struct normalize_options { +typedef struct libpostal_normalize_options { // List of language codes char **languages; size_t num_languages; @@ -60,34 +60,34 @@ typedef struct normalize_options { bool expand_numex; bool roman_numerals; -} normalize_options_t; +} libpostal_normalize_options_t; -normalize_options_t get_libpostal_default_options(void); +libpostal_normalize_options_t libpostal_get_default_options(void); -char **expand_address(char *input, normalize_options_t options, size_t *n); +char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -void expansion_array_destroy(char **expansions, size_t n); +void libpostal_expansion_array_destroy(char **expansions, size_t n); /* Address parser */ -typedef struct address_parser_response { +typedef struct libpostal_address_parser_response { size_t num_components; char **components; char **labels; -} address_parser_response_t; +} libpostal_address_parser_response_t; -typedef struct address_parser_options { +typedef struct libpostal_address_parser_options { char *language; char *country; -} address_parser_options_t; +} libpostal_address_parser_options_t; -void address_parser_response_destroy(address_parser_response_t *self); +void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self); -address_parser_options_t get_libpostal_address_parser_default_options(void); +libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void); -address_parser_response_t *parse_address(char *address, address_parser_options_t options); +libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options); // Setup/teardown methods diff --git a/src/libpostal_data b/src/libpostal_data index 01361e45..e653c9a2 100755 --- a/src/libpostal_data +++ b/src/libpostal_data @@ -11,7 +11,6 @@ LIBPOSTAL_S3_BUCKET_NAME="libpostal" LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME" LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com" LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" -LIBPOSTAL_GEODB_FILE="geodb.tar.gz" LIBPOSTAL_PARSER_FILE="parser.tar.gz" LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" @@ -22,12 +21,10 @@ LIBPOSTAL_DATA_DIR=$3 mkdir -p $LIBPOSTAL_DATA_DIR LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated -LIBPOSTAL_GEO_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_geo LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier BASIC_MODULE_DIRS="address_expansions numex transliteration" -GEODB_MODULE_DIR=geodb PARSER_MODULE_DIR=address_parser LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier @@ -133,11 +130,10 @@ if [ $COMMAND = "download" ]; then if [ $FILE = "base" ] || [ $FILE = "all" ]; then download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file" fi - if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_GEO_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_GEODB_FILE "geodb data file" - fi if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_FILE "parser data file" + latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest) + parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE" + download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file" fi if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" @@ -150,11 +146,6 @@ elif [ $COMMAND = "upload" ]; then aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY fi - if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then - tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $GEODB_MODULE_DIR - aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $LIBPOSTAL_S3_KEY - fi - if [ $FILE = "parser" ] || [ $FILE = "all" ]; then tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY diff --git a/src/main.c b/src/main.c index 22be1a76..cabc3e25 100644 --- a/src/main.c +++ b/src/main.c @@ -13,10 +13,10 @@ #define LIBPOSTAL_USAGE "Usage: ./libpostal address [...languages] [--json]\n" -static inline void print_output(char *address, normalize_options_t options, bool use_json) { +static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json) { size_t num_expansions; - char **strings = expand_address(address, options, &num_expansions); + char **strings = libpostal_expand_address(address, options, &num_expansions); char *normalized; @@ -79,7 +79,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - normalize_options_t options = get_libpostal_default_options(); + libpostal_normalize_options_t options = libpostal_get_default_options(); if (languages != NULL) { options.languages = languages->a; diff --git a/test/test_expand.c b/test/test_expand.c index a213a3fc..a1938984 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -8,9 +8,9 @@ SUITE(libpostal_expansion_tests); -static greatest_test_res test_expansion_contains(char *input, char *output, normalize_options_t options) { +static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { size_t num_expansions; - char **expansions = expand_address(input, options, &num_expansions); + char **expansions = libpostal_expand_address(input, options, &num_expansions); bool contains_expansion = false; char *expansion; @@ -23,6 +23,8 @@ static greatest_test_res test_expansion_contains(char *input, char *output, norm } + libpostal_expansion_array_destroy(expansions, num_expansions); + if (!contains_expansion) { printf("Expansions should contain %s, got {", output); for (size_t i = 0; i < num_expansions; i++) { @@ -36,7 +38,7 @@ static greatest_test_res test_expansion_contains(char *input, char *output, norm PASS(); } -static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, normalize_options_t options, size_t num_languages, ...) { +static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { char **languages = NULL; size_t i; @@ -50,7 +52,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha for (i = 0; i < num_languages; i++) { lang = va_arg(args, char *); - ASSERT(strlen(lang) < MAX_LANGUAGE_LEN); + ASSERT(strlen(lang) < LIBPOSTAL_MAX_LANGUAGE_LEN); languages[i] = strdup(lang); } @@ -75,7 +77,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha TEST test_expansions(void) { - normalize_options_t options = get_libpostal_default_options(); + libpostal_normalize_options_t options = libpostal_get_default_options(); CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); @@ -86,7 +88,7 @@ TEST test_expansions(void) { } TEST test_expansions_language_classifier(void) { - normalize_options_t options = get_libpostal_default_options(); + libpostal_normalize_options_t options = libpostal_get_default_options(); CHECK_CALL(test_expansion_contains_with_languages("V XX Sett", "via 20 settembre", options, 0, NULL)); CHECK_CALL(test_expansion_contains_with_languages("C/ Ocho", "calle 8", options, 0, NULL)); @@ -94,7 +96,7 @@ TEST test_expansions_language_classifier(void) { } TEST test_expansions_no_options(void) { - normalize_options_t options = get_libpostal_default_options(); + libpostal_normalize_options_t options = libpostal_get_default_options(); options.lowercase = false; options.latin_ascii = false; options.transliterate = false; diff --git a/test/test_parser.c b/test/test_parser.c index 75706dff..b29e932b 100644 --- a/test/test_parser.c +++ b/test/test_parser.c @@ -14,8 +14,8 @@ typedef struct labeled_component { char *component; } labeled_component_t; -static greatest_test_res test_parse_result_equals(char *input, address_parser_options_t options, size_t output_len, ...) { - address_parser_response_t *response = parse_address(input, options); +static greatest_test_res test_parse_result_equals(char *input, libpostal_address_parser_options_t options, size_t output_len, ...) { + libpostal_address_parser_response_t *response = libpostal_parse_address(input, options); va_list args; @@ -56,11 +56,11 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op printf("%s: %s\n", response->labels[i], response->components[i]); } va_end(args); - address_parser_response_destroy(response); + libpostal_address_parser_response_destroy(response); FAIL(); } - address_parser_response_destroy(response); + libpostal_address_parser_response_destroy(response); PASS(); } @@ -68,7 +68,7 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op TEST test_us_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "Black Alliance for Just Immigration 660 Nostrand Ave, Brooklyn, N.Y., 11216", @@ -631,7 +631,7 @@ TEST test_us_parses(void) { } TEST test_ca_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From: https://github.com/openvenues/libpostal/issues/55 @@ -694,7 +694,7 @@ TEST test_ca_parses(void) { } TEST test_jm_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From https://github.com/openvenues/libpostal/issues/113 @@ -730,7 +730,7 @@ TEST test_jm_parses(void) { TEST test_gb_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( @@ -1069,7 +1069,7 @@ TEST test_gb_parses(void) { } TEST test_im_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Multiple house names @@ -1089,7 +1089,7 @@ TEST test_im_parses(void) { } TEST test_nz_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "wellington new zealand", @@ -1103,7 +1103,7 @@ TEST test_nz_parses(void) { } TEST test_fr_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From https://github.com/pelias/pelias/issues/426 "Chambéry", @@ -1169,7 +1169,7 @@ TEST test_fr_parses(void) { TEST test_es_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); // Use Spanish toponym CHECK_CALL(test_parse_result_equals( @@ -1214,7 +1214,7 @@ TEST test_es_parses(void) { } TEST test_co_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "Cra 18#63-64 B Chapinero Bogotá DC Colombia", @@ -1277,7 +1277,7 @@ TEST test_co_parses(void) { } TEST test_mx_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); // From: https://github.com/openvenues/libpostal/issues/126 CHECK_CALL(test_parse_result_equals( @@ -1309,7 +1309,7 @@ TEST test_mx_parses(void) { TEST test_br_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Brazil address with sem número (s/n) and CEP used with postal code @@ -1328,7 +1328,7 @@ TEST test_br_parses(void) { } TEST test_cn_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From https://github.com/openvenues/libpostal/issues/71 @@ -1351,7 +1351,7 @@ TEST test_cn_parses(void) { TEST test_jp_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Example of a Kanji address @@ -1410,7 +1410,7 @@ TEST test_jp_parses(void) { } TEST test_kr_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // English/Romanized Korean, ro + gil address, English unit @@ -1429,7 +1429,7 @@ TEST test_kr_parses(void) { } TEST test_my_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From https://github.com/openvenues/libpostal/issues/121 @@ -1448,7 +1448,7 @@ TEST test_my_parses(void) { } TEST test_za_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Contains HTML entity which should be normalized @@ -1469,7 +1469,7 @@ TEST test_za_parses(void) { } TEST test_de_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( /* Contains German concatenated street suffix @@ -1518,7 +1518,7 @@ TEST test_de_parses(void) { TEST test_at_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "Eduard Sueß Gasse 9", @@ -1592,7 +1592,7 @@ TEST test_at_parses(void) { TEST test_nl_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From: https://github.com/openvenues/libpostal/issues/162 "Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam NL", @@ -1637,7 +1637,7 @@ TEST test_nl_parses(void) { } TEST test_da_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "Valdemarsgade 42 4 t.v. København, 1665 Danmark", @@ -1655,7 +1655,7 @@ TEST test_da_parses(void) { } TEST test_fi_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( "1 Hämeenkatu, Tampere, Finland", @@ -1679,7 +1679,7 @@ TEST test_fi_parses(void) { } TEST test_no_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // From: https://github.com/openvenues/libpostal/issues/39#issuecomment-221027220 @@ -1696,7 +1696,7 @@ TEST test_no_parses(void) { } TEST test_se_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Uses the "en trappa upp" (one floor up) form in Swedish addresses @@ -1714,7 +1714,7 @@ TEST test_se_parses(void) { } TEST test_hu_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Hungarian, 4-digit postal code @@ -1730,7 +1730,7 @@ TEST test_hu_parses(void) { } TEST test_ro_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Romanian address with staircase @@ -1751,7 +1751,7 @@ TEST test_ro_parses(void) { TEST test_ru_parses(void) { - address_parser_options_t options = get_libpostal_address_parser_default_options(); + libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( // Contains Cyrillic with abbreviations