From b8a12e05172f00fa8357345dc67c8776d0e54655 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 20 Mar 2017 00:58:52 -0400 Subject: [PATCH] [test] adding parser test cases in 22 countries. These may change, and I'm generlaly against putting every obscure test case in the world in here. It's better to measure accuracy in aggregate statistics instead of individual test cases (i.e. if a particular change to the parser improves overall performance but fails one test case, should we accept the improvement?) The thought here is: these represent parses that are used in documentation/examples, as well as most of those that have been brought up in Github issues from the initial release, and we want these specific tests to work from build to build. If a model fails one of these test cases, it shouldn't get pushed to our users. --- test/test_parser.c | 1132 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 1100 insertions(+), 32 deletions(-) diff --git a/test/test_parser.c b/test/test_parser.c index db6a11cf..2a7b7446 100644 --- a/test/test_parser.c +++ b/test/test_parser.c @@ -5,6 +5,7 @@ #include "greatest.h" #include "../src/libpostal.h" +#include "../src/address_parser.h" SUITE(libpostal_parser_tests); @@ -19,7 +20,30 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op va_list args; size_t i; - if (output_len != response->num_components) { + + + bool valid = output_len == response->num_components; + + if (valid) { + va_start(args, output_len); + + for (i = 0; i < response->num_components; i++) { + labeled_component_t lc = va_arg(args, labeled_component_t); + + if (!string_equals(lc.label, response->labels[i])) { + valid = false; + break; + } + if (!string_equals(lc.component, response->components[i])) { + valid = false; + break; + } + } + + va_end(args); + } + + if (!valid) { va_start(args, output_len); printf("Expected\n\n"); for (i = 0; i < output_len; i++) { @@ -36,17 +60,6 @@ static greatest_test_res test_parse_result_equals(char *input, address_parser_op FAIL(); } - va_start(args, output_len); - - for (i = 0; i < response->num_components; i++) { - labeled_component_t lc = va_arg(args, labeled_component_t); - - ASSERT_STR_EQ(lc.label, response->labels[i]); - ASSERT_STR_EQ(lc.component, response->components[i]); - } - - va_end(args); - address_parser_response_destroy(response); PASS(); @@ -58,6 +71,8 @@ TEST test_us_parses(void) { address_parser_options_t options = get_libpostal_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + // Rare venue name without any common venue tokens following it + // Neighborhood name "Barboncino 781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options, 9, @@ -71,11 +86,404 @@ TEST test_us_parses(void) { (labeled_component_t){"postcode", "11216"}, (labeled_component_t){"country", "usa"} )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/464 + "103 BEAL PKWY SE, FT WALTON BEACH, FL", + options, + 4, + (labeled_component_t){"house_number", "103"}, + (labeled_component_t){"road", "beal pkwy se"}, + (labeled_component_t){"city", "ft walton beach"}, + (labeled_component_t){"state", "fl"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/463 + "Canal Rd, Deltona FL", + options, + 3, + (labeled_component_t){"road", "canal rd"}, + (labeled_component_t){"city", "deltona"}, + (labeled_component_t){"state", "fl"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/125 + "123 Main St # 456 Oakland CA 94789", + options, + 6, + (labeled_component_t){"house_number", "123"}, + (labeled_component_t){"road", "main st"}, + (labeled_component_t){"unit", "# 456"}, + (labeled_component_t){"city", "oakland"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "94789"} + + )); + + CHECK_CALL(test_parse_result_equals( + "123 Main St Apt 456 Oakland CA 94789", + options, + 6, + (labeled_component_t){"house_number", "123"}, + (labeled_component_t){"road", "main st"}, + (labeled_component_t){"unit", "apt 456"}, + (labeled_component_t){"city", "oakland"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "94789"} + )); + + CHECK_CALL(test_parse_result_equals( + "123 Main St Apt #456 Oakland CA 94789", + options, + 6, + (labeled_component_t){"house_number", "123"}, + (labeled_component_t){"road", "main st"}, + (labeled_component_t){"unit", "apt #456"}, + (labeled_component_t){"city", "oakland"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "94789"} + )); + + CHECK_CALL(test_parse_result_equals( + "123 Main St Apt No. 456 Oakland CA 94789", + options, + 6, + (labeled_component_t){"house_number", "123"}, + (labeled_component_t){"road", "main st"}, + (labeled_component_t){"unit", "apt no. 456"}, + (labeled_component_t){"city", "oakland"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "94789"} + )); + + CHECK_CALL(test_parse_result_equals( + "whole foods nyc", + options, + 2, + (labeled_component_t){"house", "whole foods"}, + (labeled_component_t){"city", "nyc"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/427 + "921 83 street, nyc", + options, + 3, + (labeled_component_t){"house_number", "921"}, + (labeled_component_t){"road", "83 street"}, + (labeled_component_t){"city", "nyc"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/424 + "30 w 26 st", + options, + 2, + (labeled_component_t){"house_number", "30"}, + (labeled_component_t){"road", "w 26 st"} + )); + + CHECK_CALL(test_parse_result_equals( + "30 West 26th Street Sixth Floor", + options, + 3, + (labeled_component_t){"house_number", "30"}, + (labeled_component_t){"road", "west 26th street"}, + (labeled_component_t){"level", "sixth floor"} + )); + + CHECK_CALL(test_parse_result_equals( + "30 W 26th St 6th Fl", + options, + 3, + (labeled_component_t){"house_number", "30"}, + (labeled_component_t){"road", "w 26th st"}, + (labeled_component_t){"level", "6th fl"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/440 + "301 Commons Park S, Stamford, CT 06902", + options, + 5, + (labeled_component_t){"house_number", "301"}, + (labeled_component_t){"road", "commons park s"}, + (labeled_component_t){"city", "stamford"}, + (labeled_component_t){"state", "ct"}, + (labeled_component_t){"postcode", "06902"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + // House number range + "912-914 8TH ST, CLARKSTON, WA 99403", + options, + 5, + (labeled_component_t){"house_number", "912-914"}, + (labeled_component_t){"road", "8th st"}, + (labeled_component_t){"city", "clarkston"}, + (labeled_component_t){"state", "wa"}, + (labeled_component_t){"postcode", "99403"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + "2120 E Hill Street #104 Signal Hill CA 90755", + options, + 6, + (labeled_component_t){"house_number", "2120"}, + (labeled_component_t){"road", "e hill street"}, + (labeled_component_t){"unit", "#104"}, + (labeled_component_t){"city", "signal hill"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "90755"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + // 5-digit house number + "12200 Montecito Road #H206 Seal Beach 90740", + options, + 5, + (labeled_component_t){"house_number", "12200"}, + (labeled_component_t){"road", "montecito road"}, + (labeled_component_t){"unit", "#h206"}, + (labeled_component_t){"city", "seal beach"}, + (labeled_component_t){"postcode", "90740"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + // space between mc and carroll + "1036-1038 MC CARROLL ST CLARKSTON WA 99403", + options, + 5, + (labeled_component_t){"house_number", "1036-1038"}, + (labeled_component_t){"road", "mc carroll st"}, + (labeled_component_t){"city", "clarkston"}, + (labeled_component_t){"state", "wa"}, + (labeled_component_t){"postcode", "99403"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + // hyphenated house number + "2455-B W BENCH RD OTHELLO WA 99344", + options, + 5, + (labeled_component_t){"house_number", "2455-b"}, + (labeled_component_t){"road", "w bench rd"}, + (labeled_component_t){"city", "othello"}, + (labeled_component_t){"state", "wa"}, + (labeled_component_t){"postcode", "99344"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/151 + // city name is part of street + "473 Boston Rd, Wilbraham, MA", + options, + 4, + (labeled_component_t){"house_number", "473"}, + (labeled_component_t){"road", "boston rd"}, + (labeled_component_t){"city", "wilbraham"}, + (labeled_component_t){"state", "ma"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/45 + // House number is a valid postcode but not in context + // Postcode is a ZIP+4 so have to rely on masked digits + "25050 ALESSANDRO BLVD, STE B, MORENO VALLEY, CA, 92553-4313", + options, + 6, + (labeled_component_t){"house_number", "25050"}, + (labeled_component_t){"road", "alessandro blvd"}, + (labeled_component_t){"unit", "ste b"}, + (labeled_component_t){"city", "moreno valley"}, + (labeled_component_t){"state", "ca"}, + (labeled_component_t){"postcode", "92553-4313"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/21 + // PO box example + "PO Box 1, Seattle, WA 98103", + options, + 4, + (labeled_component_t){"po_box", "po box 1"}, + (labeled_component_t){"city", "seattle"}, + (labeled_component_t){"state", "wa"}, + (labeled_component_t){"postcode", "98103"} + )); + + + CHECK_CALL(test_parse_result_equals( + "4411 Stone Way North Seattle, King County, WA 98103", + options, + 6, + (labeled_component_t){"house_number", "4411"}, + (labeled_component_t){"road", "stone way north"}, + (labeled_component_t){"city", "seattle"}, + (labeled_component_t){"state_district", "king county"}, + (labeled_component_t){"state", "wa"}, + (labeled_component_t){"postcode", "98103"} + )); + + + // Tests of simple place names + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/114 + "Columbus, OH", + options, + 2, + (labeled_component_t){"city", "columbus"}, + (labeled_component_t){"state", "oh"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/114 + "San Francisco CA", + options, + 2, + (labeled_component_t){"city", "san francisco"}, + (labeled_component_t){"state", "ca"} + )); + + CHECK_CALL(test_parse_result_equals( + // Common alternative name for San Francicso + "SF CA", + options, + 2, + (labeled_component_t){"city", "sf"}, + (labeled_component_t){"state", "ca"} + )); + + CHECK_CALL(test_parse_result_equals( + // Carmel-by-the-Sea hyphenated + "Carmel-by-the-Sea, CA", + options, + 2, + (labeled_component_t){"city", "carmel-by-the-sea"}, + (labeled_component_t){"state", "ca"} + )); + + CHECK_CALL(test_parse_result_equals( + // Carmel-by-the-Sea de-hyphenated + "Carmel by the Sea, CA", + options, + 2, + (labeled_component_t){"city", "carmel by the sea"}, + (labeled_component_t){"state", "ca"} + )); + + // Disambiguation tests + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/53 + // Manhattan as city_district + "Manhattan, NY", + options, + 2, + (labeled_component_t){"city_district", "manhattan"}, + (labeled_component_t){"state", "ny"} + )); + + CHECK_CALL(test_parse_result_equals( + // Manhattan, Kansas - city + "Manhattan, KS", + options, + 2, + (labeled_component_t){"city", "manhattan"}, + (labeled_component_t){"state", "ks"} + )); + + CHECK_CALL(test_parse_result_equals( + // Brooklyn, NY - city_district + "Brooklyn, NY", + options, + 2, + (labeled_component_t){"city_district", "brooklyn"}, + (labeled_component_t){"state", "ny"} + )); + + CHECK_CALL(test_parse_result_equals( + // Brooklyn, Connecticut - city + "Brooklyn CT", + options, + 2, + (labeled_component_t){"city", "brooklyn"}, + (labeled_component_t){"state", "ct"} + )); + + CHECK_CALL(test_parse_result_equals( + // Here Brooklyn CT means "Brooklyn Court", a small street in Oregon + "18312 SE Brooklyn CT Gresham OR", + options, + 4, + (labeled_component_t){"house_number", "18312"}, + (labeled_component_t){"road", "se brooklyn ct"}, + (labeled_component_t){"city", "gresham"}, + (labeled_component_t){"state", "or"} + )); + PASS(); } +TEST test_ca_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); -TEST test_uk_parses(void) { + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/55 + "332 Menzies Street, Victoria, BC V8V 2G9", + options, + 5, + (labeled_component_t){"house_number", "332"}, + (labeled_component_t){"road", "menzies street"}, + (labeled_component_t){"city", "victoria"}, + (labeled_component_t){"state", "bc"}, + (labeled_component_t){"postcode", "v8v 2g9"} + )); +} + +TEST test_jm_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/113 + // Kingston postcode, rare case where single-digit number is a postcode + // Uses W.I for "West Indies" + "237 Old Hope Road, Kingston 6, Jamaica W.I", + options, + 6, + (labeled_component_t){"house_number", "237"}, + (labeled_component_t){"road", "old hope road"}, + (labeled_component_t){"city", "kingston"}, + (labeled_component_t){"postcode", "6"}, + (labeled_component_t){"country", "jamaica"}, + (labeled_component_t){"world_region", "w.i"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/113 + // Fractional house number + "16 1/2 Windward Road, Kingston 2, Jamaica", + options, + 5, + (labeled_component_t){"house_number", "16 1/2"}, + (labeled_component_t){"road", "windward road"}, + (labeled_component_t){"city", "kingston"}, + (labeled_component_t){"postcode", "2"}, + (labeled_component_t){"country", "jamaica"} + + )); + +} + + +TEST test_gb_parses(void) { address_parser_options_t options = get_libpostal_address_parser_default_options(); @@ -107,6 +515,139 @@ TEST test_uk_parses(void) { (labeled_component_t){"country", "united kingdom"} )); + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openownership/data-standard/issues/18 + "Aston House, Cornwall Avenue, London, N3 1LF", + options, + 4, + (labeled_component_t){"house", "aston house"}, + (labeled_component_t){"road", "cornwall avenue"}, + (labeled_component_t){"city", "london"}, + (labeled_component_t){"postcode", "n3 1lf"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/39 + "318 Upper Street, N1 2XQ London", + options, + 4, + (labeled_component_t){"house_number", "318"}, + (labeled_component_t){"road", "upper street"}, + (labeled_component_t){"postcode", "n1 2xq"}, + (labeled_component_t){"city", "london"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/39 + "21, Kingswood Road SW2 4JE, London", + options, + 4, + (labeled_component_t){"house_number", "21"}, + (labeled_component_t){"road", "kingswood road"}, + (labeled_component_t){"postcode", "sw2 4je"}, + (labeled_component_t){"city", "london"} + )); + + CHECK_CALL(test_parse_result_equals( + // From Moz tests + "1 Riverside Dr Liverpool, Merseyside L3 4EN", + options, + 5, + (labeled_component_t){"house_number", "1"}, + (labeled_component_t){"road", "riverside dr"}, + (labeled_component_t){"city", "liverpool"}, + (labeled_component_t){"state_district", "merseyside"}, + (labeled_component_t){"postcode", "l3 4en"} + )); + + CHECK_CALL(test_parse_result_equals( + "Knutsford, Cheshire East WA16 9EX, UK", + options, + 4, + (labeled_component_t){"city", "knutsford"}, + (labeled_component_t){"state_district", "cheshire east"}, + (labeled_component_t){"postcode", "wa16 9ex"}, + (labeled_component_t){"country", "uk"} + )); + + PASS(); +} + +TEST test_nz_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + "wellington new zealand", + options, + 2, + (labeled_component_t){"city", "wellington"}, + (labeled_component_t){"country", "new zealand"} + )); + + PASS(); +} + +TEST test_fr_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/426 + "Chambéry", + options, + 1, + (labeled_component_t){"city", "chambéry"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/pelias/pelias/issues/426 + "Chambery", + options, + 1, + (labeled_component_t){"city", "chambery"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/114 + "Paris, France", + options, + 2, + (labeled_component_t){"city", "paris"}, + (labeled_component_t){"country", "france"} + )); + + CHECK_CALL(test_parse_result_equals( + // Variant of above + "Paris", + options, + 1, + (labeled_component_t){"city", "paris"} + )); + + CHECK_CALL(test_parse_result_equals( + // Variant of above + "Paris, FR", + options, + 2, + (labeled_component_t){"city", "paris"}, + (labeled_component_t){"country", "fr"} + )); + + CHECK_CALL(test_parse_result_equals( + // Arrondissement Roman numerals + "IXe arrondissement Paris", + options, + 2, + (labeled_component_t){"city_district", "ixe arrondissement"}, + (labeled_component_t){"city", "paris"} + )); + + CHECK_CALL(test_parse_result_equals( + // Arrondissement Arabic numerals + "9e arrondissement Paris", + options, + 2, + (labeled_component_t){"city_district", "9e arrondissement"}, + (labeled_component_t){"city", "paris"} + )); PASS(); } @@ -121,11 +662,11 @@ TEST test_es_parses(void) { options, 6, (labeled_component_t){"house", "museo del prado"}, - (labeled_component_t){"road", "c. de ruiz de alarcón"}, + (labeled_component_t){"road", "c. de ruiz de alarcón"}, (labeled_component_t){"house_number", "23"}, (labeled_component_t){"postcode", "28014"}, (labeled_component_t){"city", "madrid"}, - (labeled_component_t){"country", "españa"} + (labeled_component_t){"country", "españa"} )); // Use English toponym @@ -134,13 +675,253 @@ TEST test_es_parses(void) { options, 6, (labeled_component_t){"house", "museo del prado"}, - (labeled_component_t){"road", "c. de ruiz de alarcón"}, + (labeled_component_t){"road", "c. de ruiz de alarcón"}, (labeled_component_t){"house_number", "23"}, (labeled_component_t){"postcode", "28014"}, (labeled_component_t){"city", "madrid"}, (labeled_component_t){"country", "spain"} )); + CHECK_CALL(test_parse_result_equals( + // Spanish-style floor number + "Paseo de la Castellana, 185 - 5º, 28046 Madrid Madrid", + options, + 6, + (labeled_component_t){"road", "paseo de la castellana"}, + (labeled_component_t){"house_number", "185"}, + (labeled_component_t){"level", "5º"}, + (labeled_component_t){"postcode", "28046"}, + (labeled_component_t){"city", "madrid"}, + (labeled_component_t){"state", "madrid"} + )); + + CHECK_CALL(test_parse_result_equals( + // Spanish-style floor number + side (unit) + "Av. de las Delicias, 14, 1º Dcha, 28045 Madrid", + options, + 6, + (labeled_component_t){"road", "av. de las delicias"}, + (labeled_component_t){"house_number", "14"}, + (labeled_component_t){"level", "1º"}, + (labeled_component_t){"unit", "dcha"}, + (labeled_component_t){"postcode", "28045"}, + (labeled_component_t){"city", "madrid"} + )); + PASS(); +} + +TEST test_co_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + "Cra 18#63-64 B Chapinero Bogotá DC Colombia", + options, + 5, + (labeled_component_t){"road", "cra 18"}, + (labeled_component_t){"house_number", "#63-64 b"}, + (labeled_component_t){"city_district", "chapinero"}, + (labeled_component_t){"city", "bogotá dc"}, + (labeled_component_t){"country", "colombia"} + )); + + CHECK_CALL(test_parse_result_equals( + "Bogotá Colombia", + options, + 2, + (labeled_component_t){"city", "bogotá"}, + (labeled_component_t){"country", "colombia"} + )); + + // Test with country code (could also be Colorado, company, etc.) + CHECK_CALL(test_parse_result_equals( + "Bogotá CO", + options, + 2, + (labeled_component_t){"city", "bogotá"}, + (labeled_component_t){"country", "co"} + )); + + // Same tests without accent + CHECK_CALL(test_parse_result_equals( + "Cra 18#63-64 B Chapinero Bogota DC Colombia", + options, + 5, + (labeled_component_t){"road", "cra 18"}, + (labeled_component_t){"house_number", "#63-64 b"}, + (labeled_component_t){"city_district", "chapinero"}, + (labeled_component_t){"city", "bogota dc"}, + (labeled_component_t){"country", "colombia"} + )); + + CHECK_CALL(test_parse_result_equals( + "Bogota Colombia", + options, + 2, + (labeled_component_t){"city", "bogota"}, + (labeled_component_t){"country", "colombia"} + )); + + CHECK_CALL(test_parse_result_equals( + "Bogota CO", + options, + 2, + (labeled_component_t){"city", "bogota"}, + (labeled_component_t){"country", "co"} + )); + + + PASS(); +} + +TEST test_mx_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + // From: https://github.com/openvenues/libpostal/issues/126 + CHECK_CALL(test_parse_result_equals( + "LÓPEZ MATEOS, 106, 21840, MEXICALI, baja-california, mx", + options, + 6, + (labeled_component_t){"road", "lópez mateos"}, + (labeled_component_t){"house_number", "106"}, + (labeled_component_t){"postcode", "21840"}, + (labeled_component_t){"city", "mexicali"}, + (labeled_component_t){"state", "baja-california"}, + (labeled_component_t){"country", "mx"} + )); + + CHECK_CALL(test_parse_result_equals( + "LORENZO DE ZOVELA, 1126, 22715, PLAYAS DE ROSARITO, baja-california, mx", + options, + 6, + (labeled_component_t){"road", "lorenzo de zovela"}, + (labeled_component_t){"house_number", "1126"}, + (labeled_component_t){"postcode", "22715"}, + (labeled_component_t){"city", "playas de rosarito"}, + (labeled_component_t){"state", "baja-california"}, + (labeled_component_t){"country", "mx"} + )); + + PASS(); +} + + +TEST test_cn_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/71 + // Level, unit, road name containing a city (Hong Kong) + "中国,山东省,青岛市 香港东路6号,5号楼,8号室 李小方 先生收", + options, + 8, + (labeled_component_t){"country", "中国"}, + (labeled_component_t){"state", "山东省"}, + (labeled_component_t){"city", "青岛市"}, + (labeled_component_t){"road", "香港东路"}, + (labeled_component_t){"house_number", "6号"}, + (labeled_component_t){"level", "5号楼"}, + (labeled_component_t){"unit", "8号室"}, + (labeled_component_t){"house", "李小方 先生收"} + )); + PASS(); +} + + + +TEST test_jp_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // Example of a Kanji address + "〒601-8446京都市南区西九条高畠町25-1京都醸造株式会社", + options, + 6, + (labeled_component_t){"postcode", "〒601-8446"}, + (labeled_component_t){"city", "京都市"}, + (labeled_component_t){"city_district", "南区"}, + (labeled_component_t){"suburb", "西九条高畠町"}, + (labeled_component_t){"house_number", "25-1"}, + (labeled_component_t){"house", "京都醸造株式会社"} + )); + + CHECK_CALL(test_parse_result_equals( + // Ban-go style house number, level and unit + "日本国〒113-0001文京区4丁目3番2号3階323号室", + options, + 7, + (labeled_component_t){"country", "日本国"}, + (labeled_component_t){"postcode", "〒113-0001"}, + (labeled_component_t){"city", "文京区"}, + (labeled_component_t){"suburb", "4丁目"}, + (labeled_component_t){"house_number", "3番2号"}, + (labeled_component_t){"level", "3階"}, + (labeled_component_t){"unit", "323号室"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/123 + // University (slightly ambiguous i.e. the 2nd "Osaka" can be part of a campus name) + // English toponyms + "osaka university, osaka, japan, 565-0871", + options, + 4, + (labeled_component_t){"house", "osaka university"}, + (labeled_component_t){"city", "osaka"}, + (labeled_component_t){"country", "japan"}, + (labeled_component_t){"postcode", "565-0871"} + )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/62 + // Romaji + // Has road name (I think?) + "135, Higashifunahashi 2 Chome, Hirakata-shi Osaka-fu", + options, + 5, + (labeled_component_t){"house_number", "135"}, + (labeled_component_t){"road", "higashifunahashi"}, + (labeled_component_t){"suburb", "2 chome"}, + (labeled_component_t){"city", "hirakata-shi"}, + (labeled_component_t){"state", "osaka-fu"} + + )); + PASS(); +} + +TEST test_kr_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // English/Romanized Korean, ro + gil address, English unit + "Suite 1005, 36, Teheran-ro 87-gil, Gangnam-gu Seoul 06164 Republic of Korea", + options, + 7, + (labeled_component_t){"unit", "suite 1005"}, + (labeled_component_t){"house_number", "36"}, + (labeled_component_t){"road", "teheran-ro 87-gil"}, + (labeled_component_t){"city_district", "gangnam-gu"}, + (labeled_component_t){"city", "seoul"}, + (labeled_component_t){"postcode", "06164"}, + (labeled_component_t){"country", "republic of korea"} + )); + PASS(); +} + +TEST test_my_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/121 + // Not adding the block format yet in case we change how it's parsed + "IBS Centre Jalan Chan Sow Lin, 55200 Kuala Lumpur, Malaysia", + options, + 5, + (labeled_component_t){"house", "ibs centre"}, + (labeled_component_t){"road", "jalan chan sow lin"}, + (labeled_component_t){"postcode", "55200"}, + (labeled_component_t){"city", "kuala lumpur"}, + (labeled_component_t){"country", "malaysia"} + )); PASS(); } @@ -150,7 +931,7 @@ TEST test_za_parses(void) { CHECK_CALL(test_parse_result_equals( // Contains HTML entity which should be normalized // Contains 4-digit postcode, which can be confusable with a house number - "Double Shot Tea & Coffee 15 Melle St. Braamfontein Johannesburg, 2000, South Africa", + "Double Shot Tea & Coffee 15 Melle St. Braamfontein Johannesburg, 2001, South Africa", options, 7, (labeled_component_t){"house", "double shot tea & coffee"}, @@ -158,7 +939,7 @@ TEST test_za_parses(void) { (labeled_component_t){"road", "melle st."}, (labeled_component_t){"suburb", "braamfontein"}, (labeled_component_t){"city", "johannesburg"}, - (labeled_component_t){"postcode", "2000"}, + (labeled_component_t){"postcode", "2001"}, (labeled_component_t){"country", "south africa"} )); PASS(); @@ -169,8 +950,7 @@ TEST test_de_parses(void) { address_parser_options_t options = get_libpostal_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( - /* Contains Latin-ASCII normalizations - Contains German concatenated street suffix + /* Contains German concatenated street suffix N.B. We may want to move ä => ae out of the Latin-ASCII transliterator which will change the output of this test to e.g. eschenbräu bräurei @@ -178,13 +958,235 @@ TEST test_de_parses(void) { "Eschenbräu Bräurei Triftstraße 67 13353 Berlin Deutschland", options, 6, - (labeled_component_t){"house", "eschenbraeu braeurei"}, + (labeled_component_t){"house", "eschenbräu bräurei"}, + (labeled_component_t){"road", "triftstraße"}, + (labeled_component_t){"house_number", "67"}, + (labeled_component_t){"postcode", "13353"}, + (labeled_component_t){"city", "berlin"}, + (labeled_component_t){"country", "deutschland"} + )); + + // Test transliterated versions + CHECK_CALL(test_parse_result_equals( + "Eschenbrau Braurei Triftstrasse 67 13353 Berlin Deutschland", + options, + 6, + (labeled_component_t){"house", "eschenbrau braurei"}, (labeled_component_t){"road", "triftstrasse"}, (labeled_component_t){"house_number", "67"}, (labeled_component_t){"postcode", "13353"}, (labeled_component_t){"city", "berlin"}, (labeled_component_t){"country", "deutschland"} )); + + CHECK_CALL(test_parse_result_equals( + "Eschenbraeu Braeurei Triftstrasse 67 13353 Berlin DE", + options, + 6, + (labeled_component_t){"house", "eschenbraeu braeurei"}, + (labeled_component_t){"road", "triftstrasse"}, + (labeled_component_t){"house_number", "67"}, + (labeled_component_t){"postcode", "13353"}, + (labeled_component_t){"city", "berlin"}, + (labeled_component_t){"country", "de"} + )); + + PASS(); +} + + +TEST test_at_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + "Eduard Sueß Gasse 9", + options, + 2, + (labeled_component_t){"road", "eduard sueß gasse"}, + (labeled_component_t){"house_number", "9"} + )); + + CHECK_CALL(test_parse_result_equals( + "Eduard-Sueß Gasse 9", + options, + 2, + (labeled_component_t){"road", "eduard-sueß gasse"}, + (labeled_component_t){"house_number", "9"} + )); + + CHECK_CALL(test_parse_result_equals( + "Eduard-Sueß-Gasse 9", + options, + 2, + (labeled_component_t){"road", "eduard-sueß-gasse"}, + (labeled_component_t){"house_number", "9"} + )); + + CHECK_CALL(test_parse_result_equals( + "Eduard Sueß-Gasse 9", + options, + 2, + (labeled_component_t){"road", "eduard sueß-gasse"}, + (labeled_component_t){"house_number", "9"} + )); + + // From https://github.com/openvenues/libpostal/issues/128 + CHECK_CALL(test_parse_result_equals( + "Wien, Österreich", + options, + 2, + (labeled_component_t){"city", "wien"}, + (labeled_component_t){"country", "österreich"} + )); + + // Transliterations + CHECK_CALL(test_parse_result_equals( + "Wien, Osterreich", + options, + 2, + (labeled_component_t){"city", "wien"}, + (labeled_component_t){"country", "osterreich"} + )); + + CHECK_CALL(test_parse_result_equals( + "Wien, Oesterreich", + options, + 2, + (labeled_component_t){"city", "wien"}, + (labeled_component_t){"country", "oesterreich"} + )); + + CHECK_CALL(test_parse_result_equals( + // English names + "Vienna, Austria", + options, + 2, + (labeled_component_t){"city", "vienna"}, + (labeled_component_t){"country", "austria"} + )); + + PASS(); +} + + +TEST test_nl_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/162 + "Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam NL", + options, + 5, + (labeled_component_t){"road", "nieuwe binnenweg"}, + (labeled_component_t){"house_number", "17-19"}, + (labeled_component_t){"suburb", "oude westen"}, + (labeled_component_t){"city", "rotterdam"}, + (labeled_component_t){"country", "nl"} + )); + + CHECK_CALL(test_parse_result_equals( + "Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam", + options, + 4, + (labeled_component_t){"road", "nieuwe binnenweg"}, + (labeled_component_t){"house_number", "17-19"}, + (labeled_component_t){"suburb", "oude westen"}, + (labeled_component_t){"city", "rotterdam"} + )); + + CHECK_CALL(test_parse_result_equals( + "Oude Westen, Rotterdam", + options, + 2, + (labeled_component_t){"suburb", "oude westen"}, + (labeled_component_t){"city", "rotterdam"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/75 + "Olympia 1 A begane gro", + options, + 3, + (labeled_component_t){"road", "olympia"}, + (labeled_component_t){"house_number", "1 a"}, + (labeled_component_t){"level", "begane gro"} + )); + + PASS(); +} + +TEST test_da_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + "Valdemarsgade 42 4 t.v. København, 1665 Danmark", + options, + 6, + (labeled_component_t){"road", "valdemarsgade"}, + (labeled_component_t){"house_number", "42"}, + (labeled_component_t){"unit", "4 t.v."}, + (labeled_component_t){"city", "københavn"}, + (labeled_component_t){"postcode", "1665"}, + (labeled_component_t){"country", "danmark"} + )); + + PASS(); +} + +TEST test_fi_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + "1 Hämeenkatu, Tampere, Finland", + options, + 4, + (labeled_component_t){"house_number", "1"}, + (labeled_component_t){"road", "hämeenkatu"}, + (labeled_component_t){"city", "tampere"}, + (labeled_component_t){"country", "finland"} + )); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/111 + "Pitkämäentie", + options, + 1, + (labeled_component_t){"road", "pitkämäentie"} + )); + + PASS(); +} + +TEST test_no_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // From: https://github.com/openvenues/libpostal/issues/39#issuecomment-221027220 + "Sars gate 2 A, 562 OSLO", + options, + 4, + (labeled_component_t){"road", "sars gate"}, + (labeled_component_t){"house_number", "2 a"}, + (labeled_component_t){"postcode", "562"}, + (labeled_component_t){"city", "oslo"} + )); + PASS(); +} + +TEST test_se_parses(void) { + address_parser_options_t options = get_libpostal_address_parser_default_options(); + + CHECK_CALL(test_parse_result_equals( + // Uses the "en trappa upp" (one floor up) form in Swedish addresses + "Storgatan 1 en trappa upp 112 01 Stockholm Sweden", + options, + 6, + (labeled_component_t){"road", "storgatan"}, + (labeled_component_t){"house_number", "1"}, + (labeled_component_t){"level", "en trappa upp"}, + (labeled_component_t){"postcode", "112 01"}, + (labeled_component_t){"city", "stockholm"}, + (labeled_component_t){"country", "sweden"} + )); PASS(); } @@ -193,20 +1195,17 @@ TEST test_hu_parses(void) { CHECK_CALL(test_parse_result_equals( // Hungarian, 4-digit postal code - "Szimpla Kert Kazinczy utca 14 Budapest 1075, Magyarország", + "1075 Budapest kazinczy utca, 14", options, - 6, - (labeled_component_t){"house", "szimpla kert"}, - (labeled_component_t){"road", "kazinczy utca"}, - (labeled_component_t){"house_number", "14"}, - (labeled_component_t){"city", "budapest"}, + 4, (labeled_component_t){"postcode", "1075"}, - (labeled_component_t){"country", "magyarország"} + (labeled_component_t){"city", "budapest"}, + (labeled_component_t){"road", "kazinczy utca"}, + (labeled_component_t){"house_number", "14"} )); PASS(); } - TEST test_ru_parses(void) { address_parser_options_t options = get_libpostal_address_parser_default_options(); @@ -217,13 +1216,68 @@ TEST test_ru_parses(void) { "Государственный Эрмитаж Дворцовая наб., 34 191186, St. Petersburg, Russia", options, 6, - (labeled_component_t){"house", "государственный эрмитаж"}, + (labeled_component_t){"house", "государственный эрмитаж"}, (labeled_component_t){"road", "дворцовая наб."}, (labeled_component_t){"house_number", "34"}, (labeled_component_t){"postcode", "191186"}, (labeled_component_t){"city", "st. petersburg"}, (labeled_component_t){"country", "russia"} )); + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/138 + "Петрозаводск Карелия Российская Федерация", + options, + 3, + (labeled_component_t){"city", "петрозаводск"}, + (labeled_component_t){"state", "карелия"}, + (labeled_component_t){"country", "российская федерация"} + )); + + + CHECK_CALL(test_parse_result_equals( + // From https://github.com/openvenues/libpostal/issues/138 + "Автолюбителейроезд 24 Петрозаводск Карелия Российская Федерация 185013", + options, + 6, + (labeled_component_t){"road", "автолюбителейроезд"}, + (labeled_component_t){"house_number", "24"}, + (labeled_component_t){"city", "петрозаводск"}, + (labeled_component_t){"state", "карелия"}, + (labeled_component_t){"country", "российская федерация"}, + (labeled_component_t){"postcode", "185013"} + )); + + CHECK_CALL(test_parse_result_equals( + // Old Soviet format, from https://github.com/openvenues/libpostal/issues/125#issuecomment-269319652 + // Uses "г."" prefix for the city + // Uses "д." for house number + // Has apartment number with "кв." + "197198, г. Санкт-Петербург, ул. Съезжинская д. 10 кв. 40", + options, + 5, + (labeled_component_t){"postcode", "197198"}, + (labeled_component_t){"city", "г. санкт-петербург"}, + (labeled_component_t){"road", "ул. съезжинская"}, + (labeled_component_t){"house_number", "д. 10"}, + (labeled_component_t){"unit", "кв. 40"} + )); + + /* + CHECK_CALL(test_parse_result_equals( + // Uses genitive place names, see https://github.com/openvenues/libpostal/issues/125#issuecomment-269438636 + "188541, г. Сосновый Бор Ленинградской области, пр. Героев 40, кв. 400", + options, + 6, + (labeled_component_t){"postcode", "188541"}, + (labeled_component_t){"city", "г. сосновый бор"}, + (labeled_component_t){"state", "ленинградской области"}, + (labeled_component_t){"road", "пр. героев"}, + (labeled_component_t){"house_number", "40"}, + (labeled_component_t){"unit", "кв. 400"} + )); + */ + PASS(); } @@ -234,14 +1288,28 @@ SUITE(libpostal_parser_tests) { } RUN_TEST(test_us_parses); - RUN_TEST(test_uk_parses); + RUN_TEST(test_jm_parses); + RUN_TEST(test_gb_parses); + RUN_TEST(test_nz_parses); + RUN_TEST(test_fr_parses); RUN_TEST(test_es_parses); + RUN_TEST(test_co_parses); + RUN_TEST(test_mx_parses); + RUN_TEST(test_cn_parses); + RUN_TEST(test_jp_parses); + RUN_TEST(test_kr_parses); + RUN_TEST(test_my_parses); RUN_TEST(test_za_parses); RUN_TEST(test_de_parses); + RUN_TEST(test_at_parses); + RUN_TEST(test_nl_parses); + RUN_TEST(test_da_parses); + RUN_TEST(test_fi_parses); + RUN_TEST(test_no_parses); + RUN_TEST(test_se_parses); RUN_TEST(test_hu_parses); RUN_TEST(test_ru_parses); libpostal_teardown(); libpostal_teardown_parser(); } -