From c0e8578b9c86d20a20ffe3c4d048014b3dbb05c4 Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 28 May 2016 19:19:18 -0400
Subject: [PATCH] [gazetteers] Adding new gazetteer types/address components

---
Notes (commentary only; text in this section is not part of the commit):

 * src/gazetteer_data.c: DICTIONARY_LEVEL is replaced by five
   DICTIONARY_LEVEL_* entries mapped to ADDRESS_LEVEL, and DICTIONARY_UNIT by
   three DICTIONARY_UNIT_* entries mapped to ADDRESS_UNIT.
   DICTIONARY_NO_ADDRESS is dropped and DICTIONARY_NO_NUMBER is added.
   New entries: CATEGORY, CHAIN, CROSS_STREET, ENTRANCE, HOUSE_NUMBER, NUMBER,
   POSTAL_CODE, STAIRCASE. DICTIONARY_TOPONYM now maps to
   ADDRESS_NAME | ADDRESS_STREET | ADDRESS_TOPONYM.
 * src/gazetteers.h: existing enumerators are renumbered (POST_OFFICE 20 -> 33,
   NULL 22 -> 90, POSTAL_CODE 107 -> 101, TOPONYM 109 -> 100) and the
   LOCALITY / ADMIN* / NEIGHBORHOOD / COUNTRY dictionary types are removed.
   NOTE(review): anything that stored the old numeric values presumably needs
   regenerating -- confirm.
 * src/gazetteers.h: both DICTIONARY_POSTCODE = 34 and
   DICTIONARY_POSTAL_CODE = 101 are defined; only DICTIONARY_POSTAL_CODE is
   referenced by gazetteer_config in this patch.
 * src/libpostal.h: bits 7, 8 and 13 change meaning (LOCALITY -> ENTRANCE,
   ADMIN1 -> CATEGORY, COUNTRY -> TOPONYM); normalize_options.num_languages
   changes from int to size_t.
 * NUM_DICTIONARY_TYPES (unchanged context) expands to an unparenthesized
   sizeof(...) / sizeof(...) expression.

 src/gazetteer_data.c | 24 ++++++++++++++++++++----
 src/gazetteers.h     | 40 +++++++++++++++++++++++++---------------
 src/libpostal.h      | 15 +++++++--------
 3 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c
index 5905c937..693362f6 100644
--- a/src/gazetteer_data.c
+++ b/src/gazetteer_data.c
@@ -3,29 +3,45 @@ gazetteer_t gazetteer_config[] = {
     {DICTIONARY_ACADEMIC_DEGREE, ADDRESS_NAME},
     {DICTIONARY_AMBIGUOUS_EXPANSION, ADDRESS_NONE},
     {DICTIONARY_BUILDING_TYPE, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT},
+    {DICTIONARY_CATEGORY, ADDRESS_CATEGORY},
+    {DICTIONARY_CHAIN, ADDRESS_NAME},
     {DICTIONARY_COMPANY_TYPE, ADDRESS_NAME},
     {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET},
     {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET},
     {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET},
+    {DICTIONARY_CROSS_STREET, ADDRESS_STREET},
     {DICTIONARY_DIRECTIONAL, ADDRESS_ANY},
     {DICTIONARY_ELISION, ADDRESS_ANY},
+    {DICTIONARY_ENTRANCE, ADDRESS_ENTRANCE},
     {DICTIONARY_GIVEN_NAME, ADDRESS_STREET | ADDRESS_NAME},
-    {DICTIONARY_LEVEL, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT},
-    {DICTIONARY_NO_ADDRESS, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET},
+    {DICTIONARY_HOUSE_NUMBER, ADDRESS_HOUSE_NUMBER},
+    {DICTIONARY_LEVEL_NUMBERED, ADDRESS_LEVEL},
+    {DICTIONARY_LEVEL_STANDALONE, ADDRESS_LEVEL},
+    {DICTIONARY_LEVEL_MEZZANINE, ADDRESS_LEVEL},
+    {DICTIONARY_LEVEL_BASEMENT, ADDRESS_LEVEL},
+    {DICTIONARY_LEVEL_SUB_BASEMENT, ADDRESS_LEVEL},
     {DICTIONARY_NULL, ADDRESS_ANY},
     {DICTIONARY_NAMED_ORGANIZATION, ADDRESS_NAME},
     {DICTIONARY_NAMED_PERSON, ADDRESS_NAME | ADDRESS_STREET},
+    {DICTIONARY_NO_NUMBER, ADDRESS_HOUSE_NUMBER},
+    {DICTIONARY_NUMBER, ADDRESS_HOUSE_NUMBER | ADDRESS_UNIT | ADDRESS_LEVEL | ADDRESS_STAIRCASE | ADDRESS_ENTRANCE},
     {DICTIONARY_PERSONAL_SUFFIX, ADDRESS_NAME | ADDRESS_STREET},
     {DICTIONARY_PERSONAL_TITLE, ADDRESS_NAME | ADDRESS_STREET},
     {DICTIONARY_PLACE_NAME, ADDRESS_NAME | ADDRESS_STREET},
     {DICTIONARY_POST_OFFICE, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET},
+    {DICTIONARY_POSTAL_CODE, ADDRESS_POSTAL_CODE},
     {DICTIONARY_QUALIFIER, ADDRESS_STREET},
+    {DICTIONARY_STAIRCASE, ADDRESS_STAIRCASE},
     {DICTIONARY_STOPWORD, ADDRESS_ANY},
     {DICTIONARY_STREET_TYPE, ADDRESS_STREET},
     {DICTIONARY_SURNAME, ADDRESS_STREET | ADDRESS_NAME},
     {DICTIONARY_SYNONYM, ADDRESS_ANY},
-    {DICTIONARY_TOPONYM, ADDRESS_NAME | ADDRESS_STREET | ADDRESS_LOCALITY | ADDRESS_ADMIN1 | ADDRESS_ADMIN2 | ADDRESS_ADMIN3 | ADDRESS_ADMIN4 | ADDRESS_ADMIN_OTHER | ADDRESS_NEIGHBORHOOD},
-    {DICTIONARY_UNIT, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}
+    {DICTIONARY_TOPONYM, ADDRESS_NAME | ADDRESS_STREET | ADDRESS_TOPONYM},
+    {DICTIONARY_UNIT_NUMBERED, ADDRESS_UNIT},
+    {DICTIONARY_UNIT_STANDALONE, ADDRESS_UNIT},
+    {DICTIONARY_UNIT_DIRECTION, ADDRESS_UNIT}
+
 };
 
+
 #define NUM_DICTIONARY_TYPES sizeof(gazetteer_config) / sizeof(gazetteer_t)
diff --git a/src/gazetteers.h b/src/gazetteers.h
index dc45270c..4276805e 100644
--- a/src/gazetteers.h
+++ b/src/gazetteers.h
@@ -27,11 +27,22 @@ typedef enum dictionary_type {
     DICTIONARY_DIRECTIONAL = 15,
     DICTIONARY_QUALIFIER = 16,
     DICTIONARY_BUILDING_TYPE = 17,
-    DICTIONARY_LEVEL = 18,
-    DICTIONARY_UNIT = 19,
-    DICTIONARY_POST_OFFICE = 20,
-    DICTIONARY_NO_ADDRESS = 21,
-    DICTIONARY_NULL = 22,
+    DICTIONARY_LEVEL_NUMBERED = 18,
+    DICTIONARY_LEVEL_STANDALONE = 19,
+    DICTIONARY_LEVEL_MEZZANINE = 20,
+    DICTIONARY_LEVEL_BASEMENT = 21,
+    DICTIONARY_LEVEL_SUB_BASEMENT = 22,
+    DICTIONARY_UNIT_NUMBERED = 23,
+    DICTIONARY_UNIT_STANDALONE = 24,
+    DICTIONARY_UNIT_DIRECTION = 25,
+    DICTIONARY_ENTRANCE = 26,
+    DICTIONARY_STAIRCASE = 27,
+
+    DICTIONARY_NUMBER = 30,
+    DICTIONARY_NO_NUMBER = 31,
+    DICTIONARY_HOUSE_NUMBER = 32,
+    DICTIONARY_POST_OFFICE = 33,
+    DICTIONARY_POSTCODE = 34,
 
     DICTIONARY_PLACE_NAME = 50,
     DICTIONARY_COMPANY_TYPE = 51,
@@ -44,16 +55,15 @@ typedef enum dictionary_type {
     DICTIONARY_NAMED_PERSON = 60,
     DICTIONARY_NAMED_ORGANIZATION = 61,
 
-    DICTIONARY_LOCALITY = 100,
-    DICTIONARY_ADMIN1 = 101,
-    DICTIONARY_ADMIN2 = 102,
-    DICTIONARY_ADMIN3 = 103,
-    DICTIONARY_ADMIN4 = 104,
-    DICTIONARY_ADMIN_OTHER = 105,
-    DICTIONARY_NEIGHBORHOOD = 106,
-    DICTIONARY_POSTAL_CODE = 107,
-    DICTIONARY_COUNTRY = 108,
-    DICTIONARY_TOPONYM = 109
+    DICTIONARY_CATEGORY = 70,
+    DICTIONARY_CHAIN = 71,
+
+    DICTIONARY_CROSS_STREET = 80,
+
+    DICTIONARY_NULL = 90,
+
+    DICTIONARY_TOPONYM = 100,
+    DICTIONARY_POSTAL_CODE = 101,
 } dictionary_type_t;
 
 
diff --git a/src/libpostal.h b/src/libpostal.h
index 30646e59..e2267d8d 100644
--- a/src/libpostal.h
+++ b/src/libpostal.h
@@ -22,14 +22,13 @@ Address dictionaries
 #define ADDRESS_HOUSE_NUMBER (1 << 2)
 #define ADDRESS_STREET (1 << 3)
 #define ADDRESS_UNIT (1 << 4)
+#define ADDRESS_LEVEL (1 << 5)
+#define ADDRESS_STAIRCASE (1 << 6)
+#define ADDRESS_ENTRANCE (1 << 7)
 
-#define ADDRESS_LOCALITY (1 << 7)
-#define ADDRESS_ADMIN1 (1 << 8)
-#define ADDRESS_ADMIN2 (1 << 9)
-#define ADDRESS_ADMIN3 (1 << 10)
-#define ADDRESS_ADMIN4 (1 << 11)
-#define ADDRESS_ADMIN_OTHER (1 << 12)
-#define ADDRESS_COUNTRY (1 << 13)
+#define ADDRESS_CATEGORY (1 << 8)
+
+#define ADDRESS_TOPONYM (1 << 13)
 #define ADDRESS_POSTAL_CODE (1 << 14)
 #define ADDRESS_NEIGHBORHOOD (1 << 15)
 #define ADDRESS_ALL ((1 << 16) - 1)
@@ -37,7 +36,7 @@ Address dictionaries
 typedef struct normalize_options {
     // List of language codes
     char **languages;
-    int num_languages;
+    size_t num_languages;
     uint16_t address_components;
 
     // String options