#ifndef GAZETTEERS_H #define GAZETTEERS_H #ifdef __cplusplus extern "C" { #endif #include #include #include "klib/kvec.h" #include "vector.h" // Bit set, should be able to keep it at a short (uint16_t) #define ADDRESS_ANY 1 << 0 #define ADDRESS_NAME 1 << 1 #define ADDRESS_HOUSE_NUMBER 1 << 2 #define ADDRESS_STREET 1 << 3 #define ADDRESS_UNIT 1 << 4 #define ADDRESS_LOCALITY 1 << 7 #define ADDRESS_ADMIN1 1 << 8 #define ADDRESS_ADMIN2 1 << 9 #define ADDRESS_ADMIN3 1 << 10 #define ADDRESS_ADMIN4 1 << 11 #define ADDRESS_ADMIN_OTHER 1 << 12 #define ADDRESS_COUNTRY 1 << 13 #define ADDRESS_POSTAL_CODE 1 << 14 #define ADDRESS_NEIGHBORHOOD 1 << 15 typedef enum dictionary_type { DICTIONARY_ANY = 1, DICTIONARY_SYNONYM = 2, DICTIONARY_STOPWORD = 3, DICTIONARY_ELISION = 4, DICTIONARY_STREET_NAME = 10, DICTIONARY_STREET_TYPE = 11, DICTIONARY_CONCATENATED_PREFIX_SEPARABLE = 12, DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE = 13, DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE = 14, DICTIONARY_DIRECTIONAL = 15, DICTIONARY_QUALIFIER = 16, DICTIONARY_BUILDING_TYPE = 17, DICTIONARY_LEVEL = 18, DICTIONARY_UNIT = 19, DICTIONARY_POST_OFFICE = 20, DICTIONARY_NO_ADDRESS = 21, DICTIONARY_NULL = 22, DICTIONARY_PLACE_NAME = 50, DICTIONARY_COMPANY_TYPE = 51, DICTIONARY_GIVEN_NAME = 52, DICTIONARY_SURNAME = 53, DICTIONARY_PERSONAL_TITLE = 54, DICTIONARY_PERSONAL_SUFFIX = 55, DICTIONARY_ACADEMIC_DEGREE = 56, DICTIONARY_NAMED_PERSON = 60, DICTIONARY_NAMED_ORGANIZATION = 61, DICTIONARY_LOCALITY = 100, DICTIONARY_ADMIN1 = 101, DICTIONARY_ADMIN2 = 102, DICTIONARY_ADMIN3 = 103, DICTIONARY_ADMIN4 = 104, DICTIONARY_ADMIN_OTHER = 105, DICTIONARY_NEIGHBORHOOD = 106, DICTIONARY_POSTAL_CODE = 107, DICTIONARY_COUNTRY = 108, DICTIONARY_TOPONYM = 109 } dictionary_type_t; typedef struct gazetteer { dictionary_type_t type; uint16_t address_components; } gazetteer_t; typdef struct named_gazetteer { char name[64]; gazetteer_t gazetteer; } named_gazetteer_t; // Only need these for the in-memory dictionaries named_gazetteer_t gazetteer_config[] = { {"academic_degrees", {DICTIONARY_ACADEMIC_DEGREE, ADDRESS_NAME}}, {"building_types", {DICTIONARY_BUILDING_TYPE, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}}, {"company_types", {DICTIONARY_COMPANY_TYPE, ADDRESS_NAME}}, {"concatenated_prefixes_inseparable", {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET}}, {"concatenated_suffixes_inseparable", {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET}}, {"concatenated_suffixes_separable", {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET}}, {"directionals", {DICTIONARY_DIRECTIONAL, ADDRESS_ANY}}, {"elisions", {DICTIONARY_ELISION, ADDRESS_ANY}}, {"given_names", {DICTIONARY_GIVEN_NAME, ADDRESS_STREET | ADDRESS_NAME}}, {"level_types", {DICTIONARY_LEVEL, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}}, {"no_number", {DICTIONARY_NO_ADDRESS, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}}, {"nulls", {DICTIONARY_NULL, ADDRESS_ANY}}, {"organizations", {DICTIONARY_NAMED_ORGANIZATION, ADDRESS_NAME}}, {"people", {DICTIONARY_NAMED_PERSON, ADDRESS_NAME | ADDRESS_STREET}}, {"personal_suffixes", {DICTIONARY_PERSONAL_SUFFIX, ADDRESS_NAME | ADDRESS_STREET}}, {"personal_titles", {DICTIONARY_PERSONAL_TITLE, ADDRESS_NAME | ADDRESS_STREET}}, {"place_names", {DICTIONARY_PLACE_NAME, ADDRESS_NAME | ADDRESS_STREET}}, {"post_office", {DICTIONARY_POST_OFFICE, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}}, {"qualifiers", {DICTIONARY_QUALIFIER, ADDRESS_STREET}}, {"stopwords", {DICTIONARY_STOPWORD, ADDRESS_ANY}}, {"street_types", {DICTIONARY_STREET_TYPE, ADDRESS_STREET}}, {"surnames", {DICTIONARY_SURNAME, ADDRESS_STREET | ADDRESS_NAME}}, {"synonyms", {DICTIONARY_SYNONYM, ADDRESS_ANY}}, {"toponyms", {DICTIONARY_TOPONYM, ADDRESS_LOCALITY | ADDRESS_ADMIN1 | ADDRESS_ADMIN2 | ADDRESS_ADMIN3 | ADDRESS_ADMIN4 | ADDRESS_ADMIN_OTHER | ADDRESS_NEIGHBORHOOD}}, {"unit_types", {DICTIONARY_UNIT, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}} }; VECTOR_INIT(gazetteer_array, gazetteer_t) #define NUM_DICTIONARY_TYPES sizeof(gazetteers) / sizeof(gazetteer_t) #ifdef __cplusplus } #endif #endif