Files
libpostal/src/gazetteers.h

126 lines
4.3 KiB
C

#ifndef GAZETTEERS_H
#define GAZETTEERS_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "klib/kvec.h"
#include "vector.h"
// Address-component flags. They are combined with bitwise OR into a bit set
// that fits in a uint16_t (the highest flag uses bit 15). Bits 5 and 6 are
// currently unassigned.
//
// Every expansion is parenthesized so that a flag keeps its value when it is
// written next to an operator that binds tighter than <<, such as + or -.
#define ADDRESS_ANY (1 << 0)
#define ADDRESS_NAME (1 << 1)
#define ADDRESS_HOUSE_NUMBER (1 << 2)
#define ADDRESS_STREET (1 << 3)
#define ADDRESS_UNIT (1 << 4)
#define ADDRESS_LOCALITY (1 << 7)
#define ADDRESS_ADMIN1 (1 << 8)
#define ADDRESS_ADMIN2 (1 << 9)
#define ADDRESS_ADMIN3 (1 << 10)
#define ADDRESS_ADMIN4 (1 << 11)
#define ADDRESS_ADMIN_OTHER (1 << 12)
#define ADDRESS_COUNTRY (1 << 13)
#define ADDRESS_POSTAL_CODE (1 << 14)
#define ADDRESS_NEIGHBORHOOD (1 << 15)
// Identifies the kind of dictionary a gazetteer entry belongs to.
// Every value is assigned explicitly; the numbering leaves gaps between
// groups (1-4, 10-22, 50-56, 60-61, 100-109).
typedef enum dictionary_type {
    // 1-4: generic dictionaries
    DICTIONARY_ANY                             = 1,
    DICTIONARY_SYNONYM                         = 2,
    DICTIONARY_STOPWORD                        = 3,
    DICTIONARY_ELISION                         = 4,

    // 10-22: street, building and unit level dictionaries
    DICTIONARY_STREET_NAME                     = 10,
    DICTIONARY_STREET_TYPE                     = 11,
    DICTIONARY_CONCATENATED_PREFIX_SEPARABLE   = 12,
    DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE   = 13,
    DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE = 14,
    DICTIONARY_DIRECTIONAL                     = 15,
    DICTIONARY_QUALIFIER                       = 16,
    DICTIONARY_BUILDING_TYPE                   = 17,
    DICTIONARY_LEVEL                           = 18,
    DICTIONARY_UNIT                            = 19,
    DICTIONARY_POST_OFFICE                     = 20,
    DICTIONARY_NO_ADDRESS                      = 21,
    DICTIONARY_NULL                            = 22,

    // 50-56: place, company and personal name dictionaries
    DICTIONARY_PLACE_NAME                      = 50,
    DICTIONARY_COMPANY_TYPE                    = 51,
    DICTIONARY_GIVEN_NAME                      = 52,
    DICTIONARY_SURNAME                         = 53,
    DICTIONARY_PERSONAL_TITLE                  = 54,
    DICTIONARY_PERSONAL_SUFFIX                 = 55,
    DICTIONARY_ACADEMIC_DEGREE                 = 56,

    // 60-61: named people and organizations
    DICTIONARY_NAMED_PERSON                    = 60,
    DICTIONARY_NAMED_ORGANIZATION              = 61,

    // 100-109: localities, administrative areas and other geographic names
    DICTIONARY_LOCALITY                        = 100,
    DICTIONARY_ADMIN1                          = 101,
    DICTIONARY_ADMIN2                          = 102,
    DICTIONARY_ADMIN3                          = 103,
    DICTIONARY_ADMIN4                          = 104,
    DICTIONARY_ADMIN_OTHER                     = 105,
    DICTIONARY_NEIGHBORHOOD                    = 106,
    DICTIONARY_POSTAL_CODE                     = 107,
    DICTIONARY_COUNTRY                         = 108,
    DICTIONARY_TOPONYM                         = 109
} dictionary_type_t;
// Describes one gazetteer: the dictionary it represents and the address
// components it applies to.
struct gazetteer {
    // Kind of dictionary.
    dictionary_type_t type;
    // Bit set built by OR-ing ADDRESS_* flags.
    uint16_t address_components;
};
typedef struct gazetteer gazetteer_t;
// Associates a name string (e.g. "street_types") with the gazetteer_t that
// describes it.
typedef struct named_gazetteer {
    // Name stored inline; the buffer holds at most 63 characters plus the
    // terminating NUL.
    char name[64];
    // Dictionary type and address-component flags for this name.
    gazetteer_t gazetteer;
} named_gazetteer_t;
// Only need these for the in-memory dictionaries
named_gazetteer_t gazetteer_config[] = {
{"academic_degrees", {DICTIONARY_ACADEMIC_DEGREE, ADDRESS_NAME}},
{"building_types", {DICTIONARY_BUILDING_TYPE, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}},
{"company_types", {DICTIONARY_COMPANY_TYPE, ADDRESS_NAME}},
{"concatenated_prefixes_inseparable", {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET}},
{"concatenated_suffixes_inseparable", {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET}},
{"concatenated_suffixes_separable", {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET}},
{"directionals", {DICTIONARY_DIRECTIONAL, ADDRESS_ANY}},
{"elisions", {DICTIONARY_ELISION, ADDRESS_ANY}},
{"given_names", {DICTIONARY_GIVEN_NAME, ADDRESS_STREET | ADDRESS_NAME}},
{"level_types", {DICTIONARY_LEVEL, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT}},
{"no_number", {DICTIONARY_NO_ADDRESS, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}},
{"nulls", {DICTIONARY_NULL, ADDRESS_ANY}},
{"organizations", {DICTIONARY_NAMED_ORGANIZATION, ADDRESS_NAME}},
{"people", {DICTIONARY_NAMED_PERSON, ADDRESS_NAME | ADDRESS_STREET}},
{"personal_suffixes", {DICTIONARY_PERSONAL_SUFFIX, ADDRESS_NAME | ADDRESS_STREET}},
{"personal_titles", {DICTIONARY_PERSONAL_TITLE, ADDRESS_NAME | ADDRESS_STREET}},
{"place_names", {DICTIONARY_PLACE_NAME, ADDRESS_NAME | ADDRESS_STREET}},
{"post_office", {DICTIONARY_POST_OFFICE, ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}},
{"qualifiers", {DICTIONARY_QUALIFIER, ADDRESS_STREET}},
{"stopwords", {DICTIONARY_STOPWORD, ADDRESS_ANY}},
{"street_types", {DICTIONARY_STREET_TYPE, ADDRESS_STREET}},
{"surnames", {DICTIONARY_SURNAME, ADDRESS_STREET | ADDRESS_NAME}},
{"synonyms", {DICTIONARY_SYNONYM, ADDRESS_ANY}},
{"toponyms", {DICTIONARY_TOPONYM, ADDRESS_LOCALITY | ADDRESS_ADMIN1 | ADDRESS_ADMIN2 | ADDRESS_ADMIN3 | ADDRESS_ADMIN4 | ADDRESS_ADMIN_OTHER | ADDRESS_NEIGHBORHOOD}},
{"unit_types", {DICTIONARY_UNIT, ADDRESS_NAME | ADDRESS_HOUSE_NUMBER | ADDRESS_STREET}}
};
// Invokes VECTOR_INIT with the name gazetteer_array and element type
// gazetteer_t. The macro is defined outside this file (presumably in
// "vector.h" — confirm); what it generates is not visible here.
VECTOR_INIT(gazetteer_array, gazetteer_t)
// Number of entries in gazetteer_config. Only valid where gazetteer_config is
// visible as an array (not a pointer). Dividing by the size of the first
// element keeps the count correct regardless of the element type, and the
// outer parentheses make the macro safe inside larger expressions.
#define NUM_DICTIONARY_TYPES (sizeof(gazetteer_config) / sizeof(gazetteer_config[0]))
#ifdef __cplusplus
}
#endif
#endif