[expand] adding a normalization for a single non-acronym internal period where there's an expansion at the prefix/suffix (for #218 and https://github.com/openvenues/libpostal/issues/216#issuecomment-306617824). Helps in cases like "St.Michaels" or "Jln.Utara" without needing to specify concatenated prefix phrases for every possibility
This commit is contained in:
@@ -160,6 +160,12 @@ bool libpostal_setup_parser(void);
|
||||
bool libpostal_setup_parser_datadir(char *datadir);
|
||||
void libpostal_teardown_parser(void);
|
||||
|
||||
bool libpostal_setup_language_classifier(void);
|
||||
bool libpostal_setup_language_classifier_datadir(char *datadir);
|
||||
void libpostal_teardown_language_classifier(void);
|
||||
|
||||
/* Tokenization and token normalization APIs */
|
||||
|
||||
typedef struct libpostal_token {
|
||||
size_t offset;
|
||||
size_t len;
|
||||
@@ -190,6 +196,7 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n);
|
||||
/* Bit flag (bit 6); judging by its name, one of the token-normalization
 * options. The replacement list is parenthesized so the macro expands to a
 * single operand: without the parentheses, `FLAG * 2` would parse as
 * `1 << (6 * 2)` and `~FLAG` as `(~1) << 6`. */
#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC (1 << 6)
|
||||
/* Bit flag (bit 7); judging by its name, one of the token-normalization
 * options. The replacement list is parenthesized so the macro expands to a
 * single operand: without the parentheses, `FLAG * 2` would parse as
 * `1 << (7 * 2)` and `~FLAG` as `(~1) << 7`. */
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS (1 << 7)
|
||||
/* Bit flag (bit 8); judging by its name, one of the token-normalization
 * options. The replacement list is parenthesized so the macro expands to a
 * single operand: without the parentheses, `FLAG * 2` would parse as
 * `1 << (8 * 2)` and `~FLAG` as `(~1) << 8`. */
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS (1 << 8)
|
||||
/* Bit flag (bit 9); judging by its name, one of the token-normalization
 * options. The replacement list is parenthesized so the macro expands to a
 * single operand: without the parentheses, `FLAG * 2` would parse as
 * `1 << (9 * 2)` and `~FLAG` as `(~1) << 9`. */
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS (1 << 9)
|
||||
|
||||
/* Default string-option mask: the bitwise OR of the LATIN_ASCII, COMPOSE,
 * TRIM, REPLACE_HYPHENS, STRIP_ACCENTS and LOWERCASE
 * LIBPOSTAL_NORMALIZE_STRING_* flags. The whole expansion is parenthesized
 * so it is safe to combine with other operators.
 * NOTE(review): presumably intended as the string_options argument of
 * libpostal_normalized_tokens() — confirm against callers. */
#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)
|
||||
|
||||
@@ -209,10 +216,6 @@ typedef struct libpostal_normalized_token {
|
||||
|
||||
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
|
||||
|
||||
bool libpostal_setup_language_classifier(void);
|
||||
bool libpostal_setup_language_classifier_datadir(char *datadir);
|
||||
void libpostal_teardown_language_classifier(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user