From c5bb9d8daa489e289ea0bd74933b3a5c3dd1baf3 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 22 Feb 2018 18:47:36 -0500 Subject: [PATCH] [normalize/api] exposing normalize_string_languages and normalized_tokens_languages to the API for pre-normalizing numeric expressions at tokenization time --- src/libpostal.c | 20 +++++++++++++++----- src/libpostal.h | 5 ++++- src/normalize.h | 3 +++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index 288f42c9..d56e7000 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -334,19 +334,24 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) { return a; } -char *libpostal_normalize_string(char *str, uint64_t options) { + +char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) { - return normalize_string_latin(str, strlen(str), options); + return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages); } else { - return normalize_string_utf8(str, options); + return normalize_string_utf8_languages(str, options, num_languages, languages); } } -libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { +inline char *libpostal_normalize_string(char *str, uint64_t options) { + return libpostal_normalize_string_languages(str, options, 0, NULL); +} + +libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) { if (input == NULL) { return NULL; } - char *normalized = libpostal_normalize_string(input, string_options); + char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages); if (normalized == NULL) { return NULL; } @@ -385,6 +390,11 @@ libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t return result; } +inline libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { + return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n); +} + + bool libpostal_setup_language_classifier(void) { return libpostal_setup_language_classifier_datadir(NULL); } diff --git a/src/libpostal.h b/src/libpostal.h index 5f253566..89b9a4c6 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -304,6 +304,7 @@ LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitesp #define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) +LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages); LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options); @@ -312,7 +313,9 @@ typedef struct libpostal_normalized_token { libpostal_token_t token; } libpostal_normalized_token_t; -libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); +LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); +LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n); + #ifdef __cplusplus } diff --git a/src/normalize.h b/src/normalize.h index 9d58f78b..5d10b2ad 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -68,7 +68,10 @@ As well as normalizations for individual string tokens: char *normalize_string_utf8(char *str, uint64_t options); +char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages); char *normalize_string_latin(char *str, size_t len, uint64_t options); +char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages); + // Takes NORMALIZE_TOKEN_* options void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options);