[normalize/api] exposing normalize_string_languages and normalized_tokens_languages to the API for pre-normalizing numeric expressions at tokenization time

This commit is contained in:
Al
2018-02-22 18:47:36 -05:00
parent 0edb897143
commit c5bb9d8daa
3 changed files with 22 additions and 6 deletions

View File

@@ -334,19 +334,24 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) {
return a; return a;
} }
char *libpostal_normalize_string(char *str, uint64_t options) {
char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) { if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) {
return normalize_string_latin(str, strlen(str), options); return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages);
} else { } else {
return normalize_string_utf8(str, options); return normalize_string_utf8_languages(str, options, num_languages, languages);
} }
} }
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { inline char *libpostal_normalize_string(char *str, uint64_t options) {
return libpostal_normalize_string_languages(str, options, 0, NULL);
}
libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) {
if (input == NULL) { if (input == NULL) {
return NULL; return NULL;
} }
char *normalized = libpostal_normalize_string(input, string_options); char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages);
if (normalized == NULL) { if (normalized == NULL) {
return NULL; return NULL;
} }
@@ -385,6 +390,11 @@ libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t
return result; return result;
} }
/*
 * Language-agnostic convenience wrapper around
 * libpostal_normalized_tokens_languages(): forwards every argument unchanged
 * and supplies an empty language list (num_languages = 0, languages = NULL).
 *
 * Returns whatever libpostal_normalized_tokens_languages() returns for the
 * same input and options.
 * NOTE(review): `n` is presumably an out-parameter that receives the number
 * of returned tokens -- confirm against the callee.
 *
 * Deliberately NOT declared `inline`: this is an exported API entry point,
 * and a plain C99/C11 `inline` definition only produces an external symbol
 * when a non-inline declaration happens to be visible in the same
 * translation unit. An ordinary external definition always links.
 */
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) {
    return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n);
}
bool libpostal_setup_language_classifier(void) { bool libpostal_setup_language_classifier(void) {
return libpostal_setup_language_classifier_datadir(NULL); return libpostal_setup_language_classifier_datadir(NULL);
} }

View File

@@ -304,6 +304,7 @@ LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitesp
#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) #define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages);
LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options); LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options);
@@ -312,7 +313,9 @@ typedef struct libpostal_normalized_token {
libpostal_token_t token; libpostal_token_t token;
} libpostal_normalized_token_t; } libpostal_normalized_token_t;
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -68,7 +68,10 @@ As well as normalizations for individual string tokens:
char *normalize_string_utf8(char *str, uint64_t options); char *normalize_string_utf8(char *str, uint64_t options);
char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages);
char *normalize_string_latin(char *str, size_t len, uint64_t options); char *normalize_string_latin(char *str, size_t len, uint64_t options);
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages);
// Takes NORMALIZE_TOKEN_* options // Takes NORMALIZE_TOKEN_* options
void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options); void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options);