[normalize/api] expose libpostal_normalize_string_languages and libpostal_normalized_tokens_languages in the public API, so that numeric expressions can be pre-normalized for a given set of languages at tokenization time
This commit is contained in:
@@ -334,19 +334,24 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) {
|
|||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *libpostal_normalize_string(char *str, uint64_t options) {
|
|
||||||
|
char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
|
||||||
if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) {
|
if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) {
|
||||||
return normalize_string_latin(str, strlen(str), options);
|
return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages);
|
||||||
} else {
|
} else {
|
||||||
return normalize_string_utf8(str, options);
|
return normalize_string_utf8_languages(str, options, num_languages, languages);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) {
|
inline char *libpostal_normalize_string(char *str, uint64_t options) {
|
||||||
|
return libpostal_normalize_string_languages(str, options, 0, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) {
|
||||||
if (input == NULL) {
|
if (input == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
char *normalized = libpostal_normalize_string(input, string_options);
|
char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages);
|
||||||
if (normalized == NULL) {
|
if (normalized == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@@ -385,6 +390,11 @@ libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) {
|
||||||
|
return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
bool libpostal_setup_language_classifier(void) {
|
bool libpostal_setup_language_classifier(void) {
|
||||||
return libpostal_setup_language_classifier_datadir(NULL);
|
return libpostal_setup_language_classifier_datadir(NULL);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -304,6 +304,7 @@ LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitesp
|
|||||||
|
|
||||||
#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
|
#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
|
||||||
|
|
||||||
|
LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages);
|
||||||
LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options);
|
LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options);
|
||||||
|
|
||||||
|
|
||||||
@@ -312,7 +313,9 @@ typedef struct libpostal_normalized_token {
|
|||||||
libpostal_token_t token;
|
libpostal_token_t token;
|
||||||
} libpostal_normalized_token_t;
|
} libpostal_normalized_token_t;
|
||||||
|
|
||||||
libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
|
LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
|
||||||
|
LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,7 +68,10 @@ As well as normalizations for individual string tokens:
|
|||||||
|
|
||||||
char *normalize_string_utf8(char *str, uint64_t options);
|
char *normalize_string_utf8(char *str, uint64_t options);
|
||||||
|
|
||||||
|
char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages);
|
||||||
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
||||||
|
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages);
|
||||||
|
|
||||||
|
|
||||||
// Takes NORMALIZE_TOKEN_* options
|
// Takes NORMALIZE_TOKEN_* options
|
||||||
void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options);
|
void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options);
|
||||||
|
|||||||
Reference in New Issue
Block a user