Merge pull request #184 from openvenues/remove_ordinal_suffix
Remove ordinal suffixes in libpostal_expand_address
This commit is contained in:
@@ -11,7 +11,7 @@ env:
|
|||||||
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
|
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
|
||||||
- GH_REF=github.com/openvenues/libpostal
|
- GH_REF=github.com/openvenues/libpostal
|
||||||
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/*.txt" | wc -l)
|
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/*.txt" | wc -l)
|
||||||
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex" | wc -l)
|
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep -E "(resources/numex|src/numex_table_builder.c)|" | wc -l)
|
||||||
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
||||||
compiler:
|
compiler:
|
||||||
- clang
|
- clang
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include
|
|||||||
CFLAGS =
|
CFLAGS =
|
||||||
|
|
||||||
lib_LTLIBRARIES = libpostal.la
|
lib_LTLIBRARIES = libpostal.la
|
||||||
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c normalize.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
||||||
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
|
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
|
||||||
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
||||||
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@
|
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@
|
||||||
@@ -34,7 +34,7 @@ libpostal_CFLAGS = $(CFLAGS_O3)
|
|||||||
bench_SOURCES = bench.c
|
bench_SOURCES = bench.c
|
||||||
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
|
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
|
||||||
bench_CFLAGS = $(CFLAGS_O3)
|
bench_CFLAGS = $(CFLAGS_O3)
|
||||||
address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
|
address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
|
||||||
address_parser_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
address_parser_CFLAGS = $(CFLAGS_O3)
|
address_parser_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
@@ -44,21 +44,21 @@ build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_ut
|
|||||||
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
||||||
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
||||||
build_trans_table_CFLAGS = $(CFLAGS_O3)
|
build_trans_table_CFLAGS = $(CFLAGS_O3)
|
||||||
address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
|
address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
|
||||||
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
address_parser_train_CFLAGS = $(CFLAGS_O3)
|
address_parser_train_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
|
address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
|
||||||
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
address_parser_test_CFLAGS = $(CFLAGS_O3)
|
address_parser_test_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
|
language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
|
||||||
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_train_CFLAGS = $(CFLAGS_O3)
|
language_classifier_train_CFLAGS = $(CFLAGS_O3)
|
||||||
language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
||||||
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_CFLAGS = $(CFLAGS_O3)
|
language_classifier_CFLAGS = $(CFLAGS_O3)
|
||||||
language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
||||||
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_test_CFLAGS = $(CFLAGS_O3)
|
language_classifier_test_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ static inline uint64_t get_normalize_string_options(libpostal_normalize_options_
|
|||||||
normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0;
|
normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0;
|
||||||
normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0;
|
normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0;
|
||||||
normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0;
|
normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0;
|
||||||
|
normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0;
|
||||||
|
|
||||||
return normalize_string_options;
|
return normalize_string_options;
|
||||||
}
|
}
|
||||||
@@ -558,7 +559,6 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok
|
|||||||
address_expansion_t prefix_expansion;
|
address_expansion_t prefix_expansion;
|
||||||
address_expansion_t suffix_expansion;
|
address_expansion_t suffix_expansion;
|
||||||
|
|
||||||
char_array *key = char_array_new_size(token.len);
|
|
||||||
char *expansion;
|
char *expansion;
|
||||||
|
|
||||||
size_t num_strings = 0;
|
size_t num_strings = 0;
|
||||||
@@ -582,10 +582,11 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!have_suffix && !have_prefix) {
|
if (!have_suffix && !have_prefix) {
|
||||||
char_array_destroy(key);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char_array *key = char_array_new_size(token.len);
|
||||||
|
|
||||||
if (have_prefix && have_suffix) {
|
if (have_prefix && have_suffix) {
|
||||||
for (size_t i = 0; i < prefix_expansions->n; i++) {
|
for (size_t i = 0; i < prefix_expansions->n; i++) {
|
||||||
prefix_expansion = prefix_expansions->a[i];
|
prefix_expansion = prefix_expansions->a[i];
|
||||||
@@ -760,16 +761,34 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
|
|||||||
|
|
||||||
if ((suffix.len == 0 && prefix.len == 0)) return false;
|
if ((suffix.len == 0 && prefix.len == 0)) return false;
|
||||||
|
|
||||||
|
|
||||||
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
|
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
|
||||||
|
size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang);
|
||||||
|
|
||||||
|
if (len_ordinal_suffix == 0) return false;
|
||||||
|
|
||||||
|
cstring_array *strings = tree->strings;
|
||||||
|
// Add the original form first. When this function returns true,
|
||||||
|
// add_normalized_strings_token won't be called a second time.
|
||||||
|
add_normalized_strings_token(strings, str, token, options);
|
||||||
|
|
||||||
|
char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1);
|
||||||
|
char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix);
|
||||||
|
char *expansion = char_array_get_string(key);
|
||||||
|
cstring_array_add_string(strings, expansion);
|
||||||
|
char_array_destroy(key);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
|
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
|
||||||
cstring_array *strings = tree->strings;
|
cstring_array *strings = tree->strings;
|
||||||
|
|
||||||
for (size_t i = 0; i < tokens->n; i++) {
|
for (size_t i = 0; i < tokens->n; i++) {
|
||||||
token_t token = tokens->a[i];
|
token_t token = tokens->a[i];
|
||||||
bool have_phrase = false;
|
bool have_phrase = false;
|
||||||
|
bool have_ordinal = false;
|
||||||
|
|
||||||
if (is_special_token(token.type)) {
|
if (is_special_token(token.type)) {
|
||||||
string_tree_add_string_len(tree, str + token.offset, token.len);
|
string_tree_add_string_len(tree, str + token.offset, token.len);
|
||||||
@@ -783,9 +802,14 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
|
|||||||
have_phrase = true;
|
have_phrase = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (normalize_ordinal_suffixes(tree, str, lang, token, options)) {
|
||||||
|
have_ordinal = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!have_phrase) {
|
if (!have_phrase && !have_ordinal) {
|
||||||
add_normalized_strings_token(strings, str, token, options);
|
add_normalized_strings_token(strings, str, token, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -827,44 +851,17 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
|
|||||||
|
|
||||||
char *tokenized_str = char_array_get_string(temp_string);
|
char *tokenized_str = char_array_get_string(temp_string);
|
||||||
|
|
||||||
char *new_str = tokenized_str;
|
|
||||||
char *last_numex_str = NULL;
|
|
||||||
if (options.expand_numex) {
|
|
||||||
char *numex_replaced = NULL;
|
|
||||||
for (size_t i = 0; i < options.num_languages; i++) {
|
|
||||||
lang = options.languages[i];
|
|
||||||
|
|
||||||
numex_replaced = replace_numeric_expressions(new_str, lang);
|
|
||||||
if (numex_replaced != NULL) {
|
|
||||||
new_str = numex_replaced;
|
|
||||||
|
|
||||||
if (last_numex_str != NULL) {
|
|
||||||
free(last_numex_str);
|
|
||||||
}
|
|
||||||
last_numex_str = numex_replaced;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
string_tree_t *alternatives;
|
string_tree_t *alternatives;
|
||||||
|
|
||||||
int ret;
|
int ret;
|
||||||
log_debug("new_str=%s\n", new_str);
|
|
||||||
|
|
||||||
log_debug("Adding alternatives for single normalization\n");
|
log_debug("Adding alternatives for single normalization\n");
|
||||||
alternatives = add_string_alternatives(new_str, options);
|
alternatives = add_string_alternatives(tokenized_str, options);
|
||||||
|
|
||||||
if (last_numex_str != NULL) {
|
|
||||||
free(last_numex_str);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (alternatives == NULL) {
|
if (alternatives == NULL) {
|
||||||
log_debug("alternatives = NULL\n");
|
log_debug("alternatives = NULL\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
iter = string_tree_iterator_new(alternatives);
|
iter = string_tree_iterator_new(alternatives);
|
||||||
log_debug("iter->num_tokens=%d\n", iter->num_tokens);
|
log_debug("iter->num_tokens=%d\n", iter->num_tokens);
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,24 @@
|
|||||||
#define FULL_STOP_CODEPOINT 0x002e
|
#define FULL_STOP_CODEPOINT 0x002e
|
||||||
#define APOSTROPHE_CODEPOINT 0x0027
|
#define APOSTROPHE_CODEPOINT 0x0027
|
||||||
|
|
||||||
|
char *normalize_replace_numex(char *str, size_t num_languages, char **languages) {
|
||||||
|
char *numex_normalized = NULL;
|
||||||
|
|
||||||
char *normalize_string_utf8(char *str, uint64_t options) {
|
for (size_t i = 0; i < num_languages; i++) {
|
||||||
|
char *lang = languages[i];
|
||||||
|
char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang);
|
||||||
|
if (numex_replaced != NULL) {
|
||||||
|
if (numex_normalized != NULL) {
|
||||||
|
free(numex_normalized);
|
||||||
|
}
|
||||||
|
numex_normalized = numex_replaced;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return numex_normalized;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
|
||||||
int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
|
int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
|
||||||
uint8_t *utf8proc_normalized = NULL;
|
uint8_t *utf8proc_normalized = NULL;
|
||||||
|
|
||||||
@@ -63,7 +79,7 @@ char *normalize_string_utf8(char *str, uint64_t options) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) {
|
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && string_contains_hyphen(str)) {
|
||||||
char *replaced = string_replace_char(str, '-', ' ');
|
char *replaced = string_replace_char(str, '-', ' ');
|
||||||
if (replaced != NULL) {
|
if (replaced != NULL) {
|
||||||
if (normalized_allocated) {
|
if (normalized_allocated) {
|
||||||
@@ -76,11 +92,28 @@ char *normalize_string_utf8(char *str, uint64_t options) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) {
|
||||||
|
char *numex_normalized = normalize_replace_numex(str, num_languages, languages);
|
||||||
|
if (numex_normalized != NULL) {
|
||||||
|
if (normalized_allocated) {
|
||||||
|
free(normalized);
|
||||||
|
}
|
||||||
|
normalized = numex_normalized;
|
||||||
|
str = normalized;
|
||||||
|
normalized_allocated = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
return normalized;
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char *normalize_string_utf8(char *str, uint64_t options) {
|
||||||
|
return normalize_string_utf8_languages(str, options, 0, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
|
|
||||||
|
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
|
||||||
char *latin_transliterator = LATIN_ASCII;
|
char *latin_transliterator = LATIN_ASCII;
|
||||||
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
|
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
|
||||||
latin_transliterator = LATIN_ASCII_SIMPLE;
|
latin_transliterator = LATIN_ASCII_SIMPLE;
|
||||||
@@ -90,9 +123,9 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
|
|||||||
|
|
||||||
char *utf8_normalized;
|
char *utf8_normalized;
|
||||||
if (transliterated == NULL) {
|
if (transliterated == NULL) {
|
||||||
utf8_normalized = normalize_string_utf8(str, options);
|
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
|
||||||
} else {
|
} else {
|
||||||
utf8_normalized = normalize_string_utf8(transliterated, options);
|
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
|
||||||
free(transliterated);
|
free(transliterated);
|
||||||
transliterated = NULL;
|
transliterated = NULL;
|
||||||
}
|
}
|
||||||
@@ -100,7 +133,11 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
|
|||||||
return utf8_normalized;
|
return utf8_normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
|
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
|
||||||
|
return normalize_string_latin_languages(str, len, options, 0, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
|
||||||
|
|
||||||
char *transliterated = NULL;
|
char *transliterated = NULL;
|
||||||
char *utf8_normalized = NULL;
|
char *utf8_normalized = NULL;
|
||||||
@@ -114,7 +151,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
|
|||||||
if (options & NORMALIZE_STRING_LATIN_ASCII) {
|
if (options & NORMALIZE_STRING_LATIN_ASCII) {
|
||||||
transliterated = transliterate(latin_transliterator, str, len);
|
transliterated = transliterate(latin_transliterator, str, len);
|
||||||
if (transliterated != NULL) {
|
if (transliterated != NULL) {
|
||||||
utf8_normalized = normalize_string_utf8(transliterated, options);
|
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
|
||||||
free(transliterated);
|
free(transliterated);
|
||||||
transliterated = NULL;
|
transliterated = NULL;
|
||||||
}
|
}
|
||||||
@@ -127,7 +164,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *str_copy = strndup(str, len);
|
char *str_copy = strndup(str, len);
|
||||||
utf8_normalized = normalize_string_utf8(str_copy, options);
|
utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages);
|
||||||
free(str_copy);
|
free(str_copy);
|
||||||
|
|
||||||
if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
|
if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
|
||||||
@@ -150,9 +187,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
|
|||||||
if (prev_string != NULL) {
|
if (prev_string != NULL) {
|
||||||
free(prev_string);
|
free(prev_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
|
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
|
||||||
size_t len = strlen(str);
|
size_t len = strlen(str);
|
||||||
string_tree_t *tree = string_tree_new_size(len);
|
string_tree_t *tree = string_tree_new_size(len);
|
||||||
@@ -161,15 +198,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
|
|
||||||
khash_t(int_set) *scripts = kh_init(int_set);
|
khash_t(int_set) *scripts = kh_init(int_set);
|
||||||
char *utf8_normalized = NULL;
|
char *utf8_normalized = NULL;
|
||||||
|
char *numex_replaced = NULL;
|
||||||
char *ptr = str;
|
|
||||||
|
|
||||||
script_t script;
|
script_t script;
|
||||||
|
|
||||||
char *trans_name = NULL;
|
char *trans_name = NULL;
|
||||||
char *lang;
|
char *lang;
|
||||||
|
|
||||||
bool transliterate_latin = false;
|
char *ptr = str;
|
||||||
|
|
||||||
|
bool have_latin_transliterator = false;
|
||||||
while (consumed < len) {
|
while (consumed < len) {
|
||||||
string_script_t script_span = get_string_script(ptr, len - consumed);
|
string_script_t script_span = get_string_script(ptr, len - consumed);
|
||||||
script = script_span.script;
|
script = script_span.script;
|
||||||
@@ -182,12 +220,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
if (html_escaped != NULL) {
|
if (html_escaped != NULL) {
|
||||||
str = html_escaped;
|
str = html_escaped;
|
||||||
}
|
}
|
||||||
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
|
|
||||||
|
options ^= NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII;
|
||||||
|
|
||||||
|
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
|
||||||
if (utf8_normalized != NULL) {
|
if (utf8_normalized != NULL) {
|
||||||
if (html_escaped != NULL) {
|
if (html_escaped != NULL) {
|
||||||
free(html_escaped);
|
free(html_escaped);
|
||||||
html_escaped = NULL;
|
html_escaped = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
string_tree_add_string(tree, utf8_normalized);
|
string_tree_add_string(tree, utf8_normalized);
|
||||||
string_tree_finalize_token(tree);
|
string_tree_finalize_token(tree);
|
||||||
free(utf8_normalized);
|
free(utf8_normalized);
|
||||||
@@ -200,22 +242,22 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
|
|
||||||
log_debug("script_len=%zu\n", script_len);
|
log_debug("script_len=%zu\n", script_len);
|
||||||
|
|
||||||
if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) {
|
if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) {
|
||||||
for (size_t i = 0; i < num_languages; i++) {
|
for (size_t i = 0; i < num_languages; i++) {
|
||||||
lang = languages[i];
|
lang = languages[i];
|
||||||
foreach_transliterator(script, lang, trans_name, {
|
foreach_transliterator(script, lang, trans_name, {
|
||||||
if (!string_equals(trans_name, LATIN_ASCII)) {
|
if (!string_equals(trans_name, LATIN_ASCII)) {
|
||||||
transliterate_latin = true;
|
have_latin_transliterator = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
if (transliterate_latin) break;
|
if (have_latin_transliterator) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) {
|
if ((script != SCRIPT_LATIN || have_latin_transliterator) && script_len > 0) {
|
||||||
int ret;
|
int ret;
|
||||||
khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
|
khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
@@ -230,8 +272,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
ptr += script_len;
|
ptr += script_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!transliterate_latin) {
|
if (!have_latin_transliterator) {
|
||||||
add_latin_alternatives(tree, str, len, options);
|
add_latin_alternatives(tree, str, len, options, num_languages, languages);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t transliterate_scripts = kh_size(scripts);
|
size_t transliterate_scripts = kh_size(scripts);
|
||||||
@@ -276,7 +318,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
prev = transliterated;
|
prev = transliterated;
|
||||||
})
|
})
|
||||||
|
|
||||||
add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
|
add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages);
|
||||||
if (transliterated != str) {
|
if (transliterated != str) {
|
||||||
free(transliterated);
|
free(transliterated);
|
||||||
}
|
}
|
||||||
@@ -287,8 +329,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transliterate_latin) {
|
if (have_latin_transliterator) {
|
||||||
add_latin_alternatives(tree, str, len, options);
|
add_latin_alternatives(tree, str, len, options, num_languages, languages);
|
||||||
}
|
}
|
||||||
|
|
||||||
kh_destroy(int_set, scripts);
|
kh_destroy(int_set, scripts);
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ As well as normalizations for individual string tokens:
|
|||||||
#include "string_utils.h"
|
#include "string_utils.h"
|
||||||
#include "utf8proc/utf8proc.h"
|
#include "utf8proc/utf8proc.h"
|
||||||
#include "unicode_scripts.h"
|
#include "unicode_scripts.h"
|
||||||
|
#include "numex.h"
|
||||||
#include "transliterate.h"
|
#include "transliterate.h"
|
||||||
#include "trie.h"
|
#include "trie.h"
|
||||||
#include "tokens.h"
|
#include "tokens.h"
|
||||||
@@ -47,6 +48,7 @@ As well as normalizations for individual string tokens:
|
|||||||
#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6
|
#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6
|
||||||
#define NORMALIZE_STRING_COMPOSE 1 << 7
|
#define NORMALIZE_STRING_COMPOSE 1 << 7
|
||||||
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8
|
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8
|
||||||
|
#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9
|
||||||
|
|
||||||
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
||||||
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
||||||
|
|||||||
93
src/numex.c
93
src/numex.c
@@ -911,7 +911,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) {
|
static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, char *ns, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) {
|
||||||
numex_language_t *language = get_numex_language(lang);
|
numex_language_t *language = get_numex_language(lang);
|
||||||
|
|
||||||
if (language == NULL) {
|
if (language == NULL) {
|
||||||
@@ -926,7 +926,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan
|
|||||||
return NULL_PREFIX_RESULT;
|
return NULL_PREFIX_RESULT;
|
||||||
}
|
}
|
||||||
|
|
||||||
prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos);
|
prefix = trie_get_prefix_from_index(trie, ns, strlen(ns), prefix.node_id, prefix.tail_pos);
|
||||||
|
|
||||||
if (prefix.node_id == NULL_NODE_ID) {
|
if (prefix.node_id == NULL_NODE_ID) {
|
||||||
return NULL_PREFIX_RESULT;
|
return NULL_PREFIX_RESULT;
|
||||||
@@ -976,7 +976,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan
|
|||||||
return prefix;
|
return prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) {
|
static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, gender_t gender, grammatical_category_t category) {
|
||||||
if (numex_table == NULL) {
|
if (numex_table == NULL) {
|
||||||
log_error(NUMEX_SETUP_ERROR);
|
log_error(NUMEX_SETUP_ERROR);
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -988,13 +988,13 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool use_default_if_not_found = true;
|
bool use_default_if_not_found = true;
|
||||||
trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, result.gender, result.category, use_default_if_not_found);
|
trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_NAMESPACE_PREFIX, gender, category, use_default_if_not_found);
|
||||||
|
|
||||||
if (prefix.node_id == NULL_NODE_ID) {
|
if (prefix.node_id == NULL_NODE_ID) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, strlen(numeric_string), prefix.node_id);
|
phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, len, prefix.node_id);
|
||||||
|
|
||||||
if (phrase.len == 0) {
|
if (phrase.len == 0) {
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -1009,6 +1009,87 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Measure the run of leading ASCII digits in a token that may be a numeric
 * ordinal (digits followed by a non-digit tail).
 *
 * @param str  UTF-8 bytes of the token
 * @param len  number of bytes of str to examine
 * @return     byte length of the leading 0-9 run. Returns 0 when the token
 *             does not start with a digit, when a digit shows up again after
 *             a non-digit, or when an invalid code point is decoded.
 *
 * If decoding stops early (utf8proc_iterate reports an error or a NUL code
 * point is reached) the digit length counted so far is returned.
 */
static size_t possible_ordinal_digit_len(char *str, size_t len) {
    uint8_t *ptr = (uint8_t *)str;
    size_t idx = 0;
    size_t digit_len = 0;
    bool last_was_digit = false;

    while (idx < len) {
        int32_t ch = 0;

        // Offer the decoder only the bytes that remain: ptr has already
        // advanced by idx, so passing the full len would let it read past
        // the end of the caller's buffer.
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) break;
        if (ch == 0) break;
        if (!(utf8proc_codepoint_valid(ch))) return 0;

        // 0-9 only for this
        bool is_digit = ch >= '0' && ch <= '9';

        // Must start with a digit, and digits may not resume once a
        // non-digit has been seen.
        if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) {
            return 0;
        }

        if (is_digit) {
            digit_len += (size_t)char_len;
        }

        ptr += char_len;
        idx += (size_t)char_len;
        last_was_digit = is_digit;
    }

    return digit_len;
}
|
||||||
|
|
||||||
|
size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
|
||||||
|
if (str == NULL || len == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ordinal_digit_len = possible_ordinal_digit_len(str, len);
|
||||||
|
if (ordinal_digit_len == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numex_table == NULL) {
|
||||||
|
log_error(NUMEX_SETUP_ERROR);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
trie_t *trie = numex_table->trie;
|
||||||
|
if (trie == NULL) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool use_default_if_not_found = false;
|
||||||
|
|
||||||
|
// Default (GENDER_NONE and CATEGORY_DEFAULT) are at the end of the enums, so iterate backward
|
||||||
|
for (int gender = NUM_GENDERS - 1; gender >= 0; gender--) {
|
||||||
|
for (int category = NUM_CATEGORIES - 1; category >= 0; category--) {
|
||||||
|
trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_PHRASE_NAMESPACE_PREFIX, gender, category, use_default_if_not_found);
|
||||||
|
|
||||||
|
if (prefix.node_id == NULL_NODE_ID) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id);
|
||||||
|
|
||||||
|
if (phrase.len == len - ordinal_digit_len) {
|
||||||
|
return len - ordinal_digit_len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
char *replace_numeric_expressions(char *str, char *lang) {
|
char *replace_numeric_expressions(char *str, char *lang) {
|
||||||
numex_result_array *results = convert_numeric_expressions(str, lang);
|
numex_result_array *results = convert_numeric_expressions(str, lang);
|
||||||
if (results == NULL) return NULL;
|
if (results == NULL) return NULL;
|
||||||
@@ -1040,7 +1121,7 @@ char *replace_numeric_expressions(char *str, char *lang) {
|
|||||||
char_array_append(replacement, numeric_string);
|
char_array_append(replacement, numeric_string);
|
||||||
|
|
||||||
if (result.is_ordinal) {
|
if (result.is_ordinal) {
|
||||||
char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result);
|
char *ordinal_suffix = get_ordinal_suffix(numeric_string, strlen(numeric_string), lang, result.gender, result.category);
|
||||||
if (ordinal_suffix != NULL) {
|
if (ordinal_suffix != NULL) {
|
||||||
char_array_append(replacement, ordinal_suffix);
|
char_array_append(replacement, ordinal_suffix);
|
||||||
}
|
}
|
||||||
|
|||||||
12
src/numex.h
12
src/numex.h
@@ -34,7 +34,8 @@ typedef enum {
|
|||||||
GENDER_MASCULINE,
|
GENDER_MASCULINE,
|
||||||
GENDER_FEMININE,
|
GENDER_FEMININE,
|
||||||
GENDER_NEUTER,
|
GENDER_NEUTER,
|
||||||
GENDER_NONE
|
GENDER_NONE,
|
||||||
|
NUM_GENDERS
|
||||||
} gender_t;
|
} gender_t;
|
||||||
|
|
||||||
#define CATEGORY_PLURAL_PREFIX "p"
|
#define CATEGORY_PLURAL_PREFIX "p"
|
||||||
@@ -42,7 +43,8 @@ typedef enum {
|
|||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
CATEGORY_PLURAL,
|
CATEGORY_PLURAL,
|
||||||
CATEGORY_DEFAULT
|
CATEGORY_DEFAULT,
|
||||||
|
NUM_CATEGORIES
|
||||||
} grammatical_category_t;
|
} grammatical_category_t;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
@@ -85,10 +87,14 @@ typedef struct numex_rule {
|
|||||||
VECTOR_INIT(numex_rule_array, numex_rule_t)
|
VECTOR_INIT(numex_rule_array, numex_rule_t)
|
||||||
|
|
||||||
#define ORDINAL_NAMESPACE_CHAR "o"
|
#define ORDINAL_NAMESPACE_CHAR "o"
|
||||||
|
#define ORDINAL_PHRASE_NAMESPACE_CHAR "p"
|
||||||
|
|
||||||
#define ORDINAL_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR
|
#define ORDINAL_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR
|
||||||
#define ORDINAL_NAMESPACE_PREFIX_LEN strlen(ORDINAL_NAMESPACE_PREFIX)
|
#define ORDINAL_NAMESPACE_PREFIX_LEN strlen(ORDINAL_NAMESPACE_PREFIX)
|
||||||
|
|
||||||
|
#define ORDINAL_PHRASE_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_PHRASE_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR
|
||||||
|
#define ORDINAL_PHRASE_NAMESPACE_PREFIX_LEN strlen(ORDINAL_PHRASE_NAMESPACE_PREFIX)
|
||||||
|
|
||||||
typedef struct ordinal_indicator {
|
typedef struct ordinal_indicator {
|
||||||
char *key;
|
char *key;
|
||||||
gender_t gender;
|
gender_t gender;
|
||||||
@@ -142,7 +148,7 @@ VECTOR_INIT(numex_result_array, numex_result_t)
|
|||||||
|
|
||||||
char *replace_numeric_expressions(char *str, char *lang);
|
char *replace_numeric_expressions(char *str, char *lang);
|
||||||
numex_result_array *convert_numeric_expressions(char *str, char *lang);
|
numex_result_array *convert_numeric_expressions(char *str, char *lang);
|
||||||
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result);
|
size_t ordinal_suffix_len(char *s, size_t len, char *lang);
|
||||||
|
|
||||||
bool numex_table_write(FILE *file);
|
bool numex_table_write(FILE *file);
|
||||||
bool numex_table_save(char *filename);
|
bool numex_table_save(char *filename);
|
||||||
|
|||||||
@@ -92,72 +92,84 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) {
|
for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) {
|
||||||
value = numex_table->ordinal_indicators->n;
|
for (int ordinal_phrases = 0; ordinal_phrases <= 1; ordinal_phrases++) {
|
||||||
ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];
|
value = numex_table->ordinal_indicators->n;
|
||||||
|
ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];
|
||||||
|
|
||||||
if (ordinal_source.key == NULL) {
|
if (ordinal_source.key == NULL) {
|
||||||
log_error("ordinal source key was NULL at index %d\n", j);
|
log_error("ordinal source key was NULL at index %d\n", j);
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *ordinal_indicator_key = strdup(ordinal_source.key);
|
char *ordinal_indicator_key = strdup(ordinal_source.key);
|
||||||
if (ordinal_indicator_key == NULL) {
|
if (ordinal_indicator_key == NULL) {
|
||||||
log_error("Error in strdup\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
char *suffix = NULL;
|
|
||||||
if (ordinal_source.suffix != NULL) {
|
|
||||||
suffix = strdup(ordinal_source.suffix);
|
|
||||||
if (suffix == NULL) {
|
|
||||||
log_error("Error in strdup\n");
|
log_error("Error in strdup\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix);
|
|
||||||
ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
|
|
||||||
|
|
||||||
char_array_clear(key);
|
char *suffix = NULL;
|
||||||
char_array_cat(key, lang);
|
if (ordinal_source.suffix != NULL) {
|
||||||
char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);
|
suffix = strdup(ordinal_source.suffix);
|
||||||
|
if (suffix == NULL) {
|
||||||
|
log_error("Error in strdup\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
switch (ordinal_source.gender) {
|
char_array_clear(key);
|
||||||
case GENDER_MASCULINE:
|
char_array_cat(key, lang);
|
||||||
char_array_cat(key, GENDER_MASCULINE_PREFIX);
|
|
||||||
break;
|
|
||||||
case GENDER_FEMININE:
|
|
||||||
char_array_cat(key, GENDER_FEMININE_PREFIX);
|
|
||||||
break;
|
|
||||||
case GENDER_NEUTER:
|
|
||||||
char_array_cat(key, GENDER_NEUTER_PREFIX);
|
|
||||||
break;
|
|
||||||
case GENDER_NONE:
|
|
||||||
default:
|
|
||||||
char_array_cat(key, GENDER_NONE_PREFIX);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (ordinal_source.category) {
|
if (!ordinal_phrases) {
|
||||||
case CATEGORY_PLURAL:
|
ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix);
|
||||||
char_array_cat(key, CATEGORY_PLURAL_PREFIX);
|
ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
|
||||||
break;
|
|
||||||
case CATEGORY_DEFAULT:
|
|
||||||
default:
|
|
||||||
char_array_cat(key, CATEGORY_DEFAULT_PREFIX);
|
|
||||||
|
|
||||||
}
|
char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);
|
||||||
|
} else {
|
||||||
|
char_array_cat(key, ORDINAL_PHRASE_NAMESPACE_PREFIX);
|
||||||
|
}
|
||||||
|
|
||||||
char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
|
switch (ordinal_source.gender) {
|
||||||
|
case GENDER_MASCULINE:
|
||||||
|
char_array_cat(key, GENDER_MASCULINE_PREFIX);
|
||||||
|
break;
|
||||||
|
case GENDER_FEMININE:
|
||||||
|
char_array_cat(key, GENDER_FEMININE_PREFIX);
|
||||||
|
break;
|
||||||
|
case GENDER_NEUTER:
|
||||||
|
char_array_cat(key, GENDER_NEUTER_PREFIX);
|
||||||
|
break;
|
||||||
|
case GENDER_NONE:
|
||||||
|
default:
|
||||||
|
char_array_cat(key, GENDER_NONE_PREFIX);
|
||||||
|
}
|
||||||
|
|
||||||
char *reversed = utf8_reversed_string(ordinal_source.key);
|
switch (ordinal_source.category) {
|
||||||
char_array_cat(key, reversed);
|
case CATEGORY_PLURAL:
|
||||||
free(reversed);
|
char_array_cat(key, CATEGORY_PLURAL_PREFIX);
|
||||||
|
break;
|
||||||
|
case CATEGORY_DEFAULT:
|
||||||
|
default:
|
||||||
|
char_array_cat(key, CATEGORY_DEFAULT_PREFIX);
|
||||||
|
|
||||||
char *str_key = char_array_get_string(key);
|
}
|
||||||
|
|
||||||
if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
|
char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
|
||||||
trie_add(numex_table->trie, str_key, value);
|
|
||||||
} else {
|
char *key_str = ordinal_source.key;
|
||||||
log_warn("Key exists: %s, skipping\n", str_key);
|
|
||||||
|
if (ordinal_phrases) {
|
||||||
|
key_str = suffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *reversed = utf8_reversed_string(key_str);
|
||||||
|
char_array_cat(key, reversed);
|
||||||
|
free(reversed);
|
||||||
|
|
||||||
|
char *str_key = char_array_get_string(key);
|
||||||
|
|
||||||
|
if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
|
||||||
|
trie_add(numex_table->trie, str_key, value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user