From 283be99b44eb1c667b42f495a57562d1f1d8b0ff Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 24 Feb 2018 00:31:02 -0500 Subject: [PATCH] [numex] helper function to retrieve ordinal suffix lengths from a tokenized string for use in deduping --- src/expand.c | 36 ------------------------------------ src/numex.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ src/numex.h | 3 +++ 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/src/expand.c b/src/expand.c index 45b506ab..77c623cc 100644 --- a/src/expand.c +++ b/src/expand.c @@ -44,39 +44,6 @@ inline uint64_t get_normalize_string_options(libpostal_normalize_options_t optio return normalize_string_options; } - -static inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - - int32_t unichr = 0; - const uint8_t *ptr = (const uint8_t *)str; - - if (len_ordinal_suffix > 0) { - ssize_t start = 0; - size_t token_offset = token.offset; - size_t token_len = token.len; - - if (len_ordinal_suffix < token.len) { - start = token.offset + token.len - len_ordinal_suffix; - token_offset = token.offset; - token_len = token.len - len_ordinal_suffix; - } else { - start = prev_token.offset + prev_token.len; - token_offset = prev_token.offset; - token_len = prev_token.len; - } - ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); - if (prev_char_len <= 0) return 0; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(str + token_offset, token_len)) { - return 0; - } - } else { - return 0; - } - - return len_ordinal_suffix; -} - void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); @@ -174,8 +141,6 @@ void add_postprocessed_string(cstring_array *strings, char *str, libpostal_norma } - - address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); @@ -1659,7 +1624,6 @@ cstring_array *expand_address_root(char *input, libpostal_normalize_options_t op } - void expansion_array_destroy(char **expansions, size_t n) { for (size_t i = 0; i < n; i++) { free(expansions[i]); diff --git a/src/numex.c b/src/numex.c index bc342310..6edaca1a 100644 --- a/src/numex.c +++ b/src/numex.c @@ -1137,6 +1137,52 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } +size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; + + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return 0; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(str + token_offset, token_len)) { + return 0; + } + } else { + return 0; + } + + return len_ordinal_suffix; +} + +bool add_ordinal_suffix_lengths(uint32_array *suffixes, char *str, token_array *tokens_array, char *lang) { + if (suffixes == NULL || str == NULL || tokens_array == NULL) return false; + size_t n = tokens_array->n; + token_t *tokens = tokens_array->a; + token_t prev_token = NULL_TOKEN; + for (size_t i = 0; i < n; i++) { + token_t token = tokens[i]; + size_t suffix_len = valid_ordinal_suffix_len(str, token, prev_token, lang); + uint32_array_push(suffixes, (uint32_t)suffix_len); + prev_token = token; + } + return true; +} + static inline bool is_roman_numeral_char(char c) { diff --git a/src/numex.h b/src/numex.h index 1a0d89b7..538404ac 100644 --- a/src/numex.h +++ b/src/numex.h @@ -152,6 +152,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); +size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang); +bool add_ordinal_suffix_lengths(uint32_array *suffixes, char *str, token_array *tokens_array, char *lang); + bool is_likely_roman_numeral(char *str); bool is_likely_roman_numeral_len(char *str, size_t len);