[dedupe] adding a function to the acronyms module to detect existing/known acronyms like MS for middle school, HS for high school, etc. Forms like MS have to be defined in the dictionaries specifically, but any acronym written like M.S. will be detected as such by the tokenizer.

2018-01-15 23:46:49 -05:00
parent 133381f439
commit 0ee18b4f6c
4 changed files with 64 additions and 0 deletions
--- a/src/acronyms.c
+++ b/src/acronyms.c
@@ -1,4 +1,62 @@
 #include "acronyms.h"
 #include "token_types.h"
 /*
  * Flags the tokens of str that already are, or belong to, a known acronym.
  *
  * On success existing_acronyms_array holds one entry per token: 1 when the
  * token's type is ACRONYM, or when the token is covered by an address
  * dictionary phrase (searched once per language) that has at least one
  * expansion whose canonical form contains a space, i.e. the short form
  * stands for a multi-word phrase. All other entries are 0.
  *
  * existing_acronyms_array: output array, resized to the token count.
  * str:                     the string the tokens were produced from.
  * token_array:             tokens of str.
  * num_languages/languages: language codes whose dictionaries are searched.
  *
  * Returns false when either array argument is NULL or the output array
  * could not be sized to the token count; true otherwise.
  */
 bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) {
    if (existing_acronyms_array == NULL || token_array == NULL) return false;
    size_t num_tokens = token_array->n;
    if (existing_acronyms_array->n != num_tokens) {
        uint32_array_resize_fixed(existing_acronyms_array, num_tokens);
        /*
         * Every loop below indexes the output by token position, so bail out
         * rather than write past a buffer that did not grow.
         * NOTE(review): presumably resize_fixed leaves n untouched on
         * allocation failure - confirm against the array implementation.
         */
        if (existing_acronyms_array->n != num_tokens) return false;
    }
    uint32_array_zero(existing_acronyms_array->a, existing_acronyms_array->n);
    uint32_t *existing_acronyms = existing_acronyms_array->a;
    token_t *tokens = token_array->a;
    /* Pass 1: tokens already typed as acronyms. */
    for (size_t i = 0; i < num_tokens; i++) {
        if (tokens[i].type == ACRONYM) {
            existing_acronyms[i] = 1;
        }
    }
    /* Pass 2: dictionary phrases that abbreviate a multi-word canonical. */
    for (size_t l = 0; l < num_languages; l++) {
        char *lang = languages[l];
        phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, token_array, lang);
        if (lang_phrases == NULL) continue;
        size_t num_lang_phrases = lang_phrases->n;
        for (size_t p = 0; p < num_lang_phrases; p++) {
            phrase_t phrase = lang_phrases->a[p];
            address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
            if (value == NULL) continue;
            address_expansion_array *expansions_array = value->expansions;
            if (expansions_array == NULL) continue;
            size_t num_expansions = expansions_array->n;
            address_expansion_t *expansions = expansions_array->a;
            bool multiword_canonical = false;
            for (size_t i = 0; i < num_expansions; i++) {
                address_expansion_t expansion = expansions[i];
                if (expansion.canonical_index == NULL_CANONICAL_INDEX) continue;
                char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                if (string_contains(canonical, " ")) {
                    /* One match is enough; further ones would repeat the same writes. */
                    multiword_canonical = true;
                    break;
                }
            }
            if (!multiword_canonical) continue;
            for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                existing_acronyms[j] = 1;
            }
        }
        phrase_array_destroy(lang_phrases);
    }
    return true;
 }
 bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
    if (stopwords_array == NULL) return false;
--- a/src/acronyms.h
+++ b/src/acronyms.h
@@ -10,6 +10,7 @@
 #include "token_types.h"
 bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
 /*
  * Fills existing_acronyms_array with one entry per token of str: 1 where the
  * token is typed ACRONYM or is covered by a dictionary phrase (for any of the
  * given languages) that has a canonical form containing a space, 0 elsewhere.
  * Returns false when either array argument is NULL.
  */
 bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages);
 phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -721,6 +721,10 @@ ssize_t string_next_hyphen_index(char *str, size_t len) {
    return -1;
 }
 /*
  * Returns true when sub occurs somewhere in str (byte-wise strstr match).
  * A NULL str or NULL sub yields false; an empty sub matches any non-NULL str.
  *
  * Defined without the inline specifier so this translation unit always
  * provides the external definition callers link against, regardless of
  * which prototypes happen to be visible here.
  */
 bool string_contains(char *str, char *sub) {
    if (str == NULL || sub == NULL) return false;
    return strstr(str, sub) != NULL;
 }
 /*
  * Returns true when string_next_hyphen_index reports a non-negative index
  * for the first len bytes of str, false otherwise.
  *
  * Defined without the inline specifier so this translation unit always
  * provides the external definition callers link against.
  */
 bool string_contains_hyphen_len(char *str, size_t len) {
    ssize_t hyphen_index = string_next_hyphen_index(str, len);
    return hyphen_index >= 0;
 }
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -107,6 +107,7 @@ bool string_is_digit(char *str, size_t len);
 bool string_is_ignorable(char *str, size_t len);
 ssize_t string_next_hyphen_index(char *str, size_t len);
 /* True when both arguments are non-NULL and sub occurs in str (strstr match). */
 bool string_contains(char *str, char *sub);
 bool string_contains_hyphen(char *str);
 bool string_contains_hyphen_len(char *str, size_t len);