From 0ee18b4f6c614ca927ce837f1f09b0f180987077 Mon Sep 17 00:00:00 2001
From: Al
Date: Mon, 15 Jan 2018 23:46:49 -0500
Subject: [PATCH] [dedupe] adding a function to acronyms module to detect
 existing/known acronyms like MS for middle school, HS for high school, etc.
 Forms like MS have to be defined in the dictionaries specifically but any
 acronym written like M.S. will be detected as such by the tokenizer

---
 src/acronyms.c     | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 src/acronyms.h     |  1 +
 src/string_utils.c |  5 +++++
 src/string_utils.h |  1 +
 4 files changed, 81 insertions(+)

diff --git a/src/acronyms.c b/src/acronyms.c
index 64a02746..644f2109 100644
--- a/src/acronyms.c
+++ b/src/acronyms.c
@@ -1,4 +1,78 @@
 #include "acronyms.h"
+#include "token_types.h"
+
+/*
+ * Fill existing_acronyms_array with one flag per token: 1 if the token is part
+ * of an already-known acronym, 0 otherwise.
+ *
+ * A token is flagged when its type is ACRONYM, or when it lies inside a phrase
+ * returned by search_address_dictionaries_tokens (for any of the given
+ * languages) that has an expansion whose canonical string contains a space.
+ *
+ * Returns false if existing_acronyms_array or token_array is NULL, or if the
+ * output array could not be sized to one entry per token; true otherwise.
+ */
+bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) {
+    if (existing_acronyms_array == NULL || token_array == NULL) return false;
+    size_t num_tokens = token_array->n;
+    if (existing_acronyms_array->n != num_tokens) {
+        uint32_array_resize_fixed(existing_acronyms_array, num_tokens);
+        // NOTE(review): assumes a successful resize leaves n == num_tokens.
+        // The loops below index up to num_tokens, so bail out on a mismatch.
+        if (existing_acronyms_array->n != num_tokens) return false;
+    }
+
+    uint32_array_zero(existing_acronyms_array->a, existing_acronyms_array->n);
+    uint32_t *existing_acronyms = existing_acronyms_array->a;
+
+    token_t *tokens = token_array->a;
+    for (size_t i = 0; i < num_tokens; i++) {
+        token_t token = tokens[i];
+        if (token.type == ACRONYM) {
+            existing_acronyms[i] = 1;
+        }
+    }
+
+    for (size_t l = 0; l < num_languages; l++) {
+        char *lang = languages[l];
+        phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, token_array, lang);
+
+        if (lang_phrases != NULL) {
+            size_t num_lang_phrases = lang_phrases->n;
+            for (size_t p = 0; p < num_lang_phrases; p++) {
+                phrase_t phrase = lang_phrases->a[p];
+
+                address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
+                if (value == NULL) continue;
+
+                address_expansion_array *expansions_array = value->expansions;
+                if (expansions_array == NULL) continue;
+
+                size_t num_expansions = expansions_array->n;
+                address_expansion_t *expansions = expansions_array->a;
+
+                for (size_t i = 0; i < num_expansions; i++) {
+                    address_expansion_t expansion = expansions[i];
+                    if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
+                        char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
+                        // A space in the canonical form means it is multi-word.
+                        if (string_contains(canonical, " ")) {
+                            for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
+                                existing_acronyms[j] = 1;
+                            }
+                            // All tokens of this phrase are flagged; stop scanning.
+                            break;
+                        }
+                    }
+                }
+
+            }
+            phrase_array_destroy(lang_phrases);
+        }
+    }
+
+    return true;
+}
 
 bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
     if (stopwords_array == NULL) return false;
diff --git a/src/acronyms.h b/src/acronyms.h
index 8b6a0dd9..2c2d7d67 100644
--- a/src/acronyms.h
+++ b/src/acronyms.h
@@ -10,6 +10,7 @@
 #include "token_types.h"
 
 bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
+bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages);
 
 phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
 
diff --git a/src/string_utils.c b/src/string_utils.c
index 45cc1373..950e3004 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -721,6 +721,11 @@ ssize_t string_next_hyphen_index(char *str, size_t len) {
     return -1;
 }
 
+// True when both arguments are non-NULL and sub occurs in str (per strstr).
+inline bool string_contains(char *str, char *sub) {
+    return str != NULL && sub != NULL && strstr(str, sub) != NULL;
+}
+
 inline bool string_contains_hyphen_len(char *str, size_t len) {
     return string_next_hyphen_index(str, len) >= 0;
 }
diff --git a/src/string_utils.h b/src/string_utils.h
index 1ddcc626..30c583b6 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -107,6 +107,7 @@ bool string_is_digit(char *str, size_t len);
 bool string_is_ignorable(char *str, size_t len);
 
 ssize_t string_next_hyphen_index(char *str, size_t len);
+bool string_contains(char *str, char *sub);
 bool string_contains_hyphen(char *str);
 bool string_contains_hyphen_len(char *str, size_t len);
 