From cfa5b1ce42ff908a26aecab35b7c22335857d3b6 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 4 Dec 2017 15:21:09 -0500 Subject: [PATCH] [similarity] adding a stopword-aware acronym alignment method for matching U.N. with United Nations, Museum of Modern Art with MoMA, as well as things like University of California - Los Angeles with UCLA. All of these should work across languages, including non-Latin character sets like Cyrillic (but not ideograms as the concept doesn't make as much sense there). Skipping tokens like "of" or "the" depends only on the stopwords dictionary being defined for a given language. --- src/acronyms.c | 140 +++++++++++++++++++++++++++++++++++++++ src/acronyms.h | 15 +++++ src/address_dictionary.c | 18 +++++ src/address_dictionary.h | 1 + src/string_utils.c | 8 ++- src/string_utils.h | 1 + 6 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 src/acronyms.c create mode 100644 src/acronyms.h diff --git a/src/acronyms.c b/src/acronyms.c new file mode 100644 index 00000000..ed91b5a6 --- /dev/null +++ b/src/acronyms.c @@ -0,0 +1,140 @@ +#include "acronyms.h" + +phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { + if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { + return NULL; + } + + size_t len1 = tokens1->n; + size_t len2 = tokens2->n; + if (len1 == 0 || len2 == 0 || len1 == len2) return NULL; + + if (len1 > len2) { + const char *tmp_s = s1; + s1 = s2; + s2 = tmp_s; + + token_array *tmp_t = tokens1; + tokens1 = tokens2; + tokens2 = tmp_t; + + size_t tmp_l = len1; + len1 = len2; + len2 = tmp_l; + } + + phrase_array *alignments = NULL; + + token_t *t1 = tokens1->a; + token_t *t2 = tokens2->a; + + uint32_array *stopwords_array = uint32_array_new_zeros(len2); + + uint32_t *stopwords = stopwords_array->a; + + for (size_t l = 0; l < num_languages; l++) { + char *lang = languages[l]; + phrase_array *lang_phrases = 
search_address_dictionaries_tokens((char *)s2, tokens2, lang); + + if (lang_phrases != NULL) { + size_t num_lang_phrases = lang_phrases->n; + for (size_t p = 0; p < num_lang_phrases; p++) { + phrase_t phrase = lang_phrases->a[p]; + + if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { + for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { + stopwords[stop_idx] = 1; + } + } + } + phrase_array_destroy(lang_phrases); + } + } + + ssize_t acronym_start = -1; + ssize_t acronym_token_pos = -1; + + uint8_t *ptr1 = (uint8_t *)s1; + uint8_t *ptr2 = (uint8_t *)s2; + + int32_t c1, c2; + ssize_t c1_len; + ssize_t c2_len; + + size_t t2_consumed = 0; + + for (size_t i = 0; i < len1; i++) { + token_t ti = t1[i]; + + c1_len = utf8proc_iterate(ptr1 + ti.offset, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + break; + } + + // Make sure it's a non-ideographic word. Single letter abbreviations will be captured by other methods + if (!is_word_token(ti.type) || is_ideographic(ti.type) || ti.len == c1_len) { + acronym_token_pos = -1; + continue; + } + + size_t ti_pos = 0; + + for (size_t j = t2_consumed; j < len2; j++) { + token_t tj = t2[j]; + c2_len = utf8proc_iterate(ptr2 + tj.offset, tj.len, &c2); + if (c2_len <= 0) { + break; + } + + if (utf8proc_tolower(c1) == utf8proc_tolower(c2)) { + ti_pos += c1_len; + if (acronym_start < 0) { + acronym_start = j; + acronym_token_pos = 0; + } + acronym_token_pos++; + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + } else if (stopwords[j] && acronym_token_pos > 0) { + continue; + } else if (is_punctuation(tj.type) && acronym_token_pos > 0) { + continue; + } else if (ti_pos < ti.len) { + acronym_token_pos = -1; + acronym_start = -1; + ti_pos = 0; + continue; + } + + if ((utf8_is_period(c1) || utf8_is_hyphen(c1)) && ti_pos < ti.len) { + ti_pos += c1_len; + if (ti_pos < ti.len) { + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + 
break; + } + } + } + + if (ti_pos == ti.len) { + phrase_t phrase = (phrase_t){acronym_start, j - acronym_start + 1, i}; + // got alignment + if (alignments == NULL) { + alignments = phrase_array_new(); + } + + phrase_array_push(alignments, phrase); + + ti_pos = 0; + acronym_token_pos = -1; + acronym_start = -1; + } + } + + } + + uint32_array_destroy(stopwords_array); + + return alignments; +} + + diff --git a/src/acronyms.h b/src/acronyms.h new file mode 100644 index 00000000..5c61002e --- /dev/null +++ b/src/acronyms.h @@ -0,0 +1,15 @@ +#ifndef ACRONYMS_H +#define ACRONYMS_H + +#include +#include + +#include "address_dictionary.h" +#include "collections.h" +#include "tokens.h" +#include "token_types.h" + +phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); + + +#endif \ No newline at end of file diff --git a/src/address_dictionary.c b/src/address_dictionary.c index fd3fe471..957306b8 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -35,6 +35,24 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1 } +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (address_expansion_in_dictionary(expansion, dictionary_id)) { + return true; + } + } + return false; +} + int32_t address_dictionary_next_canonical_index(void) { if (address_dict == NULL || address_dict->canonical == NULL) { diff --git a/src/address_dictionary.h b/src/address_dictionary.h index 1a80ed6c..0ee7934f 100644 --- a/src/address_dictionary.h +++ 
b/src/address_dictionary.h @@ -69,6 +69,7 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); bool address_dictionary_add_canonical(char *canonical); diff --git a/src/string_utils.c b/src/string_utils.c index fcd35d74..567c2213 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -314,6 +314,12 @@ inline bool utf8_is_hyphen(int32_t ch) { return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212; } +#define PERIOD_CODEPOINT 46 + +inline bool utf8_is_period(int32_t codepoint) { + return codepoint == PERIOD_CODEPOINT; +} + inline bool utf8_is_punctuation(int cat) { return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \ || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \ @@ -703,8 +709,6 @@ ssize_t string_next_codepoint(char *str, uint32_t codepoint) { return string_next_codepoint_len(str, codepoint, strlen(str)); } -#define PERIOD_CODEPOINT 46 - ssize_t string_next_period_len(char *str, size_t len) { return string_next_codepoint_len(str, PERIOD_CODEPOINT, len); } diff --git a/src/string_utils.h b/src/string_utils.h index eb27651f..86a018d8 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -91,6 +91,7 @@ uint32_array *unicode_codepoints(const char *str); bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); bool utf8_is_hyphen(int32_t ch); +bool utf8_is_period(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); bool utf8_is_digit(int cat);