diff --git a/src/acronyms.c b/src/acronyms.c
new file mode 100644
index 00000000..ed91b5a6
--- /dev/null
+++ b/src/acronyms.c
@@ -0,0 +1,184 @@
+#include "acronyms.h"
+
+/*
+ * Align acronym-like tokens of the shorter token sequence with token spans
+ * of the longer one.
+ *
+ * The inputs are swapped if needed so that tokens1 is the sequence with
+ * fewer tokens. Every multi-character, non-ideographic word token of
+ * tokens1 is matched codepoint by codepoint (case-insensitively) against
+ * the first codepoint of successive tokens of tokens2. Once a match is
+ * underway, tokens2 entries that are punctuation or that belong to a
+ * stopword phrase in any of the given languages are skipped, and a single
+ * period or hyphen after a matched letter of the acronym is skipped too.
+ *
+ * Returns a phrase_array holding one phrase per alignment: the first two
+ * members are the start and length of the token span in the longer
+ * sequence and the third is the index of the acronym token in the shorter
+ * sequence. Returns NULL if an argument is NULL, a sequence is empty, both
+ * sequences have the same number of tokens, or nothing aligned. A non-NULL
+ * result comes from phrase_array_new(); presumably the caller frees it
+ * with phrase_array_destroy().
+ */
+phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
+    if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
+        return NULL;
+    }
+
+    size_t len1 = tokens1->n;
+    size_t len2 = tokens2->n;
+    if (len1 == 0 || len2 == 0 || len1 == len2) return NULL;
+
+    // Make (s1, tokens1) the side with fewer tokens, i.e. the candidate acronym
+    if (len1 > len2) {
+        const char *tmp_s = s1;
+        s1 = s2;
+        s2 = tmp_s;
+
+        token_array *tmp_t = tokens1;
+        tokens1 = tokens2;
+        tokens2 = tmp_t;
+
+        size_t tmp_l = len1;
+        len1 = len2;
+        len2 = tmp_l;
+    }
+
+    phrase_array *alignments = NULL;
+
+    token_t *t1 = tokens1->a;
+    token_t *t2 = tokens2->a;
+
+    uint32_array *stopwords_array = uint32_array_new_zeros(len2);
+    if (stopwords_array == NULL) {
+        return NULL;
+    }
+
+    // stopwords[j] != 0 marks token j of the longer sequence as part of a stopword phrase
+    uint32_t *stopwords = stopwords_array->a;
+
+    for (size_t l = 0; l < num_languages; l++) {
+        char *lang = languages[l];
+        phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)s2, tokens2, lang);
+
+        if (lang_phrases != NULL) {
+            size_t num_lang_phrases = lang_phrases->n;
+            for (size_t p = 0; p < num_lang_phrases; p++) {
+                phrase_t phrase = lang_phrases->a[p];
+
+                if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) {
+                    // The extra bound keeps writes inside the len2-element array
+                    for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len && stop_idx < len2; stop_idx++) {
+                        stopwords[stop_idx] = 1;
+                    }
+                }
+            }
+            phrase_array_destroy(lang_phrases);
+        }
+    }
+
+    uint8_t *ptr1 = (uint8_t *)s1;
+    uint8_t *ptr2 = (uint8_t *)s2;
+
+    for (size_t i = 0; i < len1; i++) {
+        token_t ti = t1[i];
+        uint8_t *ti_str = ptr1 + ti.offset;
+
+        int32_t first_c1 = 0;
+        ssize_t first_c1_len = utf8proc_iterate(ti_str, ti.len, &first_c1);
+        if (first_c1_len <= 0 || first_c1 == 0) {
+            break;
+        }
+
+        // Make sure it's a non-ideographic word. Single letter abbreviations will be captured by other methods
+        if (!is_word_token(ti.type) || is_ideographic(ti.type) || ti.len == first_c1_len) {
+            continue;
+        }
+
+        // Match state is per acronym token, so a partial match left over from
+        // the previous token can never leak into this one
+        ssize_t acronym_start = -1;
+        ssize_t acronym_token_pos = -1;
+
+        // c1 is the codepoint of ti still to be matched; ti_pos is its byte offset
+        int32_t c1 = first_c1;
+        ssize_t c1_len = first_c1_len;
+        size_t ti_pos = 0;
+
+        for (size_t j = 0; j < len2; j++) {
+            token_t tj = t2[j];
+            int32_t c2 = 0;
+            ssize_t c2_len = utf8proc_iterate(ptr2 + tj.offset, tj.len, &c2);
+            if (c2_len <= 0) {
+                break;
+            }
+
+            if (utf8proc_tolower(c1) == utf8proc_tolower(c2)) {
+                ti_pos += c1_len;
+                if (acronym_start < 0) {
+                    acronym_start = j;
+                    acronym_token_pos = 0;
+                }
+                acronym_token_pos++;
+                // Only decode what is left of this token, never bytes past its end
+                if (ti_pos < ti.len) {
+                    c1_len = utf8proc_iterate(ti_str + ti_pos, ti.len - ti_pos, &c1);
+                    if (c1_len <= 0 || c1 == 0) {
+                        break;
+                    }
+                }
+            } else if (stopwords[j] && acronym_token_pos > 0) {
+                continue;
+            } else if (is_punctuation(tj.type) && acronym_token_pos > 0) {
+                continue;
+            } else if (ti_pos < ti.len) {
+                // Mismatch: drop the partial match and start over from the
+                // first codepoint of the acronym token
+                acronym_token_pos = -1;
+                acronym_start = -1;
+                ti_pos = 0;
+                c1 = first_c1;
+                c1_len = first_c1_len;
+                continue;
+            }
+
+            if ((utf8_is_period(c1) || utf8_is_hyphen(c1)) && ti_pos < ti.len) {
+                ti_pos += c1_len;
+                if (ti_pos < ti.len) {
+                    c1_len = utf8proc_iterate(ti_str + ti_pos, ti.len - ti_pos, &c1);
+                    if (c1_len <= 0 || c1 == 0) {
+                        break;
+                    }
+                }
+            }
+
+            if (ti_pos == ti.len) {
+                // got alignment
+                if (alignments == NULL) {
+                    alignments = phrase_array_new();
+                    if (alignments == NULL) {
+                        goto done;
+                    }
+                }
+
+                phrase_t phrase = (phrase_t){acronym_start, j - acronym_start + 1, i};
+                phrase_array_push(alignments, phrase);
+
+                // Look for further alignments of the same token from its first codepoint
+                ti_pos = 0;
+                acronym_token_pos = -1;
+                acronym_start = -1;
+                c1 = first_c1;
+                c1_len = first_c1_len;
+            }
+        }
+
+    }
+
+done:
+    uint32_array_destroy(stopwords_array);
+
+    return alignments;
+}
+
+
diff --git a/src/acronyms.h b/src/acronyms.h
new file mode 100644
index 00000000..5c61002e
--- /dev/null
+++ b/src/acronyms.h
@@ -0,0 +1,15 @@
+#ifndef ACRONYMS_H
+#define ACRONYMS_H
+
+#include
+#include
+
+#include "address_dictionary.h"
+#include "collections.h"
+#include "tokens.h"
+#include "token_types.h"
+
+phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
+
+
+#endif
\ No newline at end of
file
diff --git a/src/address_dictionary.c b/src/address_dictionary.c
index fd3fe471..957306b8 100644
--- a/src/address_dictionary.c
+++ b/src/address_dictionary.c
@@ -35,6 +35,28 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1
 }
 
 
+/*
+ * Report whether any expansion stored for a phrase belongs to a dictionary.
+ *
+ * Looks up the expansions registered under phrase.data and returns true as
+ * soon as address_expansion_in_dictionary() accepts one of them for
+ * dictionary_id. Returns false when the lookup yields no value, the value
+ * has no expansion array, or no expansion matches.
+ */
+bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) {
+    address_expansion_value_t *entry = address_dictionary_get_expansions(phrase.data);
+    address_expansion_array *candidates = (entry != NULL) ? entry->expansions : NULL;
+    if (candidates == NULL) {
+        return false;
+    }
+
+    bool found = false;
+    for (size_t idx = 0; idx < candidates->n && !found; idx++) {
+        found = address_expansion_in_dictionary(candidates->a[idx], dictionary_id);
+    }
+    return found;
+}
+
 
 int32_t address_dictionary_next_canonical_index(void) {
     if (address_dict == NULL || address_dict->canonical == NULL) {
diff --git a/src/address_dictionary.h b/src/address_dictionary.h
index 1a80ed6c..0ee7934f 100644
--- a/src/address_dictionary.h
+++ b/src/address_dictionary.h
@@ -69,6 +69,7 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang);
 
 address_expansion_value_t *address_dictionary_get_expansions(uint32_t i);
 bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id);
+bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id);
 char *address_dictionary_get_canonical(uint32_t index);
 int32_t address_dictionary_next_canonical_index(void);
 bool address_dictionary_add_canonical(char *canonical);
diff --git a/src/string_utils.c b/src/string_utils.c
index fcd35d74..567c2213 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -314,6 +314,12 @@ inline bool utf8_is_hyphen(int32_t ch) {
     return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212;
 }
 
+#define PERIOD_CODEPOINT 46
+
+inline bool utf8_is_period(int32_t codepoint) {
+    return codepoint == PERIOD_CODEPOINT;
+}
+ inline bool utf8_is_punctuation(int cat) { return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \ || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \ @@ -703,8 +709,6 @@ ssize_t string_next_codepoint(char *str, uint32_t codepoint) { return string_next_codepoint_len(str, codepoint, strlen(str)); } -#define PERIOD_CODEPOINT 46 - ssize_t string_next_period_len(char *str, size_t len) { return string_next_codepoint_len(str, PERIOD_CODEPOINT, len); } diff --git a/src/string_utils.h b/src/string_utils.h index eb27651f..86a018d8 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -91,6 +91,7 @@ uint32_array *unicode_codepoints(const char *str); bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); bool utf8_is_hyphen(int32_t ch); +bool utf8_is_period(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); bool utf8_is_digit(int cat);