[similarity] adding a stopword-aware acronym alignment method for matching U.N. with United Nations, Museum of Modern Art with MoMA, as well as things like University of California - Los Angeles with UCLA. All of these should work across languages, including non-Latin character sets like Cyrillic (but not ideograms, as the concept doesn't make as much sense there). Skipping tokens like "of" or "the" depends only on the stopwords dictionary being defined for a given language.

2017-12-04 15:21:09 -05:00
parent 252d5a0f37
commit cfa5b1ce42
6 changed files with 181 additions and 2 deletions
--- a/src/address_dictionary.c
+++ b/src/address_dictionary.c
@@ -35,6 +35,24 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1
 }


+/* True when address_expansion_in_dictionary() accepts, for dictionary_id, any expansion looked up via phrase.data. */
+bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) {
+    address_expansion_value_t *entry = address_dictionary_get_expansions(phrase.data);
+    /* No lookup result, or a result whose expansions array is NULL: report false. */
+    if (entry == NULL || entry->expansions == NULL) {
+        return false;
+    }
+    address_expansion_array *list = entry->expansions;
+    size_t idx = 0;
+    while (idx < list->n) {
+        if (address_expansion_in_dictionary(list->a[idx], dictionary_id)) {
+            return true;
+        }
+        idx++;
+    }
+    return false;
+}
+

 int32_t address_dictionary_next_canonical_index(void) {
    if (address_dict == NULL || address_dict->canonical == NULL) {