From c5ad080fb0ed3c85f83d65e64965330375a28dd3 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 29 Dec 2017 02:42:22 -0500
Subject: [PATCH] [similarity] moving stopword tokens array to a separate
 function in acronym token alignments

---
 src/acronyms.c | 54 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/acronyms.c b/src/acronyms.c
index ed91b5a6..425b64f2 100644
--- a/src/acronyms.c
+++ b/src/acronyms.c
@@ -1,5 +1,33 @@
 #include "acronyms.h"
 
+/* Flags (sets to 1) every token index covered by a stopword phrase in any of the given languages; returns NULL if the flag array cannot be allocated. */
+static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) {
+    uint32_array *stopwords_array = uint32_array_new_zeros(tokens->n);
+    if (stopwords_array == NULL) {
+        return NULL;
+    }
+    uint32_t *stopwords = stopwords_array->a;
+
+    for (size_t l = 0; l < num_languages; l++) {
+        phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, tokens, languages[l]);
+        if (lang_phrases == NULL) {
+            continue;
+        }
+        for (size_t p = 0; p < lang_phrases->n; p++) {
+            phrase_t phrase = lang_phrases->a[p];
+            if (!address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) {
+                continue;
+            }
+            for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) {
+                stopwords[stop_idx] = 1;
+            }
+        }
+        phrase_array_destroy(lang_phrases);
+    }
+
+    return stopwords_array;
+}
+
 phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
     if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
         return NULL;
@@ -28,29 +56,13 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
     token_t *t1 = tokens1->a;
     token_t *t2 = tokens2->a;
 
-    uint32_array *stopwords_array = uint32_array_new_zeros(len2);
+    uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages);
+    if (stopwords_array == NULL) {
+        return NULL;
+    }
 
     uint32_t *stopwords = stopwords_array->a;
 
-    for (size_t l = 0; l < num_languages; l++) {
-        char *lang = languages[l];
-        phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)s2, tokens2, lang);
-
-        if (lang_phrases != NULL) {
-            size_t num_lang_phrases = lang_phrases->n;
-            for (size_t p = 0; p < num_lang_phrases; p++) {
-                phrase_t phrase = lang_phrases->a[p];
-
-                if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) {
-                    for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) {
-                        stopwords[stop_idx] = 1;
-                    }
-                }
-            }
-            phrase_array_destroy(lang_phrases);
-        }
-    }
-
     ssize_t acronym_start = -1;
     ssize_t acronym_token_pos = -1;
 
@@ -136,5 +148,3 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
 
     return alignments;   
 }
-
-