[language_classification] Adding script feature for any non-Latin script. Even if the script doesn't directly identify the language, it can act as a modified intercept (all Han script addresses will share the Han feature, even if we haven't seen one of the > 80k Han characters)

2016-01-17 21:37:45 -05:00
parent b9a3230f65
commit 0dfd8d6439
1 changed files with 15 additions and 25 deletions
--- a/src/language_features.c
+++ b/src/language_features.c
@@ -213,6 +213,14 @@ static void add_token_features(khash_t(str_double) *features, char *prefix, char
    }
 }

+static void add_script_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, script_t script) {
+    char_array_clear(feature_array);
+    char_array_append(feature_array, "sc=");
+    char_array_cat_printf(feature_array, "%d", script);
+    char *feature = char_array_get_string(feature_array);
+    feature_counts_add(features, feature, 1.0);
+}
+

 khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array) {
    if (str == NULL || tokens == NULL || feature_array == NULL) return NULL;
@@ -264,55 +272,37 @@ khash_t(str_double) *extract_language_features(char *str, char *country, token_a
            tokenize_add_tokens(tokens, (const char *)normalized_str, strlen(normalized_str), keep_whitespace);

            token_t prev_token;
-            bool prev_was_phrase = false;
            char *phrase = NULL;

            // Search address dictionaries for any language
            phrase_array *phrases = search_address_dictionaries_tokens(normalized_str, tokens, NULL);
            log_debug("normalized_str=%s\n", normalized_str);

-            size_t start = 0;
-            size_t end = 0;
            size_t i, j;

            if (phrases != NULL) {
                for (i = 0; i < phrases->n; i++) {
                    phrase_t phrase = phrases->a[i];
                    log_debug("phrase (%d, %d)\n", phrase.start, phrase.len);
-
-                    end = phrase.start;
-                    for (j = start; j < end; j++) {
-                        token = tokens->a[j];
-                        log_debug("j=%zu, token.offset=%zu, token.len=%zu\n", j, token.offset, token.len);
-                        add_token_features(features, prefix, feature_array, normalized_str, token);
-                    }
-
-                    log_debug("done with start tokens\n");
                    add_phrase_feature(features, prefix, feature_array, normalized_str, phrase, tokens);
-                    start = phrase.start + phrase.len;
                }

                phrase_array_destroy(phrases);
-
            }

-            for (j = start; j < tokens->n; j++) {
-                log_debug("end token: %zu\n", j);
+            for (j = 0; j < tokens->n; j++) {
                token = tokens->a[j];
                
                add_token_features(features, prefix, feature_array, normalized_str, token);
            }

-        } else if (str_script.script != SCRIPT_UNKNOWN && str_script.script != SCRIPT_COMMON && script_langs.num_languages > 0) {
-            char_array_clear(feature_array);
-            char_array_append(feature_array, "sc=");
-            char_array_cat_printf(feature_array, "%d", str_script.script);
-            feature = char_array_get_string(feature_array);
-            log_debug("script feature=%s\n", feature);
-
-            if (feature != NULL) {
-                feature_counts_add(features, feature, 1.0);
+            if (str_script.script != SCRIPT_LATIN) {
+                add_script_feature(features, prefix, feature_array, str_script.script);
+                log_debug("script feature=%s\n", feature);                
            }
+
+        } else if (str_script.script != SCRIPT_UNKNOWN && str_script.script != SCRIPT_COMMON && script_langs.num_languages > 0) {
+            add_script_feature(features, prefix, feature_array, str_script.script);
        }

        consumed += str_script.len;