From b13462f8efaedf0c3ea45506de815e5b7d38201a Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 9 Jan 2016 03:42:57 -0500
Subject: [PATCH] [language_classifier] Features for address language
 classification: quadgrams for most languages, unigrams for ideographic
 characters, script for single-script languages like Thai, Hebrew, etc.

---
 src/language_features.c | 325 ++++++++++++++++++++++++++++++++++++++++
 src/language_features.h |  16 ++
 2 files changed, 341 insertions(+)
 create mode 100644 src/language_features.c
 create mode 100644 src/language_features.h

diff --git a/src/language_features.c b/src/language_features.c
new file mode 100644
index 00000000..8ac11725
--- /dev/null
+++ b/src/language_features.c
@@ -0,0 +1,325 @@
+#include "language_features.h"
+#include "language_classifier.h"
+#include "address_dictionary.h"
+#include "features.h"
+#include "normalize.h"
+#include "scanner.h"
+#include "unicode_scripts.h"
+
+#define UNIGRAMS 1   /* n-gram sizes; this file passes QUADGRAMS to add_ngram_features() */
+#define BIGRAMS 2
+#define QUADGRAMS 4
+#define OCTAGRAMS 8
+/* Option masks are parenthesized so they stay intact when expanded inside a larger expression. */
+#define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
+#define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)
+
+/* Runs normalize_string_latin over all of str with the classifier's string options; NULL in gives NULL out. */
+inline char *language_classifier_normalize_string(char *str) {
+    return str == NULL ? NULL : normalize_string_latin(str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS);
+}
+
+/* Strips array's NUL byte, then adds the normalized token for word tokens or a single space for any other token. */
+inline void language_classifier_normalize_token(char_array *array, char *str, token_t token) {
+    char_array_strip_nul_byte(array);
+    if (!is_word_token(token.type)) {
+        char_array_add(array, " ");
+        return;
+    }
+    add_normalized_token(array, str, token, LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS);
+}
+
+/* Appends prefix followed by NAMESPACE_SEPARATOR_CHAR to array; does nothing when prefix is NULL. */
+static inline void append_prefix(char_array *array, char *prefix) {
+    if (prefix == NULL) return;
+    char_array_append(array, prefix);
+    char_array_append(array, NAMESPACE_SEPARATOR_CHAR);
+}
+
+/* Counts the whole token, str[token.offset .. token.offset + token.len), as one feature under prefix. */
+static inline void add_full_token_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
+    if (features == NULL || feature_array == NULL) return;
+    char *token_text = str + token.offset;
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    char_array_add_len(feature_array, token_text, token.len);
+    if (feature_array->n > 1) {  // n <= 1: the buffer holds at most one element, so there is nothing to count
+        char *feature = char_array_get_string(feature_array);
+        log_debug("full token feature=%s\n", feature);
+        feature_counts_add(features, feature, 1.0);
+    }
+}
+
+
+/*
+ * Counts one feature per sliding window of n characters in a word token.
+ *
+ * A window that does not start at the token's first character gets a leading
+ * "_"; one that does not end at its last character gets a trailing "_". A token
+ * with fewer than n characters is counted whole via add_full_token_feature().
+ * Nothing is added for non-word tokens, n == 0, or NULL features/feature_array.
+ * prefix is an optional namespace (may be NULL); feature_array is scratch space
+ * that is overwritten; str is the string token.offset/token.len index into.
+ */
+static void add_ngram_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token, size_t n) {
+    if (features == NULL || feature_array == NULL) return;
+    if (n == 0 || !is_word_token(token.type)) return;
+
+    size_t lengths[n];        // byte length of each character in the current window
+    size_t num_chars = 0;     // characters collected so far, capped at n
+    size_t offset = token.offset;
+    size_t len = token.len;
+    uint8_t *ptr = (uint8_t *)str + offset;
+    size_t idx = 0;           // bytes of the token consumed so far
+    size_t gram_len = 0;      // byte length of the current window
+    size_t gram_offset = 0;   // byte offset of the current window within the token
+    bool beginning = true;    // true until the first window has been emitted
+
+    log_debug("len = %zu\n", len);
+
+    while (idx < len) {
+        int32_t ch;
+        // Decode against the bytes still left in the token (not its full length),
+        // so a sequence truncated by the token boundary is rejected, not read past it.
+        ssize_t char_len = utf8proc_iterate(ptr, len - idx, &ch);
+        if (char_len <= 0 || ch == 0) break;
+        size_t char_bytes = (size_t)char_len;
+        // Not at min characters yet
+        if (num_chars < n) {
+            lengths[num_chars] = char_bytes;
+            num_chars++;
+            gram_len += char_bytes;
+        }
+
+        // We have a full gram of size n
+        if (num_chars == n) {
+            char_array_clear(feature_array);
+            append_prefix(feature_array, prefix);
+
+            if (beginning) {
+                beginning = false;
+            } else {
+                // Slide the window: drop its first character, take in the new one
+                char_array_append(feature_array, "_");
+                gram_len -= lengths[0];
+                gram_offset += lengths[0];
+                gram_len += char_bytes;
+                for (size_t i = 1; i < n; i++) {
+                    lengths[i - 1] = lengths[i];
+                }
+                lengths[n - 1] = char_bytes;
+            }
+
+            char_array_append_len(feature_array, str + offset + gram_offset, gram_len);
+            if (idx + char_bytes < len) {
+                char_array_append(feature_array, "_");
+            }
+            char_array_terminate(feature_array);
+            // Guarded with an if, not `continue`: skipping the advance below would loop forever
+            if (feature_array->n > 1) {
+                char *feature = char_array_get_string(feature_array);
+                log_debug("feature=%s\n", feature);
+                feature_counts_add(features, feature, 1.0);
+            }
+        }
+
+        idx += char_bytes;
+        ptr += char_bytes;
+    }
+
+    if (num_chars < n) {
+        add_full_token_feature(features, prefix, feature_array, str, token);
+    }
+}
+
+/* Counts a dictionary phrase's text (its tokens, space-separated unless ideographic) as one feature under prefix. */
+static void add_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_array *tokens) {
+    if (features == NULL || feature_array == NULL || tokens == NULL || tokens->n == 0) return;
+    size_t phrase_end = phrase.start + phrase.len;
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    for (size_t idx = phrase.start; idx < phrase_end; idx++) {
+        token_t tok = tokens->a[idx];
+        char_array_append_len(feature_array, str + tok.offset, tok.len);
+        // A space follows every token except the phrase's last one and ideographic ones
+        if (idx + 1 < phrase_end && !is_ideographic(tok.type)) {
+            char_array_append(feature_array, " ");
+        }
+    }
+    char_array_terminate(feature_array);
+    if (feature_array->n > 1) {
+        feature_counts_add(features, char_array_get_string(feature_array), 1.0);
+    }
+}
+
+
+/* Counts "pfx=" followed by the token's first phrase.len bytes as one feature under prefix. */
+static void add_prefix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
+    if (features == NULL || feature_array == NULL) return;
+    // Only a non-empty match strictly shorter than the token qualifies
+    if (phrase.len == 0 || phrase.len >= token.len) return;
+    char *affix = str + token.offset;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    char_array_append(feature_array, "pfx=");
+    char_array_add_len(feature_array, affix, phrase.len);
+    if (feature_array->n > 1) {
+        feature_counts_add(features, char_array_get_string(feature_array), 1.0);
+    }
+}
+
+
+/* Counts "sfx=" followed by the token's last phrase.len bytes as one feature under prefix. */
+static void add_suffix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
+    if (features == NULL || feature_array == NULL) return;
+    // Only a non-empty match strictly shorter than the token qualifies
+    if (phrase.len == 0 || phrase.len >= token.len) return;
+    char *affix = str + token.offset + token.len - phrase.len;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    char_array_append(feature_array, "sfx=");
+    char_array_add_len(feature_array, affix, phrase.len);
+    if (feature_array->n > 1) {
+        feature_counts_add(features, char_array_get_string(feature_array), 1.0);
+    }
+}
+
+
+/* Adds a word token's features: dictionary prefix/suffix matches, then quadgrams or, if ideographic, the token itself. */
+static void add_token_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
+    // Non-words don't convey any language information
+    // TODO: ordinal number suffixes may be worth investigating
+    if (!is_word_token(token.type)) return;
+
+    char *token_str = str + token.offset;
+    phrase_t prefix_phrase = search_address_dictionaries_prefix(token_str, token.len, NULL);
+    if (prefix_phrase.len > 0 && prefix_phrase.len < token.len) {
+        add_prefix_phrase_feature(features, prefix, feature_array, str, prefix_phrase, token);
+    }
+
+    phrase_t suffix_phrase = search_address_dictionaries_suffix(token_str, token.len, NULL);
+    if (suffix_phrase.len > 0 && suffix_phrase.len < token.len) {
+        add_suffix_phrase_feature(features, prefix, feature_array, str, suffix_phrase, token);
+    }
+
+    if (is_ideographic(token.type)) {
+        // For ideographic scripts, use single ideograms
+        add_full_token_feature(features, prefix, feature_array, str, token);
+    } else {
+        // Add quadgram features
+        add_ngram_features(features, prefix, feature_array, str, token, QUADGRAMS);
+    }
+}
+
+
+/*
+ * Extracts language-classification features from an address string.
+ *
+ * str is walked as consecutive runs, each as returned by get_string_script():
+ *  - script with more than one language: the run is tokenized, each token is
+ *    normalized, and the normalized text is tokenized again. Address-dictionary
+ *    phrases found in it are counted via add_phrase_feature(); every token not
+ *    covered by a phrase goes through add_token_features(). country, when
+ *    non-NULL, namespaces these features.
+ *  - known, non-common script with a single language: one "sc=<script id>"
+ *    feature is counted (without the country namespace).
+ *  - anything else adds no feature.
+ *
+ * tokens and feature_array are caller-provided scratch buffers and are
+ * overwritten. Returns a new feature -> count table owned by the caller, or NULL
+ * if an argument is NULL, str is empty, or an allocation fails.
+ */
+khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array) {
+    if (str == NULL || tokens == NULL || feature_array == NULL) return NULL;
+
+    char *prefix = country;
+    size_t len = strlen(str);
+    if (len == 0) return NULL;
+
+    char_array *normalized = char_array_new_size(len);
+    if (normalized == NULL) return NULL;
+
+    khash_t(str_double) *features = kh_init(str_double);
+    if (features == NULL) {
+        char_array_destroy(normalized);
+        return NULL;
+    }
+
+    size_t consumed = 0;
+    while (consumed < len) {
+        string_script_t str_script = get_string_script(str, len - consumed);
+        log_debug("str=%s, len=%zu, consumed=%zu, script_len=%zu\n", str, strlen(str), consumed, str_script.len);
+
+        // A zero-length run would never advance consumed; stop rather than spin forever
+        if (str_script.len == 0) break;
+
+        script_languages_t script_langs = get_script_languages(str_script.script);
+
+        if (script_langs.num_languages > 1) {
+            // First pass: tokenize the raw run, keeping whitespace tokens
+            token_array_clear(tokens);
+            bool keep_whitespace = true;
+            tokenize_add_tokens(tokens, (const char *)str, str_script.len, keep_whitespace);
+
+            // Rebuild the run as normalized text (non-word tokens become a space)
+            char_array_clear(normalized);
+            for (size_t i = 0; i < tokens->n; i++) {
+                language_classifier_normalize_token(normalized, str, tokens->a[i]);
+            }
+            char_array_terminate(normalized);
+
+            // Second pass: tokenize the normalized text, dropping whitespace tokens
+            char *normalized_str = char_array_get_string(normalized);
+            token_array_clear(tokens);
+            keep_whitespace = false;
+            tokenize_add_tokens(tokens, (const char *)normalized_str, strlen(normalized_str), keep_whitespace);
+
+            // Search address dictionaries for any language
+            phrase_array *phrases = search_address_dictionaries_tokens(normalized_str, tokens, NULL);
+            log_debug("normalized_str=%s\n", normalized_str);
+
+            size_t start = 0;   // index of the first token not yet handled
+            if (phrases != NULL) {
+                for (size_t i = 0; i < phrases->n; i++) {
+                    phrase_t phrase = phrases->a[i];
+                    log_debug("phrase (%d, %d)\n", phrase.start, phrase.len);
+                    // Tokens between the previous phrase and this one
+                    size_t end = phrase.start;
+                    for (size_t j = start; j < end; j++) {
+                        token_t token = tokens->a[j];
+                        log_debug("j=%zu, token.offset=%zu, token.len=%zu\n", j, token.offset, token.len);
+                        add_token_features(features, prefix, feature_array, normalized_str, token);
+                    }
+
+                    log_debug("done with start tokens\n");
+                    add_phrase_feature(features, prefix, feature_array, normalized_str, phrase, tokens);
+                    start = phrase.start + phrase.len;
+                }
+                phrase_array_destroy(phrases);
+            }
+
+            // Tokens after the last phrase, or every token when no phrase matched
+            for (size_t j = start; j < tokens->n; j++) {
+                log_debug("end token: %zu\n", j);
+                add_token_features(features, prefix, feature_array, normalized_str, tokens->a[j]);
+            }
+        } else if (str_script.script != SCRIPT_UNKNOWN && str_script.script != SCRIPT_COMMON && script_langs.num_languages > 0) {
+            char_array_clear(feature_array);
+            char_array_append(feature_array, "sc=");
+            char_array_cat_printf(feature_array, "%d", str_script.script);
+            char *feature = char_array_get_string(feature_array);
+            log_debug("script feature=%s\n", feature);
+            if (feature != NULL) {
+                feature_counts_add(features, feature, 1.0);
+            }
+        }
+
+        consumed += str_script.len;
+        str += str_script.len;
+    }
+
+    char_array_destroy(normalized);
+    return features;
+}
diff --git a/src/language_features.h b/src/language_features.h
new file mode 100644
index 00000000..b12bf3af
--- /dev/null
+++ b/src/language_features.h
@@ -0,0 +1,16 @@
+#ifndef LANGUAGE_FEATURES_H
+#define LANGUAGE_FEATURES_H
+
+#include <stdlib.h>
+
+#include "collections.h"
+#include "string_utils.h"
+#include "tokens.h"
+
+/* Fixed-option wrappers: whole-string normalization, and per-token normalization added to array. */
+char *language_classifier_normalize_string(char *str);
+void language_classifier_normalize_token(char_array *array, char *str, token_t token);
+/* Builds the feature -> count table for str; tokens and feature_array are scratch buffers. NULL on NULL or empty input. */
+khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array);
+
+#endif
\ No newline at end of file