From b13462f8efaedf0c3ea45506de815e5b7d38201a Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 9 Jan 2016 03:42:57 -0500
Subject: [PATCH] [language_classifier] Features for address languages
 classification, quadgrams for most languages, unigrams for ideographic
 characters, script for single-script languages like Thai, Hebrew, etc.

---
 src/language_features.c | 372 ++++++++++++++++++++++++++++++++++++++++
 src/language_features.h |  16 ++
 2 files changed, 388 insertions(+)
 create mode 100644 src/language_features.c
 create mode 100644 src/language_features.h

diff --git a/src/language_features.c b/src/language_features.c
new file mode 100644
index 00000000..8ac11725
--- /dev/null
+++ b/src/language_features.c
@@ -0,0 +1,372 @@
+#include "language_features.h"
+#include "language_classifier.h"
+#include "address_dictionary.h"
+#include "features.h"
+#include "normalize.h"
+#include "scanner.h"
+#include "unicode_scripts.h"
+
+#define UNIGRAMS 1
+#define BIGRAMS 2
+#define QUADGRAMS 4
+#define OCTAGRAMS 8
+
+// Parenthesized so each flag set expands as a single operand wherever it is used
+#define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
+#define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)
+
+/*
+ * Normalize a NUL-terminated string with the classifier's string options by
+ * calling normalize_string_latin. Returns that call's result, or NULL when
+ * str is NULL (strlen must not see a NULL pointer).
+ */
+inline char *language_classifier_normalize_string(char *str) {
+    if (str == NULL) return NULL;
+    return normalize_string_latin(str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS);
+}
+
+/*
+ * Append the normalized form of one token of str to array. Word tokens go
+ * through add_normalized_token with the classifier's token options; for any
+ * other token type " " is added instead. char_array_strip_nul_byte runs
+ * first, presumably so that successive calls build one continuous string.
+ */
+inline void language_classifier_normalize_token(char_array *array, char *str, token_t token) {
+    if (array == NULL || str == NULL) return;
+
+    char_array_strip_nul_byte(array);
+    if (is_word_token(token.type)) {
+        add_normalized_token(array, str, token, LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS);
+    } else {
+        char_array_add(array, " ");
+    }
+}
+
+// Append prefix followed by NAMESPACE_SEPARATOR_CHAR; a NULL prefix appends nothing
+static inline void append_prefix(char_array *array, char *prefix) {
+    if (prefix != NULL) {
+        char_array_append(array, prefix);
+        char_array_append(array, NAMESPACE_SEPARATOR_CHAR);
+    }
+}
+
+/*
+ * Count the whole token (token.len bytes at str + token.offset), namespaced
+ * by prefix, as a single feature with weight 1.0.
+ */
+static inline void add_full_token_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
+    if (features == NULL || feature_array == NULL) return;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+
+    char_array_add_len(feature_array, str + token.offset, token.len);
+
+    if (feature_array->n <= 1) return;
+    char *feature = char_array_get_string(feature_array);
+    log_debug("full token feature=%s\n", feature);
+    feature_counts_add(features, feature, 1.0);
+}
+
+/*
+ * Add one feature per window of n consecutive UTF-8 characters of a word
+ * token. Every gram after the first gets a leading "_" and every gram that
+ * stops short of the token's end gets a trailing "_". A token with fewer
+ * than n characters is added whole via add_full_token_feature.
+ */
+static void add_ngram_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token, size_t n) {
+    if (features == NULL || feature_array == NULL) return;
+
+    if (n == 0 || !is_word_token(token.type)) return;
+
+    // Byte lengths of the characters in the current window, oldest first
+    size_t lengths[n];
+    size_t num_chars = 0;
+
+    size_t offset = token.offset;
+    uint8_t *ptr = (uint8_t *)str + offset;
+
+    int32_t ch;
+
+    size_t idx = 0;
+    size_t len = token.len;
+
+    size_t gram_len = 0;
+    size_t gram_offset = 0;
+
+    bool beginning = true;
+
+    log_debug("len = %zu\n", len);
+
+    while (idx < len) {
+        // Decode at most the bytes that remain in this token
+        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
+        if (char_len <= 0 || ch == 0) break;
+
+        size_t char_bytes = (size_t)char_len;
+
+        // Not at min characters yet
+        if (num_chars < n) {
+            lengths[num_chars] = char_bytes;
+            num_chars++;
+            gram_len += char_bytes;
+        }
+
+        // We have a full gram of size n
+        if (num_chars == n) {
+            char_array_clear(feature_array);
+            append_prefix(feature_array, prefix);
+
+            if (beginning) {
+                beginning = false;
+            } else {
+                char_array_append(feature_array, "_");
+
+                // Slide the window: drop the oldest character, take in the new one
+                gram_len -= lengths[0];
+                gram_offset += lengths[0];
+                gram_len += char_bytes;
+
+                for (size_t i = 1; i < n; i++) {
+                    lengths[i - 1] = lengths[i];
+                }
+                lengths[n - 1] = char_bytes;
+            }
+
+            char_array_append_len(feature_array, str + offset + gram_offset, gram_len);
+
+            if (idx + char_bytes < len) {
+                char_array_append(feature_array, "_");
+            }
+
+            char_array_terminate(feature_array);
+
+            // No "continue" here: the cursor below has to advance on every pass
+            if (feature_array->n > 1) {
+                char *feature = char_array_get_string(feature_array);
+                log_debug("feature=%s\n", feature);
+                feature_counts_add(features, feature, 1.0);
+            }
+        }
+
+        idx += char_bytes;
+        ptr += char_bytes;
+    }
+
+    if (num_chars < n) {
+        add_full_token_feature(features, prefix, feature_array, str, token);
+    }
+}
+
+/*
+ * Add one feature for a dictionary phrase: the bytes of its tokens copied
+ * from str, joined by " " except after an ideographic token.
+ */
+static void add_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_array *tokens) {
+    if (features == NULL || feature_array == NULL || tokens == NULL || tokens->n == 0) return;
+
+    // Skip empty phrases and phrases that would index past the token array
+    size_t phrase_start = (size_t)phrase.start;
+    size_t phrase_end = phrase_start + (size_t)phrase.len;
+    if (phrase.len == 0 || phrase_end > tokens->n) return;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+
+    for (size_t i = phrase_start; i < phrase_end; i++) {
+        token_t token = tokens->a[i];
+        char_array_append_len(feature_array, str + token.offset, token.len);
+        if (i + 1 < phrase_end && !is_ideographic(token.type)) {
+            char_array_append(feature_array, " ");
+        }
+    }
+
+    char_array_terminate(feature_array);
+    if (feature_array->n <= 1) return;
+    char *feature = char_array_get_string(feature_array);
+    feature_counts_add(features, feature, 1.0);
+}
+
+/*
+ * Add a "pfx=" feature holding the first phrase.len bytes of token. Nothing
+ * is added when the match is empty or covers the whole token.
+ */
+static void add_prefix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
+    if (features == NULL || feature_array == NULL || phrase.len == 0 || phrase.len >= token.len) return;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    char_array_append(feature_array, "pfx=");
+    char_array_add_len(feature_array, str + token.offset, phrase.len);
+
+    if (feature_array->n <= 1) return;
+    char *feature = char_array_get_string(feature_array);
+    feature_counts_add(features, feature, 1.0);
+}
+
+/*
+ * Add a "sfx=" feature holding the last phrase.len bytes of token. Nothing
+ * is added when the match is empty or covers the whole token.
+ */
+static void add_suffix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
+    if (features == NULL || feature_array == NULL || phrase.len == 0 || phrase.len >= token.len) return;
+
+    char_array_clear(feature_array);
+    append_prefix(feature_array, prefix);
+    char_array_append(feature_array, "sfx=");
+    char_array_add_len(feature_array, str + token.offset + token.len - phrase.len, phrase.len);
+
+    if (feature_array->n <= 1) return;
+    char *feature = char_array_get_string(feature_array);
+    feature_counts_add(features, feature, 1.0);
+}
+
+/*
+ * Add every per-token feature for one word token: dictionary prefix and
+ * suffix matches, then quadgrams, or the whole token when it is ideographic.
+ */
+static void add_token_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
+    // Non-words don't convey any language information
+    // TODO: ordinal number suffixes may be worth investigating
+    if (!is_word_token(token.type)) {
+        return;
+    }
+
+    phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, NULL);
+    if (prefix_phrase.len > 0 && prefix_phrase.len < token.len) {
+        add_prefix_phrase_feature(features, prefix, feature_array, str, prefix_phrase, token);
+    }
+
+    phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, NULL);
+    if (suffix_phrase.len > 0 && suffix_phrase.len < token.len) {
+        add_suffix_phrase_feature(features, prefix, feature_array, str, suffix_phrase, token);
+    }
+
+    if (!is_ideographic(token.type)) {
+        // Add quadgram features
+        add_ngram_features(features, prefix, feature_array, str, token, QUADGRAMS);
+    } else {
+        // For ideographic scripts, use single ideograms
+        add_full_token_feature(features, prefix, feature_array, str, token);
+    }
+}
+
+/*
+ * Build the feature counts used to classify the language(s) of str.
+ *
+ * str is walked one script run at a time (get_string_script):
+ *   - a run whose script maps to more than one language is tokenized,
+ *     normalized and re-tokenized; address-dictionary phrases become phrase
+ *     features and all other tokens go through add_token_features;
+ *   - a run in a known, non-common script with a single language adds one
+ *     "sc=" feature carrying the numeric script id (no country prefix).
+ *
+ * country, when non-NULL, namespaces the token and phrase features. tokens
+ * and feature_array are caller-supplied scratch buffers and are overwritten.
+ * Returns a hash created with kh_init (possibly empty), or NULL when an
+ * argument is NULL, str is empty, or an allocation fails.
+ */
+khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array) {
+    if (str == NULL || tokens == NULL || feature_array == NULL) return NULL;
+
+    char *prefix = country;
+
+    size_t consumed = 0;
+    size_t len = strlen(str);
+    if (len == 0) return NULL;
+
+    char_array *normalized = char_array_new_size(len);
+    if (normalized == NULL) {
+        return NULL;
+    }
+
+    khash_t(str_double) *features = kh_init(str_double);
+    if (features == NULL) {
+        char_array_destroy(normalized);
+        return NULL;
+    }
+
+    while (consumed < len) {
+        string_script_t str_script = get_string_script(str, len - consumed);
+        log_debug("str=%s, len=%zu, consumed=%zu, script_len=%zu\n", str, strlen(str), consumed, str_script.len);
+
+        // A zero-length run would never advance the cursor below; stop instead of spinning
+        if (str_script.len == 0) break;
+
+        script_languages_t script_langs = get_script_languages(str_script.script);
+
+        if (script_langs.num_languages > 1) {
+            token_array_clear(tokens);
+            bool keep_whitespace = true;
+            tokenize_add_tokens(tokens, (const char *)str, str_script.len, keep_whitespace);
+
+            size_t num_tokens = tokens->n;
+            token_t token;
+
+            char_array_clear(normalized);
+
+            for (size_t i = 0; i < num_tokens; i++) {
+                token = tokens->a[i];
+                language_classifier_normalize_token(normalized, str, token);
+            }
+            char_array_terminate(normalized);
+
+            char *normalized_str = char_array_get_string(normalized);
+            token_array_clear(tokens);
+            keep_whitespace = false;
+            tokenize_add_tokens(tokens, (const char *)normalized_str, strlen(normalized_str), keep_whitespace);
+
+            // Search address dictionaries for any language
+            phrase_array *phrases = search_address_dictionaries_tokens(normalized_str, tokens, NULL);
+            log_debug("normalized_str=%s\n", normalized_str);
+
+            // Index of the first token not yet turned into features
+            size_t start = 0;
+
+            if (phrases != NULL) {
+                for (size_t i = 0; i < phrases->n; i++) {
+                    phrase_t phrase = phrases->a[i];
+                    log_debug("phrase (%d, %d)\n", phrase.start, phrase.len);
+
+                    // Plain tokens sitting between the previous phrase and this one
+                    size_t end = phrase.start;
+                    for (size_t j = start; j < end; j++) {
+                        token = tokens->a[j];
+                        log_debug("j=%zu, token.offset=%zu, token.len=%zu\n", j, token.offset, token.len);
+                        add_token_features(features, prefix, feature_array, normalized_str, token);
+                    }
+
+                    log_debug("done with start tokens\n");
+                    add_phrase_feature(features, prefix, feature_array, normalized_str, phrase, tokens);
+                    start = phrase.start + phrase.len;
+                }
+
+                phrase_array_destroy(phrases);
+            }
+
+            for (size_t j = start; j < tokens->n; j++) {
+                log_debug("end token: %zu\n", j);
+                token = tokens->a[j];
+                add_token_features(features, prefix, feature_array, normalized_str, token);
+            }
+
+        } else if (str_script.script != SCRIPT_UNKNOWN && str_script.script != SCRIPT_COMMON && script_langs.num_languages > 0) {
+            char_array_clear(feature_array);
+            char_array_append(feature_array, "sc=");
+            char_array_cat_printf(feature_array, "%d", str_script.script);
+            char *feature = char_array_get_string(feature_array);
+            log_debug("script feature=%s\n", feature);
+
+            if (feature != NULL) {
+                feature_counts_add(features, feature, 1.0);
+            }
+        }
+
+        consumed += str_script.len;
+        str += str_script.len;
+    }
+
+    char_array_destroy(normalized);
+
+    return features;
+}
diff --git a/src/language_features.h b/src/language_features.h
new file mode 100644
index 00000000..b12bf3af
--- /dev/null
+++ b/src/language_features.h
@@ -0,0 +1,16 @@
+#ifndef LANGUAGE_FEATURES_H
+#define LANGUAGE_FEATURES_H
+
+#include <stdlib.h>
+
+#include "collections.h"
+#include "string_utils.h"
+#include "tokens.h"
+
+
+char *language_classifier_normalize_string(char *str);
+void language_classifier_normalize_token(char_array *array, char *str, token_t token);
+
+khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array);
+
+#endif
\ No newline at end of file