[language_classifier] Features for address language classification: quadgrams for most languages, unigrams for ideographic characters, and script features for single-script languages like Thai, Hebrew, etc.

This commit is contained in:
Al
2016-01-09 03:42:57 -05:00
parent 29930fa7b6
commit b13462f8ef
2 changed files with 341 additions and 0 deletions

325
src/language_features.c Normal file
View File

@@ -0,0 +1,325 @@
#include "language_features.h"
#include "language_classifier.h"
#include "address_dictionary.h"
#include "features.h"
#include "normalize.h"
#include "scanner.h"
#include "unicode_scripts.h"
// N-gram window sizes, in Unicode characters
#define UNIGRAMS 1
#define BIGRAMS 2
#define QUADGRAMS 4
#define OCTAGRAMS 8
// Parenthesized so the ORed flag sets cannot be split apart by a
// higher-precedence operator at the expansion site
#define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
#define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)
// Normalize a raw input string for language classification: lowercases it
// and transliterates Latin-script characters to ASCII.
// Caller owns the returned string (allocated by normalize_string_latin).
inline char *language_classifier_normalize_string(char *str) {
    size_t input_len = strlen(str);
    return normalize_string_latin(str, input_len, LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS);
}
// Append the normalized form of one token to array. Word tokens get the
// classifier's token-level normalizations; any non-word token (punctuation,
// whitespace, etc.) is replaced by a single space.
inline void language_classifier_normalize_token(char_array *array, char *str, token_t token) {
    char_array_strip_nul_byte(array);
    if (!is_word_token(token.type)) {
        char_array_add(array, " ");
        return;
    }
    add_normalized_token(array, str, token, LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS);
}
// Prepend an optional feature namespace (e.g. a country code) followed by
// the namespace separator. A NULL prefix is a no-op.
static inline void append_prefix(char_array *array, char *prefix) {
    if (prefix == NULL) return;
    char_array_append(array, prefix);
    char_array_append(array, NAMESPACE_SEPARATOR_CHAR);
}
// Count the entire token as a single feature. Used for ideographic scripts
// (one feature per ideogram token) and as the fallback for tokens shorter
// than the n-gram window.
static inline void add_full_token_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
    if (features == NULL || feature_array == NULL) return;

    char_array_clear(feature_array);
    append_prefix(feature_array, prefix);
    char_array_add_len(feature_array, str + token.offset, token.len);

    // A feature containing only the NUL terminator carries no information
    if (feature_array->n <= 1) return;

    char *feature_string = char_array_get_string(feature_array);
    log_debug("full token feature=%s\n", feature_string);
    feature_counts_add(features, feature_string, 1.0);
}
/* Add character n-gram features of size n for one word token.

   Slides a window of n UTF-8 characters across the token, emitting one
   feature per window position. Windows after the first are prefixed with
   "_" and non-final windows get a trailing "_", so a gram's position within
   the token is encoded in the feature string. Tokens shorter than n
   characters fall back to a single full-token feature.

   Fixes relative to the original:
   - utf8proc_iterate is given only the bytes remaining in the token
     (len - idx) instead of the full token length, so the decoder cannot
     read past the end of the token.
   - The empty-feature check no longer uses `continue`, which previously
     skipped the idx/ptr increments and could spin forever.
   - Removed unused locals (feature_namespace, consumed). */
static void add_ngram_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token, size_t n) {
    if (features == NULL || feature_array == NULL) return;
    if (n == 0 || !is_word_token(token.type)) return;

    size_t lengths[n];          // byte length of each character in the current window
    size_t num_chars = 0;
    size_t offset = token.offset;
    uint8_t *ptr = (uint8_t *)str + offset;
    int32_t ch;
    size_t idx = 0;
    size_t len = token.len;
    size_t gram_len = 0;        // total byte length of the current window
    size_t gram_offset = 0;     // byte offset of the window within the token
    bool beginning = true;

    log_debug("len = %zu\n", len);

    while (idx < len) {
        // Decode one character; only the bytes remaining in this token may be read
        ssize_t char_len = utf8proc_iterate(ptr, len - idx, &ch);
        if (char_len <= 0 || ch == 0) break;

        // Still filling the initial window of n characters
        if (num_chars < n) {
            lengths[num_chars] = (size_t)char_len;
            num_chars++;
            gram_len += char_len;
        }

        // Window is full: emit a feature for this position
        if (num_chars == n) {
            char_array_clear(feature_array);
            append_prefix(feature_array, prefix);

            if (beginning) {
                beginning = false;
            } else {
                // Slide the window forward by one character
                char_array_append(feature_array, "_");
                gram_len -= lengths[0];
                gram_offset += lengths[0];
                gram_len += char_len;
                for (size_t i = 1; i < n; i++) {
                    lengths[i - 1] = lengths[i];
                }
                lengths[n - 1] = (size_t)char_len;
            }

            char_array_append_len(feature_array, str + offset + gram_offset, gram_len);
            if (idx + char_len < len) {
                // Mark non-final grams so position is part of the feature
                char_array_append(feature_array, "_");
            }
            char_array_terminate(feature_array);

            if (feature_array->n > 1) {
                char *feature = char_array_get_string(feature_array);
                log_debug("feature=%s\n", feature);
                feature_counts_add(features, feature, 1.0);
            }
        }

        idx += char_len;
        ptr += char_len;
    }

    // Token shorter than n characters: use the whole token as one feature
    if (num_chars < n) {
        add_full_token_feature(features, prefix, feature_array, str, token);
    }
}
// Count a dictionary phrase (a run of tokens) as a single feature.
// Tokens are joined with spaces, except ideographic tokens, which are
// concatenated directly.
static void add_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_array *tokens) {
    if (features == NULL || feature_array == NULL || tokens == NULL || tokens->n == 0) return;

    char_array_clear(feature_array);
    append_prefix(feature_array, prefix);

    size_t phrase_end = phrase.start + phrase.len;
    for (size_t idx = phrase.start; idx < phrase_end; idx++) {
        token_t tok = tokens->a[idx];
        char_array_append_len(feature_array, str + tok.offset, tok.len);
        bool last_token = (idx == phrase_end - 1);
        if (!last_token && !is_ideographic(tok.type)) {
            char_array_append(feature_array, " ");
        }
    }
    char_array_terminate(feature_array);

    // Skip features that contain only the NUL terminator
    if (feature_array->n <= 1) return;
    char *feature_string = char_array_get_string(feature_array);
    feature_counts_add(features, feature_string, 1.0);
}
// Count a dictionary match at the start of a token as a "pfx=" feature,
// e.g. a known prefix inside a compound word. The match must be a proper
// prefix (shorter than the token).
static void add_prefix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
    bool invalid = (features == NULL || feature_array == NULL || phrase.len == 0 || phrase.len >= token.len);
    if (invalid) return;

    char_array_clear(feature_array);
    append_prefix(feature_array, prefix);
    char_array_append(feature_array, "pfx=");
    char_array_add_len(feature_array, str + token.offset, phrase.len);

    if (feature_array->n > 1) {
        feature_counts_add(features, char_array_get_string(feature_array), 1.0);
    }
}
// Count a dictionary match at the end of a token as a "sfx=" feature,
// e.g. a known suffix inside a compound word. The match must be a proper
// suffix (shorter than the token).
static void add_suffix_phrase_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, phrase_t phrase, token_t token) {
    bool invalid = (features == NULL || feature_array == NULL || phrase.len == 0 || phrase.len >= token.len);
    if (invalid) return;

    char_array_clear(feature_array);
    append_prefix(feature_array, prefix);
    char_array_append(feature_array, "sfx=");
    char_array_add_len(feature_array, str + token.offset + token.len - phrase.len, phrase.len);

    if (feature_array->n > 1) {
        feature_counts_add(features, char_array_get_string(feature_array), 1.0);
    }
}
// Add all features for a single token: dictionary prefix/suffix matches,
// then quadgram features — or a single full-token feature for ideographic
// scripts, where each token is one ideogram.
static void add_token_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
    // Non-words don't convey any language information
    // TODO: ordinal number suffixes may be worth investigating
    if (!is_word_token(token.type)) return;

    char *token_start = str + token.offset;

    phrase_t prefix_match = search_address_dictionaries_prefix(token_start, token.len, NULL);
    if (prefix_match.len > 0 && prefix_match.len < token.len) {
        add_prefix_phrase_feature(features, prefix, feature_array, str, prefix_match, token);
    }

    phrase_t suffix_match = search_address_dictionaries_suffix(token_start, token.len, NULL);
    if (suffix_match.len > 0 && suffix_match.len < token.len) {
        add_suffix_phrase_feature(features, prefix, feature_array, str, suffix_match, token);
    }

    if (is_ideographic(token.type)) {
        // For ideographic scripts, use single ideograms
        add_full_token_feature(features, prefix, feature_array, str, token);
    } else {
        // Add quadgram features
        add_ngram_features(features, prefix, feature_array, str, token, QUADGRAMS);
    }
}
/* Extract language-classification features for an address string.

   The string is processed one contiguous-script run at a time:
   - For scripts shared by multiple languages, the run is tokenized,
     normalized, and scanned for address-dictionary phrases; phrase, token,
     and n-gram features are added (see add_token_features / add_phrase_feature).
   - For scripts used by essentially one language (e.g. Thai, Hebrew),
     a single "sc=<script id>" feature is added — the script itself
     identifies the language.

   @param str            NUL-terminated input string (not modified; the local
                         pointer is advanced per script run)
   @param country        optional country-code namespace for features, may be NULL
   @param tokens         scratch token array, cleared and reused internally
   @param feature_array  scratch char_array for building feature strings
   @return newly-allocated feature-count hash (caller frees), or NULL on
           empty/NULL input or allocation failure.

   Fix relative to the original: removed unused locals (prev_token,
   prev_was_phrase, and a `char *phrase` that was shadowed by the inner
   `phrase_t phrase`). */
khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array) {
    if (str == NULL || tokens == NULL || feature_array == NULL) return NULL;

    char *prefix = country;  // optional per-country feature namespace
    size_t consumed = 0;
    size_t len = strlen(str);
    if (len == 0) return NULL;

    char_array *normalized = char_array_new_size(len);
    if (normalized == NULL) {
        return NULL;
    }

    khash_t(str_double) *features = kh_init(str_double);
    if (features == NULL) {
        char_array_destroy(normalized);
        return NULL;
    }

    // Process the string one contiguous-script run at a time
    while (consumed < len) {
        string_script_t str_script = get_string_script(str, len - consumed);
        log_debug("str=%s, len=%zu, consumed=%zu, script_len=%zu\n", str, strlen(str), consumed, str_script.len);
        script_languages_t script_langs = get_script_languages(str_script.script);

        if (script_langs.num_languages > 1) {
            // Ambiguous script: tokenize with whitespace preserved so the
            // normalized string keeps token boundaries intact
            token_array_clear(tokens);
            bool keep_whitespace = true;
            tokenize_add_tokens(tokens, (const char *)str, str_script.len, keep_whitespace);
            size_t num_tokens = tokens->n;
            token_t token;
            char_array_clear(normalized);
            for (size_t i = 0; i < num_tokens; i++) {
                token = tokens->a[i];
                language_classifier_normalize_token(normalized, str, token);
            }
            char_array_terminate(normalized);
            char *normalized_str = char_array_get_string(normalized);

            // Re-tokenize the normalized text, this time dropping whitespace
            token_array_clear(tokens);
            keep_whitespace = false;
            tokenize_add_tokens(tokens, (const char *)normalized_str, strlen(normalized_str), keep_whitespace);

            // Search address dictionaries for phrases in any language
            phrase_array *phrases = search_address_dictionaries_tokens(normalized_str, tokens, NULL);
            log_debug("normalized_str=%s\n", normalized_str);

            size_t start = 0;
            size_t end = 0;
            size_t i, j;
            if (phrases != NULL) {
                for (i = 0; i < phrases->n; i++) {
                    phrase_t phrase = phrases->a[i];
                    log_debug("phrase (%d, %d)\n", phrase.start, phrase.len);
                    // Tokens between the previous phrase and this one get
                    // ordinary per-token features
                    end = phrase.start;
                    for (j = start; j < end; j++) {
                        token = tokens->a[j];
                        log_debug("j=%zu, token.offset=%zu, token.len=%zu\n", j, token.offset, token.len);
                        add_token_features(features, prefix, feature_array, normalized_str, token);
                    }
                    log_debug("done with start tokens\n");
                    add_phrase_feature(features, prefix, feature_array, normalized_str, phrase, tokens);
                    start = phrase.start + phrase.len;
                }
                phrase_array_destroy(phrases);
            }
            // Remaining tokens after the last phrase
            for (j = start; j < tokens->n; j++) {
                log_debug("end token: %zu\n", j);
                token = tokens->a[j];
                add_token_features(features, prefix, feature_array, normalized_str, token);
            }
        } else if (str_script.script != SCRIPT_UNKNOWN && str_script.script != SCRIPT_COMMON && script_langs.num_languages > 0) {
            // Single-language script: the script alone identifies the language
            char_array_clear(feature_array);
            char_array_append(feature_array, "sc=");
            char_array_cat_printf(feature_array, "%d", str_script.script);
            char *feature = char_array_get_string(feature_array);
            log_debug("script feature=%s\n", feature);
            if (feature != NULL) {
                feature_counts_add(features, feature, 1.0);
            }
        }
        consumed += str_script.len;
        str += str_script.len;
    }
    char_array_destroy(normalized);
    return features;
}

16
src/language_features.h Normal file
View File

@@ -0,0 +1,16 @@
#ifndef LANGUAGE_FEATURES_H
#define LANGUAGE_FEATURES_H
#include <stdlib.h>
#include "collections.h"
#include "string_utils.h"
#include "tokens.h"
/* Normalize a raw string for language classification (lowercase +
   Latin-to-ASCII transliteration). Caller owns the returned string. */
char *language_classifier_normalize_string(char *str);
/* Append the normalized form of one token from str to array; non-word
   tokens are replaced by a single space. */
void language_classifier_normalize_token(char_array *array, char *str, token_t token);
/* Extract language-classification features (token n-grams, dictionary
   phrases, script features) from str. country is an optional feature
   namespace and may be NULL; tokens and feature_array are reusable scratch
   buffers. Returns a newly-allocated feature-count hash (caller frees),
   or NULL on empty input or allocation failure. */
khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array);
#endif