[language_classifier] Features for address language classification: quadgrams for most languages, unigrams for ideographic characters, and script for single-script languages like Thai, Hebrew, etc.

This commit is contained in:
Al
2016-01-09 03:42:57 -05:00
parent 29930fa7b6
commit b13462f8ef
2 changed files with 341 additions and 0 deletions

16
src/language_features.h Normal file
View File

@@ -0,0 +1,16 @@
/*
 * language_features.h
 *
 * Public declarations for the language classifier's string/token
 * normalization and feature extraction routines. Only prototypes live
 * here; the behavior notes below are limited to what the signatures show,
 * and anything else is marked for confirmation against the implementation.
 */
#ifndef LANGUAGE_FEATURES_H
#define LANGUAGE_FEATURES_H
#include <stdlib.h>
#include "collections.h"
#include "string_utils.h"
#include "tokens.h"
/*
 * Normalize a whole input string for the language classifier.
 *
 * str: NUL-terminated input string (assumed; not const-qualified here).
 * Returns a char pointer to the normalized text.
 * NOTE(review): this header does not show whether the result is newly
 * allocated (caller frees), aliases str, or may be NULL on failure --
 * confirm in the implementation before relying on any of these.
 */
char *language_classifier_normalize_string(char *str);
/*
 * Normalize a single token for the language classifier.
 *
 * array: char_array passed by pointer; presumably the destination that
 *        receives the normalized token text -- TODO confirm.
 * str:   string associated with the token; presumably the source text the
 *        token refers into -- TODO confirm.
 * token: the token to normalize, passed by value.
 * Returns nothing; any result is delivered through the arguments.
 */
void language_classifier_normalize_token(char_array *array, char *str, token_t token);
/*
 * Extract language classification features for an input string.
 *
 * str:           input string to extract features from.
 * country:       country string for the input; how it is used (and whether
 *                NULL is accepted) is not visible here -- TODO confirm.
 * tokens:        token_array passed by pointer; presumably scratch or
 *                output storage for tokenization -- TODO confirm.
 * feature_array: char_array passed by pointer; presumably a reusable
 *                buffer for building feature strings -- TODO confirm.
 * Returns a pointer to a khash str_double table (presumably feature name
 * mapped to a double value).
 * NOTE(review): ownership of the returned table and of its keys, and the
 * failure return value, must be confirmed in the implementation.
 */
khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array);
#endif