[language_classification] Removing the per-country classifier; the text-based classifier alone now achieves close to 99% accuracy
This commit is contained in:
@@ -13,7 +13,6 @@
|
|||||||
#define MIN_PROB (0.05 - DBL_EPSILON)
|
#define MIN_PROB (0.05 - DBL_EPSILON)
|
||||||
|
|
||||||
static language_classifier_t *language_classifier = NULL;
|
static language_classifier_t *language_classifier = NULL;
|
||||||
static language_classifier_t *language_classifier_country = NULL;
|
|
||||||
|
|
||||||
void language_classifier_destroy(language_classifier_t *self) {
|
void language_classifier_destroy(language_classifier_t *self) {
|
||||||
if (self == NULL) return;
|
if (self == NULL) return;
|
||||||
@@ -42,19 +41,22 @@ language_classifier_t *get_language_classifier(void) {
|
|||||||
return language_classifier;
|
return language_classifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
language_classifier_t *get_language_classifier_country(void) {
|
void language_classifier_response_destroy(language_classifier_response_t *self) {
|
||||||
return language_classifier_country;
|
if (self == NULL) return;
|
||||||
}
|
if (self->languages != NULL) {
|
||||||
|
free(self->languages);
|
||||||
language_classifier_response_t *classify_languages(char *address, char *country) {
|
|
||||||
language_classifier_t *classifier = NULL;
|
|
||||||
|
|
||||||
if (country == NULL) {
|
|
||||||
classifier = get_language_classifier();
|
|
||||||
} else {
|
|
||||||
classifier = get_language_classifier_country();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (self->probs) {
|
||||||
|
free(self->probs);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(self);
|
||||||
|
}
|
||||||
|
|
||||||
|
language_classifier_response_t *classify_languages(char *address) {
|
||||||
|
language_classifier_t *classifier = get_language_classifier();
|
||||||
|
|
||||||
if (classifier == NULL) {
|
if (classifier == NULL) {
|
||||||
log_error("classifier NULL\n");
|
log_error("classifier NULL\n");
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -65,7 +67,23 @@ language_classifier_response_t *classify_languages(char *address, char *country)
|
|||||||
token_array *tokens = token_array_new();
|
token_array *tokens = token_array_new();
|
||||||
char_array *feature_array = char_array_new();
|
char_array *feature_array = char_array_new();
|
||||||
|
|
||||||
khash_t(str_double) *feature_counts = extract_language_features(normalized, country, tokens, feature_array);
|
khash_t(str_double) *feature_counts = extract_language_features(normalized, NULL, tokens, feature_array);
|
||||||
|
if (feature_counts == NULL || kh_size(feature_counts) == 0) {
|
||||||
|
token_array_destroy(tokens);
|
||||||
|
char_array_destroy(feature_array);
|
||||||
|
if (feature_counts != NULL) {
|
||||||
|
kh_destroy(str_double, feature_counts);
|
||||||
|
}
|
||||||
|
free(normalized);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *f;
|
||||||
|
double c;
|
||||||
|
kh_foreach(feature_counts, f, c, {
|
||||||
|
printf("%s (%f)\n", f, c);
|
||||||
|
})
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
sparse_matrix_t *x = feature_vector(classifier->features, feature_counts);
|
sparse_matrix_t *x = feature_vector(classifier->features, feature_counts);
|
||||||
|
|
||||||
@@ -112,7 +130,8 @@ language_classifier_response_t *classify_languages(char *address, char *country)
|
|||||||
response->probs = probs;
|
response->probs = probs;
|
||||||
}
|
}
|
||||||
|
|
||||||
exit_tokens_created:
|
sparse_matrix_destroy(x);
|
||||||
|
matrix_destroy(p_y);
|
||||||
token_array_destroy(tokens);
|
token_array_destroy(tokens);
|
||||||
char_array_destroy(feature_array);
|
char_array_destroy(feature_array);
|
||||||
const char *key;
|
const char *key;
|
||||||
@@ -120,6 +139,7 @@ exit_tokens_created:
|
|||||||
free((char *)key);
|
free((char *)key);
|
||||||
})
|
})
|
||||||
kh_destroy(str_double, feature_counts);
|
kh_destroy(str_double, feature_counts);
|
||||||
|
free(normalized);
|
||||||
return response;
|
return response;
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -235,7 +255,7 @@ bool language_classifier_save(language_classifier_t *self, char *path) {
|
|||||||
// Module setup/teardown
|
// Module setup/teardown
|
||||||
|
|
||||||
bool language_classifier_module_setup(char *dir) {
|
bool language_classifier_module_setup(char *dir) {
|
||||||
if (language_classifier != NULL && language_classifier_country != NULL) {
|
if (language_classifier != NULL) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -245,7 +265,7 @@ bool language_classifier_module_setup(char *dir) {
|
|||||||
|
|
||||||
char *classifier_path;
|
char *classifier_path;
|
||||||
|
|
||||||
char_array *path = char_array_new_size(strlen(dir) + PATH_SEPARATOR_LEN + strlen(LANGUAGE_CLASSIFIER_COUNTRY_FILENAME));
|
char_array *path = char_array_new_size(strlen(dir) + PATH_SEPARATOR_LEN + strlen(LANGUAGE_CLASSIFIER_FILENAME));
|
||||||
if (language_classifier == NULL) {
|
if (language_classifier == NULL) {
|
||||||
char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, LANGUAGE_CLASSIFIER_FILENAME);
|
char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, LANGUAGE_CLASSIFIER_FILENAME);
|
||||||
classifier_path = char_array_get_string(path);
|
classifier_path = char_array_get_string(path);
|
||||||
@@ -254,15 +274,6 @@ bool language_classifier_module_setup(char *dir) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (language_classifier_country == NULL) {
|
|
||||||
char_array_clear(path);
|
|
||||||
char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, LANGUAGE_CLASSIFIER_COUNTRY_FILENAME);
|
|
||||||
classifier_path = char_array_get_string(path);
|
|
||||||
|
|
||||||
language_classifier_country = language_classifier_load(classifier_path);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_destroy(path);
|
char_array_destroy(path);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -271,9 +282,5 @@ void language_classifier_module_teardown(void) {
|
|||||||
if (language_classifier != NULL) {
|
if (language_classifier != NULL) {
|
||||||
language_classifier_destroy(language_classifier);
|
language_classifier_destroy(language_classifier);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (language_classifier_country != NULL) {
|
|
||||||
language_classifier_destroy(language_classifier_country);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,8 @@ language_classifier_t *language_classifier_new(void);
|
|||||||
language_classifier_t *get_language_classifier(void);
|
language_classifier_t *get_language_classifier(void);
|
||||||
language_classifier_t *get_language_classifier_country(void);
|
language_classifier_t *get_language_classifier_country(void);
|
||||||
|
|
||||||
language_classifier_response_t *classify_languages(char *address, char *country);
|
language_classifier_response_t *classify_languages(char *address);
|
||||||
|
void language_classifier_response_destroy(language_classifier_response_t *self);
|
||||||
|
|
||||||
void language_classifier_destroy(language_classifier_t *self);
|
void language_classifier_destroy(language_classifier_t *self);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user