[parser] for the min_updates method to work, features that have not yet reached the min_updates threshold also need to be ignored when scoring. That way the model has to perform without those features, and should make more updates if they're relevant

2017-03-08 15:40:12 -05:00
parent a63c182e96
commit 4c03e563e0
1 changed files with 14 additions and 7 deletions
--- a/src/averaged_perceptron_trainer.c
+++ b/src/averaged_perceptron_trainer.c
@@ -316,20 +316,27 @@ uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self

    double_array_zero(scores->a, scores->n);

+    uint64_t *update_counts = self->update_counts->a;
+
    cstring_array_foreach(features, i, feature, {
        if (!averaged_perceptron_trainer_get_feature_id(self, feature, &feature_id, add_if_missing)) {
            continue;
        }

-        weights = averaged_perceptron_trainer_get_class_weights(self, feature_id, add_if_missing);
+        uint64_t update_count = update_counts[feature_id];
+        bool keep_feature = update_count >= self->min_updates;

-        if (weights == NULL) {
-            continue;
+        if (keep_feature) {
+            weights = averaged_perceptron_trainer_get_class_weights(self, feature_id, add_if_missing);
+
+            if (weights == NULL) {
+                continue;
+            }
+
+            kh_foreach(weights, class_id, weight, {
+                scores->a[class_id] += weight.value;
+            })
        }
-
-        kh_foreach(weights, class_id, weight, {
-            scores->a[class_id] += weight.value;
-        })
    })

    int64_t max_score = double_array_argmax(scores->a, scores->n);