diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
new file mode 100644
index 00000000..1bd43220
--- /dev/null
+++ b/src/soft_tfidf.c
@@ -0,0 +1,170 @@
+#include "soft_tfidf.h"
+#include "float_utils.h"
+#include "string_similarity.h"
+#include "string_utils.h"
+
+static const soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = { // read-only; copied out by soft_tfidf_default_options()
+    .jaro_winkler_min = 0.9,              // token pairs with Jaro-Winkler similarity >= this count as matches
+    .damerau_levenshtein_max = 1,         // ...as do pairs whose Damerau-Levenshtein distance is <= this
+    .damerau_levenshtein_min_length = 4,  // the edit distance test is only tried at or above this length
+    .use_abbreviations = true             // also accept pairs flagged as possible abbreviations
+};
+
+/* Hands back a by-value copy of the built-in default options. */
+soft_tfidf_options_t soft_tfidf_default_options(void) {
+    soft_tfidf_options_t defaults = DEFAULT_SOFT_TFIDF_OPTIONS;
+    return defaults;
+}
+
+/* Frees an array of code point arrays built by soft_tfidf_decode_tokens(); NULL is a no-op. */
+static void soft_tfidf_free_tokens(uint32_array **tokens_unicode, size_t num_tokens) {
+    if (tokens_unicode == NULL) return;
+    for (size_t i = 0; i < num_tokens; i++) {
+        if (tokens_unicode[i] != NULL) {
+            uint32_array_destroy(tokens_unicode[i]);
+        }
+    }
+    free(tokens_unicode);
+}
+
+/* Converts every token with unicode_codepoints(); returns NULL if allocation or a conversion fails. */
+static uint32_array **soft_tfidf_decode_tokens(size_t num_tokens, char **tokens) {
+    uint32_array **tokens_unicode = calloc(num_tokens, sizeof(uint32_array *));
+    if (tokens_unicode == NULL) return NULL;
+
+    for (size_t i = 0; i < num_tokens; i++) {
+        tokens_unicode[i] = unicode_codepoints(tokens[i]);
+        if (tokens_unicode[i] == NULL) {
+            // calloc zeroed the unfilled slots, so the partial array can be freed safely.
+            soft_tfidf_free_tokens(tokens_unicode, num_tokens);
+            return NULL;
+        }
+    }
+    return tokens_unicode;
+}
+
+/*
+ * Modified Soft-TFIDF similarity between two token lists.
+ *
+ * The shorter list drives the outer loop. Each of its tokens t1 is compared with
+ * every token t2 of the other list, and jaro_winkler(t1, t2) * score1 * score2 is
+ * added to the result for the first of these tests that passes:
+ *   1. the best Jaro-Winkler similarity is >= options.jaro_winkler_min
+ *   2. the smallest Damerau-Levenshtein distance is <= options.damerau_levenshtein_max
+ *      (only tried when t1 has >= options.damerau_levenshtein_min_length code points)
+ *   3. options.use_abbreviations is set and some t2 is a possible abbreviation match
+ *
+ * token_scores1/token_scores2 hold one weight per token (presumably normalized
+ * TF-IDF weights -- confirm with callers); index i of each pair describes one token.
+ *
+ * Returns the accumulated similarity; 0.0 if any array is NULL or either list is
+ * empty; -1.0 if memory allocation or a code point conversion fails.
+ */
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) {
+    if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
+    if (tokens1 == NULL || tokens2 == NULL) return 0.0;
+    // Nothing can match an empty list. Returning early also avoids calloc(0, ...),
+    // which may return NULL and would otherwise be reported as a failure (-1.0).
+    if (num_tokens1 == 0 || num_tokens2 == 0) return 0.0;
+
+    // Make list 1 the shorter one so every token of the shorter list gets matched.
+    if (num_tokens2 < num_tokens1) {
+        double *tmp_scores = token_scores1;
+        token_scores1 = token_scores2;
+        token_scores2 = tmp_scores;
+
+        char **tmp_tokens = tokens1;
+        tokens1 = tokens2;
+        tokens2 = tmp_tokens;
+
+        size_t tmp_num_tokens = num_tokens1;
+        num_tokens1 = num_tokens2;
+        num_tokens2 = tmp_num_tokens;
+    }
+
+    size_t len1 = num_tokens1;
+    size_t len2 = num_tokens2;
+
+    uint32_array **t1_tokens_unicode = soft_tfidf_decode_tokens(len1, tokens1);
+    if (t1_tokens_unicode == NULL) return -1.0;
+
+    uint32_array **t2_tokens_unicode = soft_tfidf_decode_tokens(len2, tokens2);
+    if (t2_tokens_unicode == NULL) {
+        soft_tfidf_free_tokens(t1_tokens_unicode, len1);
+        return -1.0;
+    }
+
+    double jaro_winkler_min = options.jaro_winkler_min;
+    size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
+    size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length;
+    bool use_abbreviations = options.use_abbreviations;
+
+    double total_sim = 0.0;
+
+    for (size_t i = 0; i < len1; i++) {
+        uint32_array *t1u = t1_tokens_unicode[i];
+        double t1_score = token_scores1[i];
+
+        // The minimum length applies to the token itself (its code point count), not
+        // to the number of tokens in the list, so it is evaluated for each token.
+        bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1u->n >= damerau_levenshtein_min_length;
+
+        double max_sim = 0.0;
+        size_t argmax_sim = 0;
+        size_t min_dist = t1u->n;
+        size_t argmin_dist = 0;
+        double argmin_dist_sim = 0.0;
+        size_t last_abbreviation = 0;
+        double last_abbreviation_sim = 0.0;
+        bool have_abbreviation = false;
+
+        for (size_t j = 0; j < len2; j++) {
+            uint32_array *t2u = t2_tokens_unicode[j];
+            if (unicode_equals(t1u, t2u)) {
+                // Exact match: score it 1.0 and stop searching.
+                max_sim = 1.0;
+                argmax_sim = j;
+                break;
+            }
+
+            double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
+            if (jaro_winkler > max_sim) {
+                max_sim = jaro_winkler;
+                argmax_sim = j;
+            }
+
+            if (use_damerau_levenshtein) {
+                size_t replace_cost = 0;
+                ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost);
+                // Negative results are skipped; after the sign check the cast is safe.
+                if (dist >= 0 && (size_t)dist < min_dist) {
+                    min_dist = (size_t)dist;
+                    argmin_dist = j;
+                    argmin_dist_sim = jaro_winkler;
+                }
+            }
+
+            if (use_abbreviations && possible_abbreviation_unicode(t1u, t2u)) {
+                // Later abbreviation candidates overwrite earlier ones.
+                last_abbreviation = j;
+                last_abbreviation_sim = jaro_winkler;
+                have_abbreviation = true;
+            }
+        }
+
+        // Edit distance and abbreviations only decide WHETHER a pair counts;
+        // the amount added is always based on the pair's Jaro-Winkler similarity.
+        if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
+            total_sim += max_sim * t1_score * token_scores2[argmax_sim];
+        } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
+            total_sim += argmin_dist_sim * t1_score * token_scores2[argmin_dist];
+        } else if (use_abbreviations && have_abbreviation) {
+            total_sim += last_abbreviation_sim * t1_score * token_scores2[last_abbreviation];
+        }
+    }
+
+    soft_tfidf_free_tokens(t1_tokens_unicode, len1);
+    soft_tfidf_free_tokens(t2_tokens_unicode, len2);
+
+    return total_sim;
+}
\ No newline at end of file
diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h
new file mode 100644
index 00000000..7d777fc5
--- /dev/null
+++ b/src/soft_tfidf.h
@@ -0,0 +1,46 @@
+#ifndef SOFT_TFIDF_H
+#define SOFT_TFIDF_H
+
+#include <stdlib.h>
+#include "collections.h"
+#include "libpostal.h"
+
+/*
+This is a variant of Soft-TFIDF as described in:
+
+Cohen, Ravikumar, and Fienberg. A comparison of string distance
+metrics for name-matching tasks. (2003)
+https://www.cs.cmu.edu/~wcohen/postscript/ijcai-ws-2003.pdf
+
+Soft TFIDF is a hybrid similarity function for strings, typically names,
+which combines both global statistics (TF-IDF) and a local similarity
+function (e.g. Jaro-Winkler, which the authors suggest performs best).
+
+Given two strings, s1 and s2, each token t1 in s1 is matched with its most
+similar counterpart t2 in s2 according to the local distance function.
+
+The Soft-TFIDF similarity is then the dot product of the max token
+similarities and the cosine similarity of the TF-IDF vectors for all tokens
+if the max similarity is >= a given threshold theta.
+
+This version is a modified Soft-TFIDF. Jaro-Winkler is used as the secondary
+distance metric. However, two tokens are considered "similar" if any one of
+the following holds:
+
+1. Jaro-Winkler distance >= theta
+2. Damerau-Levenshtein edit distance <= max_edit_distance
+3. Affine gap edit counts indicate a possible abbreviation (# matches == min(len1, len2))
+*/
+
+typedef struct soft_tfidf_options { /* thresholds deciding when two tokens count as "similar" */
+    double jaro_winkler_min;                /* theta: minimum Jaro-Winkler similarity for a match */
+    size_t damerau_levenshtein_max;         /* maximum Damerau-Levenshtein distance for a match; 0 disables the test */
+    size_t damerau_levenshtein_min_length;  /* the edit distance test is only tried at or above this length */
+    bool use_abbreviations;                 /* if true, possible abbreviations also count as matches */
+} soft_tfidf_options_t;
+
+soft_tfidf_options_t soft_tfidf_default_options(void); /* returns a copy of the built-in default option values */
+/* Modified Soft-TFIDF score of two token lists; 0.0 if a score array is NULL, -1.0 if an allocation or code point conversion fails. */
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);
+
+#endif
\ No newline at end of file