[similarity] String similarity measures for Damerau-Levenshtein and Jaro-Winkler distances. Both operate internally on Unicode code points (for lengths, etc.) rather than on byte strings, and the Levenshtein distance uses a single array instead of storing the full matrix of transitions.

This commit is contained in:
Al
2017-10-19 04:51:28 -04:00
parent 245aa226e0
commit bd477976d1
2 changed files with 234 additions and 0 deletions

18
src/string_similarity.h Normal file
View File

@@ -0,0 +1,18 @@
/*
 * string_similarity.h - public declarations for string similarity measures
 * (Damerau-Levenshtein and Jaro / Jaro-Winkler).
 *
 * Only the prototypes live here; everything stated below about behaviour
 * that is not visible in a signature is marked as an assumption to verify
 * against the implementation.
 */
#ifndef STRING_SIMILARITY_H
#define STRING_SIMILARITY_H

#include <stdio.h>
#include <stdlib.h>
/* ssize_t is a POSIX type declared by <sys/types.h>; the ISO C headers above
 * are not required to provide it, so strict modes (e.g. -std=c11) would
 * otherwise reject the prototypes below. */
#include <sys/types.h>

/* Default tuning values for the Jaro-Winkler measure.
 * NOTE(review): presumably these are what jaro_winkler_distance() forwards to
 * jaro_winkler_distance_prefix_threshold() - confirm in the implementation. */
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7

/**
 * Damerau-Levenshtein distance between two NUL-terminated strings.
 *
 * @param s1 first string
 * @param s2 second string
 * @return the distance as a signed size.
 *         NOTE(review): the signed return type suggests negative values
 *         report failure - confirm against the implementation.
 *
 * NOTE(review): unlike the Jaro functions, the parameters are not
 * const-qualified; check whether the implementation writes through them
 * before tightening the signature.
 */
ssize_t damerau_levenshtein_distance(char *s1, char *s2);

/**
 * Damerau-Levenshtein distance with a caller-supplied replacement cost.
 *
 * @param s1 first string
 * @param s2 second string
 * @param replace_cost cost value used by the implementation; presumably the
 *        weight charged for substituting one character for another - TODO
 *        confirm
 * @return the distance as a signed size (see damerau_levenshtein_distance()).
 */
ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost);

/**
 * Jaro measure for two NUL-terminated strings.
 *
 * @param s1 first string (not modified)
 * @param s2 second string (not modified)
 * @return the measure as a double; range and error value are defined by the
 *         implementation - TODO confirm.
 */
double jaro_distance(const char *s1, const char *s2);

/**
 * Jaro-Winkler measure with explicit tuning parameters.
 *
 * @param s1 first string (not modified)
 * @param s2 second string (not modified)
 * @param prefix_scale see DEFAULT_JARO_WINKLER_PREFIX_SCALE; presumably the
 *        weight given to a shared prefix - TODO confirm
 * @param bonus_threshold see DEFAULT_JARO_WINKLER_BONUS_THRESHOLD; presumably
 *        the minimum Jaro score at which the prefix bonus applies - TODO
 *        confirm
 * @return the measure as a double.
 */
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);

/**
 * Jaro-Winkler measure for two NUL-terminated strings.
 *
 * @param s1 first string (not modified)
 * @param s2 second string (not modified)
 * @return the measure as a double.
 */
double jaro_winkler_distance(const char *s1, const char *s2);

#endif