[ngrams] adding function to extract an array of ngrams from a string, with optional special prefixes/suffixes for the edges

This commit is contained in:
Al
2016-12-17 01:33:18 -05:00
parent 3c6ed7489c
commit 3ed95a175e
2 changed files with 74 additions and 0 deletions

71
src/ngrams.c Normal file
View File

@@ -0,0 +1,71 @@
#include "ngrams.h"
#include "utf8proc/utf8proc.h"
/**
 * Append the character n-grams of a UTF-8 string to a cstring_array.
 *
 * A window of n code points slides over str one code point at a time and each
 * window position is written to grams as one token. When prefix is true, every
 * gram after the first is preceded by "_". When suffix is true, every gram
 * whose last code point ends before byte offset len is followed by "_".
 *
 * Scanning stops at the first invalid UTF-8 sequence or NUL code point.
 * Nothing is emitted when fewer than n code points are read, or when the n-th
 * code point ends exactly at byte offset len (the input is a single gram).
 *
 * @param grams  destination array; one token is added per gram
 * @param str    UTF-8 input, at least len bytes long
 * @param n      gram size in code points; 0 is rejected
 * @param len    number of bytes of str to scan
 * @param prefix add a leading "_" to grams that do not start the input
 * @param suffix add a trailing "_" to grams that end before byte offset len
 * @return true if at least one gram was added, false otherwise
 */
bool add_ngrams(cstring_array *grams, char *str, size_t n, size_t len, bool prefix, bool suffix) {
    if (n == 0 || !grams || !str) return false;

    // Byte length of each code point in the current window, oldest first.
    // NOTE(review): this is a VLA sized by the caller's n; callers are
    // presumably expected to pass small gram sizes -- confirm.
    size_t lengths[n];
    size_t num_chars = 0;

    uint8_t *ptr = (uint8_t *)str;
    int32_t ch;

    size_t idx = 0;          // byte offset of the code point being decoded
    size_t gram_len = 0;     // byte length of the current window
    size_t gram_offset = 0;  // byte offset in str where the window starts
    size_t num_grams = 0;
    bool beginning = true;   // true until the first gram has been emitted

    while (idx < len) {
        // Bound the decoder by the bytes that remain rather than the total
        // length, so a truncated sequence at the end of the buffer is
        // rejected instead of being decoded past str + len.
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
        if (char_len <= 0 || ch == 0) break;
        size_t char_bytes = (size_t)char_len;

        // Still filling the first window
        if (num_chars < n) {
            lengths[num_chars] = char_bytes;
            num_chars++;
            gram_len += char_bytes;
        }

        // A full window is available. The first window is only emitted when
        // more bytes follow it; later windows are always emitted.
        if (num_chars == n && (num_grams > 0 || idx + char_bytes < len)) {
            cstring_array_start_token(grams);

            if (beginning) {
                beginning = false;
            } else {
                if (prefix) {
                    cstring_array_append_string(grams, "_");
                }
                // Slide the window: drop the oldest code point, take the new one.
                gram_len -= lengths[0];
                gram_offset += lengths[0];
                gram_len += char_bytes;
                for (size_t i = 1; i < n; i++) {
                    lengths[i - 1] = lengths[i];
                }
                lengths[n - 1] = char_bytes;
            }

            cstring_array_append_string_len(grams, str + gram_offset, gram_len);
            if (suffix && idx + char_bytes < len) {
                cstring_array_append_string(grams, "_");
            }
            cstring_array_terminate(grams);
            num_grams++;
        }

        idx += char_bytes;
        ptr += char_bytes;
    }

    return num_grams > 0;
}

3
src/ngrams.h Normal file
View File

@@ -0,0 +1,3 @@
#ifndef NGRAMS_H
#define NGRAMS_H

#include <stdbool.h>
#include <stddef.h>

#include "string_utils.h"

/**
 * Append the character n-grams of a UTF-8 string to a cstring_array.
 *
 * Each gram spans n code points and is added to grams as one token. With
 * prefix set, every gram after the first gets a leading "_"; with suffix set,
 * every gram that ends before byte offset len gets a trailing "_".
 *
 * @param grams  destination array; one token is added per gram
 * @param str    UTF-8 input, at least len bytes long
 * @param n      gram size in code points; 0 yields no grams
 * @param len    number of bytes of str to scan
 * @param prefix add a leading "_" to grams that do not start the input
 * @param suffix add a trailing "_" to grams that end before byte offset len
 * @return true if at least one gram was added, false otherwise
 */
bool add_ngrams(cstring_array *grams, char *str, size_t n, size_t len, bool prefix, bool suffix);

#endif /* NGRAMS_H */