Files
libpostal/src/ngrams.c
2016-12-21 18:09:45 -05:00

72 lines
1.8 KiB
C

#include "ngrams.h"
#include "utf8proc/utf8proc.h"
/*
 * Append the character n-grams of a UTF-8 string to `grams`.
 *
 * A window of `n` decoded characters is slid over the first `len` bytes of
 * `str`, one character at a time, and each window is added to `grams` as its
 * own token.
 *
 *   grams   destination array; one token is started and terminated per gram
 *   n       number of characters per gram
 *   str     UTF-8 input (only `len` bytes are examined)
 *   len     length of `str` in bytes
 *   prefix  if true, every gram except the first is preceded by "_"
 *   suffix  if true, every gram that does not end at the end of the input is
 *           followed by "_"
 *
 * Returns true if at least one gram was added. Nothing is added (and false is
 * returned) when the input contains no more than `n` characters, because the
 * first gram is only emitted while input remains after it. Scanning stops
 * at the first NUL code point or at the first byte sequence that
 * utf8proc_iterate fails to decode; grams emitted before that point are kept.
 */
bool add_ngrams(cstring_array *grams, size_t n, char *str, size_t len, bool prefix, bool suffix) {
    if (!grams || !str || n == 0) return false;

    // Every character occupies at least one byte and the first gram is only
    // emitted when bytes remain after its n-th character, so n >= len can
    // never produce output. Bailing out here also bounds the size of the
    // variable-length array below by the input length.
    if (n >= len) return false;

    // Byte length of each character in the current window, oldest first
    size_t lengths[n];
    size_t num_chars = 0;

    uint8_t *ptr = (uint8_t *)str;
    int32_t ch;

    size_t idx = 0;          // byte offset of the character being decoded
    size_t gram_len = 0;     // byte length of the current window
    size_t gram_offset = 0;  // byte offset of the current window within str
    size_t num_grams = 0;

    while (idx < len) {
        // Pass only the bytes that remain so the decoder cannot look past
        // str + len on a truncated multi-byte sequence.
        ssize_t decoded = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
        if (decoded <= 0 || ch == 0) break;

        size_t char_len = (size_t)decoded;
        bool more_input = idx + char_len < len;

        // Not at min characters yet: keep filling the first window
        if (num_chars < n) {
            lengths[num_chars] = char_len;
            num_chars++;
            gram_len += char_len;
        }

        // We have a full gram of size n
        if (num_chars == n && (num_grams > 0 || more_input)) {
            cstring_array_start_token(grams);

            if (num_grams > 0) {
                // Every gram after the first starts inside the string
                if (prefix) {
                    cstring_array_append_string(grams, "_");
                }

                // Slide the window: drop the oldest character, take the new one
                gram_len -= lengths[0];
                gram_offset += lengths[0];
                gram_len += char_len;
                for (size_t i = 1; i < n; i++) {
                    lengths[i - 1] = lengths[i];
                }
                lengths[n - 1] = char_len;
            }

            cstring_array_append_string_len(grams, str + gram_offset, gram_len);

            // The gram ends before the end of the input
            if (more_input && suffix) {
                cstring_array_append_string(grams, "_");
            }

            cstring_array_terminate(grams);
            num_grams++;
        }

        idx += char_len;
        ptr += char_len;
    }

    return num_grams > 0;
}