From 6e4f641743c6218bc55a836912646773137c1b30 Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 8 Feb 2017 01:59:39 -0500
Subject: [PATCH] [phrases] adding token_phrase_memberships to trie_search for reuse

---
 src/trie_search.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 src/trie_search.h |  4 ++++
 2 files changed, 47 insertions(+)

diff --git a/src/trie_search.c b/src/trie_search.c
index bb5dc262..62dfd222 100644
--- a/src/trie_search.c
+++ b/src/trie_search.c
@@ -778,6 +778,49 @@ inline phrase_t trie_search_prefixes(trie_t *self, char *word, size_t len) {
     return trie_search_prefixes_from_index_get_prefix_char(self, word, len, ROOT_NODE_ID);
 }
 
+/*
+ * Appends one entry per token to phrase_memberships: the index j of the
+ * phrase in phrases that covers the token, or NULL_PHRASE_MEMBERSHIP when
+ * no phrase covers it. len is the token count; positions after the last
+ * phrase are padded with NULL_PHRASE_MEMBERSHIP up to len.
+ *
+ * phrase_memberships is only pushed to, never cleared here. Phrases are
+ * expected in ascending, non-overlapping order: a phrase that starts before
+ * the previous one ended rewinds i, so the overlapping tokens get a second
+ * entry.
+ *
+ * Returns false if either array is NULL, true otherwise.
+ */
+bool token_phrase_memberships(phrase_array *phrases, int64_array *phrase_memberships, size_t len) {
+    if (phrases == NULL || phrase_memberships == NULL) {
+        return false;
+    }
+
+    int64_t i = 0;
+    for (int64_t j = 0; j < phrases->n; j++) {
+        phrase_t phrase = phrases->a[j];
+
+        // Tokens between the previous phrase and this one belong to no phrase
+        for (; i < phrase.start; i++) {
+            int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP);
+            // int64_t need not be long long, so cast to match %lld
+            log_debug("token i=%lld, null phrase membership\n", (long long)i);
+        }
+
+        for (i = phrase.start; i < phrase.start + phrase.len; i++) {
+            log_debug("token i=%lld, phrase membership=%lld\n", (long long)i, (long long)j);
+            int64_array_push(phrase_memberships, j);
+        }
+    }
+
+    for (; i < len; i++) {
+        log_debug("token i=%lld, null phrase membership\n", (long long)i);
+        int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP);
+    }
+
+    return true;
+}
+
 inline char *cstring_array_get_phrase(cstring_array *str, char_array *phrase_tokens, phrase_t phrase) {
     char_array_clear(phrase_tokens);
 
diff --git a/src/trie_search.h b/src/trie_search.h
index 7480ee49..df1817e7 100644
--- a/src/trie_search.h
+++ b/src/trie_search.h
@@ -26,6 +26,8 @@ typedef struct phrase {
 VECTOR_INIT(phrase_array, phrase_t)
 
 #define NULL_PHRASE (phrase_t){0, 0, 0}
+// Membership value for tokens that are not part of any phrase
+#define NULL_PHRASE_MEMBERSHIP (-1)
 
 phrase_array *trie_search(trie_t *self, char *text);
 bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases);
@@ -40,6 +42,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u
 phrase_t trie_search_prefixes_from_index_get_prefix_char(trie_t *self, char *word, size_t len, uint32_t start_node_id);
 phrase_t trie_search_prefixes(trie_t *self, char *word, size_t len);
 
+bool token_phrase_memberships(phrase_array *phrases, int64_array *phrase_memberships, size_t len);
+
 char *cstring_array_get_phrase(cstring_array *str, char_array *phrase_tokens, phrase_t phrase);
 
 #endif