[tokenization] Adding an acronym token type for things like U.N. so we can delete internal periods on those tokens
This commit is contained in:
@@ -11,10 +11,12 @@ extern "C" {
|
||||
|
||||
// Word types
|
||||
#define WORD 1 // Any letter-only word (includes all unicode letters)
|
||||
#define ABBREVIATION 2 // Loose abbreviations (ending in ".")
|
||||
#define ABBREVIATION 2 // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses)
|
||||
#define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
|
||||
#define HANGUL_SYLLABLE 4 // Hangul syllable sequences which contain more than one codepoint
|
||||
#define PHRASE 5 // Not part of the first stage tokenizer, but may be used after phrase parsing
|
||||
#define ACRONYM 5 // Specifically things like U.N. where we may delete internal periods
|
||||
|
||||
#define PHRASE 10 // Not part of the first stage tokenizer, but may be used after phrase parsing
|
||||
|
||||
// Special tokens
|
||||
#define EMAIL 20 // Make sure each email address is kept together as a single token
|
||||
|
||||
Reference in New Issue
Block a user