[tokenization] Adding an acronym token type for things like U.N. so we can delete internal periods on those tokens

2015-06-29 03:00:46 -04:00
parent 47efce4b7e
commit 3279b31b09
3 changed files with 6 additions and 4 deletions
--- a/src/token_types.h
+++ b/src/token_types.h
@@ -11,10 +11,12 @@ extern "C" {

 // Word types
 #define WORD 1                  // Any letter-only word (includes all unicode letters)
-#define ABBREVIATION 2          // Loose abbreviations (ending in ".")
+#define ABBREVIATION 2          // Loose abbreviations (roughly any token containing a "."; sentence boundaries don't matter in addresses)
 #define IDEOGRAPHIC_CHAR 3      // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
 #define HANGUL_SYLLABLE 4       // Hangul syllable sequences which contain more than one codepoint
-#define PHRASE 5                // Not part of the first stage tokenizer, but may be used after phrase parsing
+#define ACRONYM 5               // Acronyms written with periods (e.g. U.N.), whose internal periods we may delete
+
+#define PHRASE 10               // Not part of the first stage tokenizer, but may be used after phrase parsing

 // Special tokens
 #define EMAIL 20                // Make sure emails are tokenized altogether