[tokenization] Adding a Hangul syllable class in tokenization for syllables written out as Jamo

2015-06-16 12:52:04 -04:00
parent f04fad0e93
commit 77760f207c
2 changed files with 12 additions and 1 deletions
--- a/src/scanner.re
+++ b/src/scanner.re
--- a/src/token_types.h
+++ b/src/token_types.h
@@ -13,7 +13,8 @@ extern "C" {
 #define WORD 1                  // Any letter-only word (includes all unicode letters)
 #define ABBREVIATION 2          // Loose abbreviations (ending in ".")
 #define IDEOGRAPHIC_CHAR 3      // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
-#define PHRASE 4                // Not part of the first stage tokenizer, but may be used after phrase parsing
+#define HANGUL_SYLLABLE 4       // Hangul syllables written out as a sequence of Jamo (more than one codepoint per syllable)
+#define PHRASE 5                // Not part of the first stage tokenizer, but may be used after phrase parsing

 // Special tokens
 #define EMAIL 20                // Make sure emails are tokenized altogether