diff --git a/src/scanner.c b/src/scanner.c index bd88e8c8..82a8dcdf 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -11364,7 +11364,7 @@ yy860: } yy861: #line 152 "scanner.re" - { return IDEOGRAPH; } + { return IDEOGRAM; } #line 11369 "scanner.c" yy862: yyaccept = 14; diff --git a/src/scanner.re b/src/scanner.re index f77e9d0e..a7fdbde4 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -149,7 +149,7 @@ abbreviation = ({word})"\."; {numeric} { return NUMERIC; } {apos_word} { return WORD; } -{ideographic_symbol} { return IDEOGRAPH; } +{ideographic_symbol} { return IDEOGRAM; } {word} { return WORD; } {katakana} { return WORD; } {any_word} { return WORD; } diff --git a/src/token_types.h b/src/token_types.h index 21741fcc..c1ae854f 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -12,6 +12,8 @@ extern "C" { // Word types #define WORD 1 // Any letter-only word (includes all unicode letters) #define ABBREVIATION 2 // Loose abbreviations (ending in ".") +#define IDEOGRAM 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character +#define PHRASE 4 // Not part of the first stage tokenizer, but may be used after phrase parsing // Numbers and numeric types #define NUMBER 50 // All digits