From a446290829185be9aad65e0f2069202dac3c19f0 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 11 Mar 2015 17:33:53 -0400 Subject: [PATCH] [fix] IDEOGRAM class name --- src/scanner.c | 2 +- src/scanner.re | 2 +- src/token_types.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/scanner.c b/src/scanner.c index bd88e8c8..82a8dcdf 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -11364,7 +11364,7 @@ yy860: } yy861: #line 152 "scanner.re" - { return IDEOGRAPH; } + { return IDEOGRAM; } #line 11369 "scanner.c" yy862: yyaccept = 14; diff --git a/src/scanner.re b/src/scanner.re index f77e9d0e..a7fdbde4 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -149,7 +149,7 @@ abbreviation = ({word})"\."; {numeric} { return NUMERIC; } {apos_word} { return WORD; } -{ideographic_symbol} { return IDEOGRAPH; } +{ideographic_symbol} { return IDEOGRAM; } {word} { return WORD; } {katakana} { return WORD; } {any_word} { return WORD; } diff --git a/src/token_types.h b/src/token_types.h index 21741fcc..c1ae854f 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -12,6 +12,8 @@ extern "C" { // Word types #define WORD 1 // Any letter-only word (includes all unicode letters) #define ABBREVIATION 2 // Loose abbreviations (ending in ".") +#define IDEOGRAM 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character +#define PHRASE 4 // Not part of the first stage tokenizer, but may be used after phrase parsing // Numbers and numeric types #define NUMBER 50 // All digits