diff --git a/src/scanner.c b/src/scanner.c index 92ecc645..a2e2cf32 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -46105,7 +46105,7 @@ yy3315: } yy3316: #line 166 "scanner.re" - { return ABBREVIATION; } + { return ACRONYM; } #line 46110 "scanner.c" yy3317: yyaccept = 6; diff --git a/src/scanner.re b/src/scanner.re index 91b771e3..a60e970a 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -163,7 +163,7 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3}); {us_phone_number} { return US_PHONE; } {international_phone_number} { return INTL_PHONE; } -{multi_punct_abbreviation} { return ABBREVIATION; } +{multi_punct_abbreviation} { return ACRONYM; } {hyphen_plus_abbreviation} { return ABBREVIATION; } {abbreviation} { return ABBREVIATION; } diff --git a/src/token_types.h b/src/token_types.h index 15a2a83d..472f345e 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -11,10 +11,12 @@ extern "C" { // Word types #define WORD 1 // Any letter-only word (includes all unicode letters) -#define ABBREVIATION 2 // Loose abbreviations (ending in ".") +#define ABBREVIATION 2 // Loose abbreviations: roughly anything containing a "." (we don't care about sentence boundaries in addresses) #define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character #define HANGUL_SYLLABLE 4 // Hangul syllable sequences which contain more than one codepoint -#define PHRASE 5 // Not part of the first stage tokenizer, but may be used after phrase parsing +#define ACRONYM 5 // Specifically things like U.N., where we may delete the internal periods; returned by the scanner's multi_punct_abbreviation rule + +#define PHRASE 10 // Not part of the first stage tokenizer, but may be used after phrase parsing // Special tokens #define EMAIL 20 // Make sure emails are tokenized altogether