[tokenization] Adding an acronym token type for things like U.N. so we can delete internal periods on those tokens

This commit is contained in:
Al
2015-06-29 03:00:46 -04:00
parent 47efce4b7e
commit 3279b31b09
3 changed files with 6 additions and 4 deletions

View File

@@ -46105,7 +46105,7 @@ yy3315:
}
yy3316:
#line 166 "scanner.re"
{ return ABBREVIATION; }
{ return ACRONYM; }
#line 46110 "scanner.c"
yy3317:
yyaccept = 6;

View File

@@ -163,7 +163,7 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
{us_phone_number} { return US_PHONE; }
{international_phone_number} { return INTL_PHONE; }
{multi_punct_abbreviation} { return ABBREVIATION; }
{multi_punct_abbreviation} { return ACRONYM; }
{hyphen_plus_abbreviation} { return ABBREVIATION; }
{abbreviation} { return ABBREVIATION; }

View File

@@ -11,10 +11,12 @@ extern "C" {
// Word types
#define WORD 1 // Any letter-only word (includes all unicode letters)
#define ABBREVIATION 2 // Loose abbreviations (ending in ".")
#define ABBREVIATION 2 // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses)
#define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
#define HANGUL_SYLLABLE 4 // Hangul syllable sequences which contain more than one codepoint
#define PHRASE 5 // Not part of the first stage tokenizer, but may be used after phrase parsing
#define ACRONYM 5 // Specifically things like U.N. where we may delete internal periods
#define PHRASE 10 // Not part of the first stage tokenizer, but may be used after phrase parsing
// Special tokens
#define EMAIL 20 // Make sure emails are tokenized altogether