From 689b830ad268723a8c1695a6c5ab8584371b817e Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 30 Sep 2015 04:10:04 -0400 Subject: [PATCH] [tokenization] Acronym vs abbreviation --- src/scanner.re | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scanner.re b/src/scanner.re index 1a82748e..feb5cdf6 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -136,6 +136,7 @@ apos_word = ("'"?({latinish_letter}+"'")+{latinish_letter}+"'"?); ellipsis = ("\."{2,}|"\u2026"); +acronym = ({letter}"\.")+{letter}?; multi_punct_abbreviation = ({letter}+"\.")+{letter}?; abbrev_word = (({letter}|{possible_word_char})+"\.")+({letter}|{possible_word_char}*); @@ -165,7 +166,8 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3}); {us_phone_number} { return US_PHONE; } {international_phone_number} { return INTL_PHONE; } -{multi_punct_abbreviation} { return ACRONYM; } +{acronym} { return ACRONYM; } +{multi_punct_abbreviation} { return ABBREVIATION; } {hyphen_plus_abbreviation} { return ABBREVIATION; } {abbreviation} { return ABBREVIATION; }
# Review notes (commentary only; not part of the diff):
# - Adds a named pattern `acronym`: one or more repetitions of a single {letter}
#   followed by a literal ".", then an optional trailing {letter}.
# - `multi_punct_abbreviation` differs only in allowing one or more {letter}s before
#   each "."; its rule now returns ABBREVIATION where it previously returned ACRONYM.
# - The new `{acronym}` rule returns ACRONYM and is listed before the
#   `{multi_punct_abbreviation}` rule. Every string matched by `acronym` is also
#   matched by `multi_punct_abbreviation`, so ACRONYM is produced only if the scanner
#   generator resolves equal-length matches in favour of the earlier rule
#   (presumably re2c, which documents that behaviour -- confirm against the build).
# - NOTE(review): this copy of the patch has lost its line breaks. The hunk headers
#   declare more lines (-136,6 +136,7 and -165,7 +166,8) than are visible here, so
#   blank or whitespace-only context lines were presumably dropped; regenerate the
#   patch from the commit before trying to apply it.