[tokenization] Acronym vs abbreviation

This commit is contained in:
Al
2015-09-30 04:10:04 -04:00
parent 7dfbcce9ec
commit 689b830ad2

View File

@@ -136,6 +136,7 @@ apos_word = ("'"?({latinish_letter}+"'")+{latinish_letter}+"'"?);
ellipsis = ("\."{2,}|"\u2026"); ellipsis = ("\."{2,}|"\u2026");
acronym = ({letter}"\.")+{letter}?;
multi_punct_abbreviation = ({letter}+"\.")+{letter}?; multi_punct_abbreviation = ({letter}+"\.")+{letter}?;
abbrev_word = (({letter}|{possible_word_char})+"\.")+({letter}|{possible_word_char}*); abbrev_word = (({letter}|{possible_word_char})+"\.")+({letter}|{possible_word_char}*);
@@ -165,7 +166,8 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
{us_phone_number} { return US_PHONE; } {us_phone_number} { return US_PHONE; }
{international_phone_number} { return INTL_PHONE; } {international_phone_number} { return INTL_PHONE; }
{multi_punct_abbreviation} { return ACRONYM; } {acronym} { return ACRONYM; }
{multi_punct_abbreviation} { return ABBREVIATION; }
{hyphen_plus_abbreviation} { return ABBREVIATION; } {hyphen_plus_abbreviation} { return ABBREVIATION; }
{abbreviation} { return ABBREVIATION; } {abbreviation} { return ABBREVIATION; }