[tokenization] Revert the commit that tokenized initial/final apostrophes as part of words, since they may be handled more effectively during post-processing

This commit is contained in:
Al
2016-05-30 11:59:37 -04:00
parent 0a8f46bdc3
commit 2454b98c6d
2 changed files with 265761 additions and 250128 deletions

515881
src/scanner.c

File diff suppressed because it is too large Load Diff

View File

@@ -109,7 +109,7 @@ basic_word = {letter}+;
// WB6
word_non_breaking_mid_char = {letter}+(({mid_letter_chars}|{non_breaking_dash}|{mid_num_letter_chars}|{single_quote}){letter}+)+;
// WB7
-word_end_single_quote = {letter}+(({mid_letter_chars}|{non_breaking_dash}|{mid_num_letter_chars}|{single_quote}){letter}+)*{single_quote};
+word_end_single_quote = {letter}+(({mid_letter_chars}|{mid_num_letter_chars}){letter}+{single_quote});
// WB7a
hebrew_word_single_quote = ({hebrew_letter_chars}+{single_quote})+;
// WB7b and WB7c
@@ -132,8 +132,7 @@ any_word = (({possible_word_char}*{letter}+{possible_word_char}*{non_breaking_da
// GB6, GB7 & GB8
hangul_syllable = (({hangul_syllable_class_L}(hangul_syllable_class_L|hangul_syllable_class_V|hangul_syllable_class_LV|hangul_syllable_class_LVT))|(({hangul_syllable_class_L}|{hangul_syllable_class_V})({hangul_syllable_class_V}{hangul_syllable_class_T}))|(({hangul_syllable_class_LVT}|{hangul_syllable_class_T}){hangul_syllable_class_T}));
// Words like 't or 's in Dutch, 'na in Gaelic
-word_begin_single_quote = {single_quote}{letter}+(({mid_letter_chars}|{non_breaking_dash}|{mid_num_letter_chars}|{single_quote}){letter}+)*;
apos_word = ("'"?({latinish_letter}+"'"){latinish_letter}+"'"?);
ellipsis = ("\."{2,}|"\u2026");
@@ -144,7 +143,7 @@ abbrev_word = (({letter}|{possible_word_char})+"\.")+({letter}|{possible_word_ch
hyphen_plus_abbreviation = ((({abbrev_word}|{any_word}){hyphen})+({abbrev_word}))|(({abbrev_word}|{any_word}){hyphen})*({abbrev_word}{hyphen})+(({abbrev_word}|{any_word}){hyphen})*({abbrev_word}|{any_word});
-word = ({basic_word})|({word_non_breaking_mid_char})|({word_begin_single_quote})|({word_end_single_quote})|({hebrew_word_single_quote})|({hebrew_word_double_quote})|({word_extend_num_letter});
+word = ({basic_word})|({word_non_breaking_mid_char})|({word_end_single_quote})|({hebrew_word_single_quote})|({hebrew_word_double_quote})|({word_extend_num_letter});
abbreviation = ({word})"\.";
@@ -176,6 +175,7 @@ invalid_chars = ({control_chars}|{other_format_chars}|{other_private_use_chars})
{number} { return NUMERIC; }
{numeric} { return NUMERIC; }
{apos_word} { return WORD; }
{hangul_syllable} { return HANGUL_SYLLABLE; }
{ideographic_chars} { return IDEOGRAPHIC_CHAR; }
{ideographic_numeric_chars} { return IDEOGRAPHIC_NUMBER; }