[tokenization] breaking dashes and double hyphens split a word, while single non-breaking dashes do not

2015-04-17 19:14:42 -04:00
parent 6718182443
commit 606a669c01
2 changed files with 219311 additions and 157493 deletions
--- a/src/scanner.c
+++ b/src/scanner.c
--- a/src/scanner.re
+++ b/src/scanner.re
@@ -117,9 +117,9 @@ katakana = {katakana_chars}+;
 // WB13a and WB13b
 word_extend_num_letter = ({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})+{extend_num_letter_chars}({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})*;

-possible_word_char = {letter}|{mark_spacing_combining_chars}|{mark_enclosing_chars}|{mark_nonspacing_chars}|{non_breaking_dash}|{punct_connector_chars}|{currency_symbol_chars}|{symbol_modifier_chars}|{symbol_math_chars}|{symbol_other_chars}|{digit};
+possible_word_char = {letter}|{mark_spacing_combining_chars}|{mark_enclosing_chars}|{mark_nonspacing_chars}|{punct_connector_chars}|{currency_symbol_chars}|{symbol_modifier_chars}|{symbol_math_chars}|{symbol_other_chars}|{digit};
 //possible_word_char = [^\+\(\)\[\]}{\/\\\,:;!"&\? 0-9\t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085\u0096\u0097\u2013\u2014\u2015\u0000];
-any_word = ({possible_word_char}*{letter}+{possible_word_char}*);
+any_word = (({possible_word_char}*{letter}+{possible_word_char}*{non_breaking_dash})*{possible_word_char}*{letter}+{possible_word_char}*);

 apos_word = ("'"?({latinish_letter}+"'")+{latinish_letter}+"'"?);