From 3cbb1b3976d123e9846926734e27f88467b8e7b9 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 3 Jul 2016 23:51:13 -0400 Subject: [PATCH] [tokenization] Hyphens, etc. between non-ASCII digits (e.g. Unicode full-width numbers) should be single tokens --- src/scanner.re | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scanner.re b/src/scanner.re index b99c8219..eae1286d 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -117,8 +117,8 @@ hebrew_word_double_quote = ({hebrew_letter_chars}+{double_quote})+{hebrew_letter // WB11 and WB12 (modified slightly) possible_numeric_chars = ({numeric_chars}|{letter}|{non_breaking_dash}|{hyphen}|{mid_num_letter_chars}|{single_quote}); -number = ({non_breaking_dash}?((({numeric_chars}|{number_or_digit_chars})+({mid_number_chars}|{mid_num_letter_chars})*)*)({numeric_chars}|{number_or_digit_chars})+); -numeric = (({non_breaking_dash}?|((({numeric_chars}|{letter})+{possible_numeric_chars}*)*)){numeric_chars}+({possible_numeric_chars}*({numeric_chars}|{letter})+)*)|({number_or_digit_chars}+); +number = ({non_breaking_dash}?((({numeric_chars}|{number_or_digit_chars})+({mid_number_chars}|{mid_num_letter_chars}))*)({numeric_chars}|{number_or_digit_chars})+); +numeric = (({non_breaking_dash}|((({numeric_chars}|{number_or_digit_chars}|{letter})+{possible_numeric_chars}*)*))({numeric_chars}|{number_or_digit_chars})+({possible_numeric_chars}*({numeric_chars}|{number_or_digit_chars}|{letter})+)*); // WB13