From 43293d0ae3dd6f6d43a4340e5d5bd92a20b21ab8 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 14 Jul 2015 18:15:58 -0400
Subject: [PATCH] [tokenization] Fixing a tokenization bug where mid-number
 characters appear in the middle of a word+numeric sequence, e.g. Zigor,2
 should be 3 separate tokens. Sequences like 35,37,39 are still treated as a
 single token for the moment.

---
 src/scanner.re | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/scanner.re b/src/scanner.re
index a60e970a..8a30f91d 100644
--- a/src/scanner.re
+++ b/src/scanner.re
@@ -114,8 +114,8 @@ hebrew_word_single_quote = ({hebrew_letter_chars}+{single_quote})+;
 hebrew_word_double_quote = ({hebrew_letter_chars}+{double_quote})+{hebrew_letter_chars}*;
 
 // WB11 and WB12 (modified slightly)
-// N.B. this does not capture German-style ordinals such as 2. as that is too ambiguous and can be accounted for in parsing
-possible_numeric_chars = ({numeric_chars}|{letter}|{non_breaking_dash}|{mid_number_chars}|{mid_num_letter_chars}|{single_quote});
+possible_numeric_chars = ({numeric_chars}|{letter}|{non_breaking_dash}|{hyphen}|{mid_num_letter_chars}|{single_quote});
+number = ({non_breaking_dash}?(({numeric_chars}+({mid_number_chars}|{mid_num_letter_chars})*)*){numeric_chars}+);
 numeric = (({non_breaking_dash}?|((({numeric_chars}|{letter})+{possible_numeric_chars}*)*)){numeric_chars}+({possible_numeric_chars}*({numeric_chars}|{letter})+)*);
 
 // WB13
@@ -167,6 +167,7 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
 {hyphen_plus_abbreviation}      { return ABBREVIATION; }
 {abbreviation}                  { return ABBREVIATION; }
 
+{number}                        { return NUMERIC; }
 {numeric}                       { return NUMERIC; }
 {apos_word}                     { return WORD; }
 {hangul_syllable}               { return HANGUL_SYLLABLE; }