[tokenization] Better scanner support for ideographic languages (Chinese, Japanese, Korean, etc.): adds an IDEOGRAPH token class to the scanner so we can tell when we're dealing with those languages rather than other arbitrary characters
This commit is contained in:
474145
src/scanner.c
474145
src/scanner.c
File diff suppressed because it is too large
Load Diff
@@ -75,12 +75,17 @@ mid_num_letter_chars = [\u002e\u2018\u2019\u2024\ufe52\uff07\uff0e];
|
||||
// Digit characters: ASCII 0-9 (\u0030-\u0039) followed by ten-code-point
// ranges in other Unicode blocks, plus the single code point \u066b.
// NOTE(review): several of these ranges (\u0e50-\u0e59, \u0ed0-\u0ed9,
// \u0f20-\u0f29, \u1b50-\u1b59, \ua9d0-\ua9d9) also appear in
// ideographic_chars.
numeric_chars = [\u0030-\u0039\u0660-\u0669\u066b\u06f0-\u06f9\u07c0-\u07c9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be6-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0de6-\u0def\u0e50-\u0e59\u0ed0-\u0ed9\u0f20-\u0f29\u1040-\u1049\u1090-\u1099\u17e0-\u17e9\u1810-\u1819\u1946-\u194f\u19d0-\u19d9\u1a80-\u1a89\u1a90-\u1a99\u1b50-\u1b59\u1bb0-\u1bb9\u1c40-\u1c49\u1c50-\u1c59\ua620-\ua629\ua8d0-\ua8d9\ua900-\ua909\ua9d0-\ua9d9\ua9f0-\ua9f9\uaa50-\uaa59\uabf0-\uabf9];
|
||||
// Connector-style characters, starting with the ASCII underscore (\u005f)
// and including its fullwidth form (\uff3f).
// NOTE(review): the name suggests these may continue a run of letters or
// digits; the rules that use this class are not visible here — confirm.
extend_num_letter_chars = [\u005f\u203f-\u2040\u2054\ufe33-\ufe34\ufe4d-\ufe4f\uff3f];
|
||||
|
||||
// Numeric symbols that are not plain decimal digits, e.g. superscripts
// (\u00b2-\u00b3, \u00b9), fractions (\u00bc-\u00be), Roman numerals
// (\u2160-\u2182) and enclosed numbers (\u2460-\u249b).
// One half of ideographic_symbol. The code points \u3007, \u3021-\u3029 and
// \u3038-\u303a are also listed in ideographic_chars.
ideographic_numeric_chars = [\u00b2-\u00b3\u00b9\u00bc-\u00be\u2070\u2074-\u2079\u2080-\u2089\u2150-\u215f\u2189\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3192-\u3195\u3220-\u3229\u3248-\u324f\u3251-\u325f\u3280-\u3289\u32b1-\u32bf\ua830-\ua835\u2160-\u2182\u2185-\u2188\u09f4-\u09f9\u0b72-\u0b77\u0bf0-\u0bf2\u0c78-\u0c7e\u0d70-\u0d75\u0f2a-\u0f33\u1369-\u137c\u16ee-\u16f0\u17f0-\u17f9\u3007\u3021-\u3029\u3038-\u303a\u2cfd\u19da\ua6e6-\ua6ef];
|
||||
// Characters from scripts that get the IDEOGRAPH treatment. By code-point
// range this covers Thai (\u0e01-\u0e5b), Lao (\u0e81-\u0edf), Tibetan
// (\u0f00-\u0fda), Hangul (\u1100-\u11ff, \uac00-\ud7a3 and related blocks),
// Hiragana (\u3041-\u309f), CJK radicals and Han ideographs (\u2e80-\u2fd5,
// \u3400-\u4db5, \u4e00-\u9fcc, \uf900-\ufad9), Yi (\ua000-\ua4c6),
// Balinese (\u1b00-\u1b7c) and Javanese (\ua980-\ua9df).
// No Katakana range is listed here; a separate {katakana} rule returns WORD.
// NOTE(review): the digit ranges \u0e50-\u0e59, \u0ed0-\u0ed9, \u0f20-\u0f29,
// \u1b50-\u1b59 and \ua9d0-\ua9d9 are also in numeric_chars, so whether they
// come out as NUMERIC or IDEOGRAPH depends on the rule set — confirm intent.
// NOTE(review): many neighbouring ranges are contiguous (e.g.
// \u0e01-\u0e30\u0e31\u0e32-\u0e33), which looks machine-generated — verify
// before hand-editing.
ideographic_chars = [\u0e01-\u0e30\u0e31\u0e32-\u0e33\u0e34-\u0e3a\u0e40-\u0e45\u0e46\u0e47-\u0e4e\u0e4f\u0e50-\u0e59\u0e5a-\u0e5b\u0e81-\u0e82\u0e84\u0e87-\u0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab\u0ead-\u0eb0\u0eb1\u0eb2-\u0eb3\u0eb4-\u0eb9\u0ebb-\u0ebc\u0ebd\u0ec0-\u0ec4\u0ec6\u0ec8-\u0ecd\u0ed0-\u0ed9\u0edc-\u0edf\u0f00\u0f01-\u0f03\u0f04-\u0f12\u0f13\u0f14\u0f15-\u0f17\u0f18-\u0f19\u0f1a-\u0f1f\u0f20-\u0f29\u0f2a-\u0f33\u0f34\u0f35\u0f36\u0f37\u0f38\u0f39\u0f3a\u0f3b\u0f3c\u0f3d\u0f3e-\u0f3f\u0f40-\u0f47\u0f49-\u0f6c\u0f71-\u0f7e\u0f7f\u0f80-\u0f84\u0f85\u0f86-\u0f87\u0f88-\u0f8c\u0f8d-\u0f97\u0f99-\u0fbc\u0fbe-\u0fc5\u0fc6\u0fc7-\u0fcc\u0fce-\u0fcf\u0fd0-\u0fd4\u0fd9-\u0fda\u1100-\u11ff\u302e-\u302f\u3131-\u318e\u3200-\u321e\u3260-\u327e\ua960-\ua97c\uac00-\ud7a3\ud7b0-\ud7c6\ud7cb-\ud7fb\uffa0-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc\u3041-\u3096\u309d-\u309e\u309f\u2e80-\u2e99\u2e9b-\u2ef3\u2f00-\u2fd5\u3005\u3007\u3021-\u3029\u3038-\u303a\u303b\u3400-\u4db5\u4e00-\u9fcc\uf900-\ufa6d\ufa70-\ufad9\ua000-\ua014\ua015\ua016-\ua48c\ua490-\ua4c6\u1b00-\u1b03\u1b04\u1b05-\u1b33\u1b34\u1b35\u1b36-\u1b3a\u1b3b\u1b3c\u1b3d-\u1b41\u1b42\u1b43-\u1b44\u1b45-\u1b4b\u1b50-\u1b59\u1b5a-\u1b60\u1b61-\u1b6a\u1b6b-\u1b73\u1b74-\u1b7c\ua980-\ua982\ua983\ua984-\ua9b2\ua9b3\ua9b4-\ua9b5\ua9b6-\ua9b9\ua9ba-\ua9bb\ua9bc\ua9bd-\ua9c0\ua9c1-\ua9cd\ua9d0-\ua9d9\ua9de-\ua9df];
|
||||
|
||||
// ASCII apostrophe (U+0027) only.
single_quote = [\u0027];
|
||||
// ASCII quotation mark (U+0022) only.
double_quote = [\u0022];
|
||||
|
||||
// Any letter: the union of the lower-case, upper-case, title-case,
// other-alphabetic and Hebrew letter classes (all defined outside this hunk).
letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars}|{letter_other_alpha_chars}|{hebrew_letter_chars};
|
||||
// Cased letters only: the same union as `letter` minus
// letter_other_alpha_chars and hebrew_letter_chars.
latinish_letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars};
|
||||
|
||||
// Exactly one character taken from either ideographic_chars or
// ideographic_numeric_chars; there is no repetition operator, so this never
// matches more than a single code point.
ideographic_symbol = {ideographic_chars}|{ideographic_numeric_chars};
|
||||
|
||||
// ASCII underscore only.
underscore = [_];
|
||||
|
||||
// ASCII hyphen-minus; escaped so it is read as a literal character rather
// than as a range operator inside the class.
hyphen = [\-];
|
||||
@@ -144,6 +149,7 @@ abbreviation = ({word})"\.";
|
||||
|
||||
// Token rules: each pattern returns the token type for the text it matched.
// NOTE(review): the definitions of numeric, apos_word, word, katakana and
// any_word are outside this hunk; rule order may matter when two patterns
// match the same text, so keep the ordering below intact.
{numeric} { return NUMERIC; }
|
||||
{apos_word} { return WORD; }
|
||||
// A single ideographic_symbol character becomes its own IDEOGRAPH token.
// It is listed ahead of the word rules below — presumably so it takes
// priority over them for these characters; confirm against `word`.
{ideographic_symbol} { return IDEOGRAPH; }
|
||||
{word} { return WORD; }
|
||||
{katakana} { return WORD; }
|
||||
{any_word} { return WORD; }
|
||||
|
||||
Reference in New Issue
Block a user