[tokenization] Better scanner support for ideographic languages (Chinese, Japanese, Korean, etc.) with an IDEOGRAM token class in the scanner so we know when we're dealing with those languages vs. other random characters
This commit is contained in:
482163
src/scanner.c
482163
src/scanner.c
File diff suppressed because it is too large
Load Diff
@@ -75,12 +75,17 @@ mid_num_letter_chars = [\u002e\u2018\u2019\u2024\ufe52\uff07\uff0e];
|
||||
numeric_chars = [\u0030-\u0039\u0660-\u0669\u066b\u06f0-\u06f9\u07c0-\u07c9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be6-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0de6-\u0def\u0e50-\u0e59\u0ed0-\u0ed9\u0f20-\u0f29\u1040-\u1049\u1090-\u1099\u17e0-\u17e9\u1810-\u1819\u1946-\u194f\u19d0-\u19d9\u1a80-\u1a89\u1a90-\u1a99\u1b50-\u1b59\u1bb0-\u1bb9\u1c40-\u1c49\u1c50-\u1c59\ua620-\ua629\ua8d0-\ua8d9\ua900-\ua909\ua9d0-\ua9d9\ua9f0-\ua9f9\uaa50-\uaa59\uabf0-\uabf9];
|
||||
extend_num_letter_chars = [\u005f\u203f-\u2040\u2054\ufe33-\ufe34\ufe4d-\ufe4f\uff3f];
|
||||
|
||||
ideographic_numeric_chars = [\u00b2-\u00b3\u00b9\u00bc-\u00be\u2070\u2074-\u2079\u2080-\u2089\u2150-\u215f\u2189\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3192-\u3195\u3220-\u3229\u3248-\u324f\u3251-\u325f\u3280-\u3289\u32b1-\u32bf\ua830-\ua835\u2160-\u2182\u2185-\u2188\u09f4-\u09f9\u0b72-\u0b77\u0bf0-\u0bf2\u0c78-\u0c7e\u0d70-\u0d75\u0f2a-\u0f33\u1369-\u137c\u16ee-\u16f0\u17f0-\u17f9\u3007\u3021-\u3029\u3038-\u303a\u2cfd\u19da\ua6e6-\ua6ef];
|
||||
ideographic_chars = [\u0e01-\u0e30\u0e31\u0e32-\u0e33\u0e34-\u0e3a\u0e40-\u0e45\u0e46\u0e47-\u0e4e\u0e4f\u0e50-\u0e59\u0e5a-\u0e5b\u0e81-\u0e82\u0e84\u0e87-\u0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab\u0ead-\u0eb0\u0eb1\u0eb2-\u0eb3\u0eb4-\u0eb9\u0ebb-\u0ebc\u0ebd\u0ec0-\u0ec4\u0ec6\u0ec8-\u0ecd\u0ed0-\u0ed9\u0edc-\u0edf\u0f00\u0f01-\u0f03\u0f04-\u0f12\u0f13\u0f14\u0f15-\u0f17\u0f18-\u0f19\u0f1a-\u0f1f\u0f20-\u0f29\u0f2a-\u0f33\u0f34\u0f35\u0f36\u0f37\u0f38\u0f39\u0f3a\u0f3b\u0f3c\u0f3d\u0f3e-\u0f3f\u0f40-\u0f47\u0f49-\u0f6c\u0f71-\u0f7e\u0f7f\u0f80-\u0f84\u0f85\u0f86-\u0f87\u0f88-\u0f8c\u0f8d-\u0f97\u0f99-\u0fbc\u0fbe-\u0fc5\u0fc6\u0fc7-\u0fcc\u0fce-\u0fcf\u0fd0-\u0fd4\u0fd9-\u0fda\u1100-\u11ff\u302e-\u302f\u3131-\u318e\u3200-\u321e\u3260-\u327e\ua960-\ua97c\uac00-\ud7a3\ud7b0-\ud7c6\ud7cb-\ud7fb\uffa0-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc\u3041-\u3096\u309d-\u309e\u309f\u2e80-\u2e99\u2e9b-\u2ef3\u2f00-\u2fd5\u3005\u3007\u3021-\u3029\u3038-\u303a\u303b\u3400-\u4db5\u4e00-\u9fcc\uf900-\ufa6d\ufa70-\ufad9\ua000-\ua014\ua015\ua016-\ua48c\ua490-\ua4c6\u1b00-\u1b03\u1b04\u1b05-\u1b33\u1b34\u1b35\u1b36-\u1b3a\u1b3b\u1b3c\u1b3d-\u1b41\u1b42\u1b43-\u1b44\u1b45-\u1b4b\u1b50-\u1b59\u1b5a-\u1b60\u1b61-\u1b6a\u1b6b-\u1b73\u1b74-\u1b7c\ua980-\ua982\ua983\ua984-\ua9b2\ua9b3\ua9b4-\ua9b5\ua9b6-\ua9b9\ua9ba-\ua9bb\ua9bc\ua9bd-\ua9c0\ua9c1-\ua9cd\ua9d0-\ua9d9\ua9de-\ua9df];
|
||||
|
||||
single_quote = [\u0027];
|
||||
double_quote = [\u0022];
|
||||
|
||||
letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars}|{letter_other_alpha_chars}|{hebrew_letter_chars};
|
||||
latinish_letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars};
|
||||
|
||||
ideographic_symbol = {ideographic_chars}|{ideographic_numeric_chars};
|
||||
|
||||
underscore = [_];
|
||||
|
||||
hyphen = [\-];
|
||||
@@ -144,6 +149,7 @@ abbreviation = ({word})"\.";
|
||||
|
||||
{numeric} { return NUMERIC; }
|
||||
{apos_word} { return WORD; }
|
||||
{ideographic_symbol} { return IDEOGRAPH; }
|
||||
{word} { return WORD; }
|
||||
{katakana} { return WORD; }
|
||||
{any_word} { return WORD; }
|
||||
|
||||
Reference in New Issue
Block a user