[tokenization] Adding url, email, US/international phone numbers, a separate type for ideographic numbers, more general quotes, paren types

This commit is contained in:
Al
2015-03-24 16:43:53 -04:00
parent 50187f28ce
commit 2d1c24a6e9
4 changed files with 298633 additions and 262454 deletions

560979
src/scanner.c

File diff suppressed because it is too large Load Diff

View File

@@ -12,11 +12,11 @@ typedef struct scanner {
unsigned char *src, *cursor, *start, *end;
} scanner_t;
int scan_token(scanner_t *s);
uint16_t scan_token(scanner_t *s);
inline scanner_t scanner_from_string(const char *input);
tokenized_string_t *tokenize(const char *str);
token_array *tokenize(const char *input);
#ifdef __cplusplus

View File

@@ -4,7 +4,7 @@
#include "scanner.h"
int scan_token(scanner_t *s)
uint16_t scan_token(scanner_t *s)
{
s->start = s->cursor;
unsigned char *marker = s->cursor;
@@ -35,7 +35,7 @@ letter_title_chars = [\u01c5\u01c8\u01cb\u01f2\u1f88-\u1f8f\u1f98-\u1f9f\u1fa8-\
// Uppercase letters. The Latin-1 block is split into U+00C0-U+00D6 and U+00D8-U+00DE so that U+00D7 (the multiplication sign) is left out.
letter_upper_chars = [A-Z\u00c0-\u00d6\u00d8-\u00de\u0100\u0102\u0104\u0106\u0108\u010a\u010c\u010e\u0110\u0112\u0114\u0116\u0118\u011a\u011c\u011e\u0120\u0122\u0124\u0126\u0128\u012a\u012c\u012e\u0130\u0132\u0134\u0136\u0139\u013b\u013d\u013f\u0141\u0143\u0145\u0147\u014a\u014c\u014e\u0150\u0152\u0154\u0156\u0158\u015a\u015c\u015e\u0160\u0162\u0164\u0166\u0168\u016a\u016c\u016e\u0170\u0172\u0174\u0176\u0178-\u0179\u017b\u017d\u0181-\u0182\u0184\u0186-\u0187\u0189-\u018b\u018e-\u0191\u0193-\u0194\u0196-\u0198\u019c-\u019d\u019f-\u01a0\u01a2\u01a4\u01a6-\u01a7\u01a9\u01ac\u01ae-\u01af\u01b1-\u01b3\u01b5\u01b7-\u01b8\u01bc\u01c4\u01c7\u01ca\u01cd\u01cf\u01d1\u01d3\u01d5\u01d7\u01d9\u01db\u01de\u01e0\u01e2\u01e4\u01e6\u01e8\u01ea\u01ec\u01ee\u01f1\u01f4\u01f6-\u01f8\u01fa\u01fc\u01fe\u0200\u0202\u0204\u0206\u0208\u020a\u020c\u020e\u0210\u0212\u0214\u0216\u0218\u021a\u021c\u021e\u0220\u0222\u0224\u0226\u0228\u022a\u022c\u022e\u0230\u0232\u023a-\u023b\u023d-\u023e\u0241\u0243-\u0246\u0248\u024a\u024c\u024e\u0370\u0372\u0376\u0386\u0388-\u038a\u038c\u038e-\u038f\u0391-\u03a1\u03a3-\u03ab\u03cf\u03d2-\u03d4\u03d8\u03da\u03dc\u03de\u03e0\u03e2\u03e4\u03e6\u03e8\u03ea\u03ec\u03ee\u03f4\u03f7\u03f9-\u03fa\u03fd-\u042f\u0460\u0462\u0464\u0466\u0468\u046a\u046c\u046e\u0470\u0472\u0474\u0476\u0478\u047a\u047c\u047e\u0480\u048a\u048c\u048e\u0490\u0492\u0494\u0496\u0498\u049a\u049c\u049e\u04a0\u04a2\u04a4\u04a6\u04a8\u04aa\u04ac\u04ae\u04b0\u04b2\u04b4\u04b6\u04b8\u04ba\u04bc\u04be\u04c0-\u04c1\u04c3\u04c5\u04c7\u04c9\u04cb\u04cd\u04d0\u04d2\u04d4\u04d6\u04d8\u04da\u04dc\u04de\u04e0\u04e2\u04e4\u04e6\u04e8\u04ea\u04ec\u04ee\u04f0\u04f2\u04f4\u04f6\u04f8\u04fa\u04fc\u04fe\u0500\u0502\u0504\u0506\u0508\u050a\u050c\u050e\u0510\u0512\u0514\u0516\u0518\u051a\u051c\u051e\u0520\u0522\u0524\u0531-\u0556\u10a0-\u10c5\u1e00\u1e02\u1e04\u1e06\u1e08\u1e0a\u1e0c\u1e0e\u1e10\u1e12\u1e14\u1e16\u1e18\u1e1a\u1e1c\u1e1e\u1e20\u1e22\u1e24\u1e26\u1e28\u1e2a\u1e2c\u1e2e\u1e30\u1e32\u1e34\u1e36\u1e38\u
1e3a\u1e3c\u1e3e\u1e40\u1e42\u1e44\u1e46\u1e48\u1e4a\u1e4c\u1e4e\u1e50\u1e52\u1e54\u1e56\u1e58\u1e5a\u1e5c\u1e5e\u1e60\u1e62\u1e64\u1e66\u1e68\u1e6a\u1e6c\u1e6e\u1e70\u1e72\u1e74\u1e76\u1e78\u1e7a\u1e7c\u1e7e\u1e80\u1e82\u1e84\u1e86\u1e88\u1e8a\u1e8c\u1e8e\u1e90\u1e92\u1e94\u1e9e\u1ea0\u1ea2\u1ea4\u1ea6\u1ea8\u1eaa\u1eac\u1eae\u1eb0\u1eb2\u1eb4\u1eb6\u1eb8\u1eba\u1ebc\u1ebe\u1ec0\u1ec2\u1ec4\u1ec6\u1ec8\u1eca\u1ecc\u1ece\u1ed0\u1ed2\u1ed4\u1ed6\u1ed8\u1eda\u1edc\u1ede\u1ee0\u1ee2\u1ee4\u1ee6\u1ee8\u1eea\u1eec\u1eee\u1ef0\u1ef2\u1ef4\u1ef6\u1ef8\u1efa\u1efc\u1efe\u1f08-\u1f0f\u1f18-\u1f1d\u1f28-\u1f2f\u1f38-\u1f3f\u1f48-\u1f4d\u1f59\u1f5b\u1f5d\u1f5f\u1f68-\u1f6f\u1fb8-\u1fbb\u1fc8-\u1fcb\u1fd8-\u1fdb\u1fe8-\u1fec\u1ff8-\u1ffb\u2102\u2107\u210b-\u210d\u2110-\u2112\u2115\u2119-\u211d\u2124\u2126\u2128\u212a-\u212d\u2130-\u2133\u213e-\u213f\u2145\u2183\u2c00-\u2c2e\u2c60\u2c62-\u2c64\u2c67\u2c69\u2c6b\u2c6d-\u2c70\u2c72\u2c75\u2c7e-\u2c80\u2c82\u2c84\u2c86\u2c88\u2c8a\u2c8c\u2c8e\u2c90\u2c92\u2c94\u2c96\u2c98\u2c9a\u2c9c\u2c9e\u2ca0\u2ca2\u2ca4\u2ca6\u2ca8\u2caa\u2cac\u2cae\u2cb0\u2cb2\u2cb4\u2cb6\u2cb8\u2cba\u2cbc\u2cbe\u2cc0\u2cc2\u2cc4\u2cc6\u2cc8\u2cca\u2ccc\u2cce\u2cd0\u2cd2\u2cd4\u2cd6\u2cd8\u2cda\u2cdc\u2cde\u2ce0\u2ce2\u2ceb\u2ced\ua640\ua642\ua644\ua646\ua648\ua64a\ua64c\ua64e\ua650\ua652\ua654\ua656\ua658\ua65a\ua65c\ua65e\ua662\ua664\ua666\ua668\ua66a\ua66c\ua680\ua682\ua684\ua686\ua688\ua68a\ua68c\ua68e\ua690\ua692\ua694\ua696\ua722\ua724\ua726\ua728\ua72a\ua72c\ua72e\ua732\ua734\ua736\ua738\ua73a\ua73c\ua73e\ua740\ua742\ua744\ua746\ua748\ua74a\ua74c\ua74e\ua750\ua752\ua754\ua756\ua758\ua75a\ua75c\ua75e\ua760\ua762\ua764\ua766\ua768\ua76a\ua76c\ua76e\ua779\ua77b\ua77d-\ua77e\ua780\ua782\ua784\ua786\ua78b\uff21-\uff3a];
mark_spacing_combined_chars = [\u0903\u093e-\u0940\u0949-\u094c\u094e\u0982-\u0983\u09be-\u09c0\u09c7-\u09c8\u09cb-\u09cc\u09d7\u0a03\u0a3e-\u0a40\u0a83\u0abe-\u0ac0\u0ac9\u0acb-\u0acc\u0b02-\u0b03\u0b3e\u0b40\u0b47-\u0b48\u0b4b-\u0b4c\u0b57\u0bbe-\u0bbf\u0bc1-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcc\u0bd7\u0c01-\u0c03\u0c41-\u0c44\u0c82-\u0c83\u0cbe\u0cc0-\u0cc4\u0cc7-\u0cc8\u0cca-\u0ccb\u0cd5-\u0cd6\u0d02-\u0d03\u0d3e-\u0d40\u0d46-\u0d48\u0d4a-\u0d4c\u0d57\u0d82-\u0d83\u0dcf-\u0dd1\u0dd8-\u0ddf\u0df2-\u0df3\u0f3e-\u0f3f\u0f7f\u102b-\u102c\u1031\u1038\u103b-\u103c\u1056-\u1057\u1062-\u1064\u1067-\u106d\u1083-\u1084\u1087-\u108c\u108f\u109a-\u109c\u17b6\u17be-\u17c5\u17c7-\u17c8\u1923-\u1926\u1929-\u192b\u1930-\u1931\u1933-\u1938\u19b0-\u19c0\u19c8-\u19c9\u1a19-\u1a1b\u1a55\u1a57\u1a61\u1a63-\u1a64\u1a6d-\u1a72\u1b04\u1b35\u1b3b\u1b3d-\u1b41\u1b43-\u1b44\u1b82\u1ba1\u1ba6-\u1ba7\u1baa\u1c24-\u1c2b\u1c34-\u1c35\u1ce1\u1cf2\ua823-\ua824\ua827\ua880-\ua881\ua8b4-\ua8c3\ua952-\ua953\ua983\ua9b4-\ua9b5\ua9ba-\ua9bb\ua9bd-\ua9c0\uaa2f-\uaa30\uaa33-\uaa34\uaa4d\uaa7b\uabe3-\uabe4\uabe6-\uabe7\uabec];
mark_spacing_combining_chars = [\u0903\u093e-\u0940\u0949-\u094c\u094e\u0982-\u0983\u09be-\u09c0\u09c7-\u09c8\u09cb-\u09cc\u09d7\u0a03\u0a3e-\u0a40\u0a83\u0abe-\u0ac0\u0ac9\u0acb-\u0acc\u0b02-\u0b03\u0b3e\u0b40\u0b47-\u0b48\u0b4b-\u0b4c\u0b57\u0bbe-\u0bbf\u0bc1-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcc\u0bd7\u0c01-\u0c03\u0c41-\u0c44\u0c82-\u0c83\u0cbe\u0cc0-\u0cc4\u0cc7-\u0cc8\u0cca-\u0ccb\u0cd5-\u0cd6\u0d02-\u0d03\u0d3e-\u0d40\u0d46-\u0d48\u0d4a-\u0d4c\u0d57\u0d82-\u0d83\u0dcf-\u0dd1\u0dd8-\u0ddf\u0df2-\u0df3\u0f3e-\u0f3f\u0f7f\u102b-\u102c\u1031\u1038\u103b-\u103c\u1056-\u1057\u1062-\u1064\u1067-\u106d\u1083-\u1084\u1087-\u108c\u108f\u109a-\u109c\u17b6\u17be-\u17c5\u17c7-\u17c8\u1923-\u1926\u1929-\u192b\u1930-\u1931\u1933-\u1938\u19b0-\u19c0\u19c8-\u19c9\u1a19-\u1a1b\u1a55\u1a57\u1a61\u1a63-\u1a64\u1a6d-\u1a72\u1b04\u1b35\u1b3b\u1b3d-\u1b41\u1b43-\u1b44\u1b82\u1ba1\u1ba6-\u1ba7\u1baa\u1c24-\u1c2b\u1c34-\u1c35\u1ce1\u1cf2\ua823-\ua824\ua827\ua880-\ua881\ua8b4-\ua8c3\ua952-\ua953\ua983\ua9b4-\ua9b5\ua9ba-\ua9bb\ua9bd-\ua9c0\uaa2f-\uaa30\uaa33-\uaa34\uaa4d\uaa7b\uabe3-\uabe4\uabe6-\uabe7\uabec];
mark_enclosing_chars = [\u0488-\u0489\u06de\u20dd-\u20e0\u20e2-\u20e4\ua670-\ua672];
mark_nonspacing_chars = [\u0300-\u036f\u0483-\u0487\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065e\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07a6-\u07b0\u07eb-\u07f3\u0816-\u0819\u081b-\u0823\u0825-\u0827\u0829-\u082d\u0900-\u0902\u093c\u0941-\u0948\u094d\u0951-\u0955\u0962-\u0963\u0981\u09bc\u09c1-\u09c4\u09cd\u09e2-\u09e3\u0a01-\u0a02\u0a3c\u0a41-\u0a42\u0a47-\u0a48\u0a4b-\u0a4d\u0a51\u0a70-\u0a71\u0a75\u0a81-\u0a82\u0abc\u0ac1-\u0ac5\u0ac7-\u0ac8\u0acd\u0ae2-\u0ae3\u0b01\u0b3c\u0b3f\u0b41-\u0b44\u0b4d\u0b56\u0b62-\u0b63\u0b82\u0bc0\u0bcd\u0c3e-\u0c40\u0c46-\u0c48\u0c4a-\u0c4d\u0c55-\u0c56\u0c62-\u0c63\u0cbc\u0cbf\u0cc6\u0ccc-\u0ccd\u0ce2-\u0ce3\u0d41-\u0d44\u0d4d\u0d62-\u0d63\u0dca\u0dd2-\u0dd4\u0dd6\u0e31\u0e34-\u0e3a\u0e47-\u0e4e\u0eb1\u0eb4-\u0eb9\u0ebb-\u0ebc\u0ec8-\u0ecd\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f7e\u0f80-\u0f84\u0f86-\u0f87\u0f90-\u0f97\u0f99-\u0fbc\u0fc6\u102d-\u1030\u1032-\u1037\u1039-\u103a\u103d-\u103e\u1058-\u1059\u105e-\u1060\u1071-\u1074\u1082\u1085-\u1086\u108d\u109d\u135f\u1712-\u1714\u1732-\u1734\u1752-\u1753\u1772-\u1773\u17b7-\u17bd\u17c6\u17c9-\u17d3\u17dd\u180b-\u180d\u18a9\u1920-\u1922\u1927-\u1928\u1932\u1939-\u193b\u1a17-\u1a18\u1a56\u1a58-\u1a5e\u1a60\u1a62\u1a65-\u1a6c\u1a73-\u1a7c\u1a7f\u1b00-\u1b03\u1b34\u1b36-\u1b3a\u1b3c\u1b42\u1b6b-\u1b73\u1b80-\u1b81\u1ba2-\u1ba5\u1ba8-\u1ba9\u1c2c-\u1c33\u1c36-\u1c37\u1cd0-\u1cd2\u1cd4-\u1ce0\u1ce2-\u1ce8\u1ced\u1dc0-\u1de6\u1dfd-\u1dff\u20d0-\u20dc\u20e1\u20e5-\u20f0\u2cef-\u2cf1\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f\ua67c-\ua67d\ua6f0-\ua6f1\ua802\ua806\ua80b\ua825-\ua826\ua8c4\ua8e0-\ua8f1\ua926-\ua92d\ua947-\ua951\ua980-\ua982\ua9b3\ua9b6-\ua9b9\ua9bc\uaa29-\uaa2e\uaa31-\uaa32\uaa35-\uaa36\uaa43\uaa4c\uaab0\uaab2-\uaab4\uaab7-\uaab8\uaabe-\uaabf\uaac1\uabe5\uabe8\uabed\ufb1e\ufe00-\ufe0f\ufe20-\ufe26];
number_or_digit_chars = [0-9\u0660-\u0669\u06f0-\u06f9\u07c0-\u07c9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be6-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u0f20-\u0f29\u1040-\u1049\u1090-\u1099\u17e0-\u17e9\u1810-\u1819\u1946-\u194f\u19d0-\u19da\u1a80-\u1a89\u1a90-\u1a99\u1b50-\u1b59\u1bb0-\u1bb9\u1c40-\u1c49\u1c50-\u1c59\ua620-\ua629\ua8d0-\ua8d9\ua900-\ua909\ua9d0-\ua9d9\uaa50-\uaa59\uabf0-\uabf9\uff10-\uff19];
@@ -80,8 +80,6 @@ double_quote = [\u0022];
letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars}|{letter_other_alpha_chars}|{hebrew_letter_chars};
latinish_letter = {letter_lower_chars}|{letter_upper_chars}|{letter_title_chars};
ideographic_symbol = {ideographic_chars}|{ideographic_numeric_chars};
underscore = [_];
hyphen = [\-];
@@ -117,7 +115,8 @@ katakana = {katakana_chars}+;
// WB13a and WB13b
word_extend_num_letter = ({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})+{extend_num_letter_chars}({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})*;
possible_word_char = [^\+\(\)\[\]}{\/\\\,:;!"&\? 0-9\t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085\u0096\u0097\u2013\u2014\u2015\u0000];
possible_word_char = {letter}|{mark_spacing_combining_chars}|{mark_enclosing_chars}|{mark_nonspacing_chars}|{punct_connector_chars}|{punct_dash_chars}|{currency_symbol_chars}|{symbol_modifier_chars}|{symbol_math_chars}|{symbol_other_chars};
//possible_word_char = [^\+\(\)\[\]}{\/\\\,:;!"&\? 0-9\t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085\u0096\u0097\u2013\u2014\u2015\u0000];
any_word = ({possible_word_char}*{letter}+{possible_word_char}*);
apos_word = ("'"?({latinish_letter}+"'")+{latinish_letter}+"'"?);
@@ -134,18 +133,38 @@ word = ({basic_word})|({word_non_breaking_mid_char})|({word_end_single_quote})|(
abbreviation = ({word})"\.";
us_phone_number = ("\+"?"1"[\-\. ]?)?"\("?([2-9][0-8][0-9])"\)"?[\-\. ]?([2-9][0-9]{2})[\-\. ]?([0-9]{4});
international_phone_number = "\+"("9"[976][0-9]|"8"[987530][0-9]|"6"[987][0-9]|"5"[90][0-9]|"420-9"|"3"[875][0-9]|"2"[98654321][0-9]|"9"[8543210]|"8"[6421]|"6"[6543210]|"5"[87654321]|"4"[987654310]|"3"[9643210]|"2"[70]|"7"|"1"){space}*(([()\.\-/ ]{0,1}[0-9]){9}[0-9]{1,2});
// Top-level domains accepted after the final "." of a host in the url pattern
// (pasted from a list of top-level domains). Entries are single-quoted re2c
// strings, which re2c matches case-insensitively.
tlds = ('com'|'net'|'org'|'edu'|'gov'|'mil'|'aero'|'asia'|'biz'|'cat'|'coop'|'info'|'int'|'jobs'|'mobi'|'museum'|'name'|'post'|'pro'|'tel'|'travel'|'xxx'|'ac'|'ad'|'ae'|'af'|'ag'|'ai'|'al'|'am'|'an'|'ao'|'aq'|'ar'|'as'|'at'|'au'|'aw'|'ax'|'az'|'ba'|'bb'|'bd'|'be'|'bf'|'bg'|'bh'|'bi'|'bj'|'bm'|'bn'|'bo'|'br'|'bs'|'bt'|'bv'|'bw'|'by'|'bz'|'ca'|'cc'|'cd'|'cf'|'cg'|'ch'|'ci'|'ck'|'cl'|'cm'|'cn'|'co'|'cr'|'cs'|'cu'|'cv'|'cx'|'cy'|'cz'|'dd'|'de'|'dj'|'dk'|'dm'|'do'|'dz'|'ec'|'ee'|'eg'|'eh'|'er'|'es'|'et'|'eu'|'fi'|'fj'|'fk'|'fm'|'fo'|'fr'|'ga'|'gb'|'gd'|'ge'|'gf'|'gg'|'gh'|'gi'|'gl'|'gm'|'gn'|'gp'|'gq'|'gr'|'gs'|'gt'|'gu'|'gw'|'gy'|'hk'|'hm'|'hn'|'hr'|'ht'|'hu'|'id'|'ie'|'il'|'im'|'in'|'io'|'iq'|'ir'|'is'|'it'|'je'|'jm'|'jo'|'jp'|'ke'|'kg'|'kh'|'ki'|'km'|'kn'|'kp'|'kr'|'kw'|'ky'|'kz'|'la'|'lb'|'lc'|'li'|'lk'|'lr'|'ls'|'lt'|'lu'|'lv'|'ly'|'ma'|'mc'|'md'|'me'|'mg'|'mh'|'mk'|'ml'|'mm'|'mn'|'mo'|'mp'|'mq'|'mr'|'ms'|'mt'|'mu'|'mv'|'mw'|'mx'|'my'|'mz'|'na'|'nc'|'ne'|'nf'|'ng'|'ni'|'nl'|'no'|'np'|'nr'|'nu'|'nz'|'om'|'pa'|'pe'|'pf'|'pg'|'ph'|'pk'|'pl'|'pm'|'pn'|'pr'|'ps'|'pt'|'pw'|'py'|'qa'|'re'|'ro'|'rs'|'ru'|'rw'|'sa'|'sb'|'sc'|'sd'|'se'|'sg'|'sh'|'si'|'sj'|'sk'|'sl'|'sm'|'sn'|'so'|'sr'|'ss'|'st'|'su'|'sv'|'sx'|'sy'|'sz'|'tc'|'td'|'tf'|'tg'|'th'|'tj'|'tk'|'tl'|'tm'|'tn'|'to'|'tp'|'tr'|'tt'|'tv'|'tw'|'tz'|'ua'|'ug'|'uk'|'us'|'uy'|'uz'|'va'|'vc'|'ve'|'vg'|'vi'|'vn'|'vu'|'wf'|'ws'|'ye'|'yt'|'yu'|'za'|'zm'|'zw');
// Gruber's liberal url regex: https://gist.github.com/gruber/8891611
url = (('http''s'?":"("/"{1,3}|[A-Za-z0-9%])|[A-Za-z0-9.\-]+[.]{tlds}"/")([^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f<>{}\[\]]+|"\("[^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f]*?"\("[^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f]+"\)"[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\)"|"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f]+?"\)")+("\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]+"\)"[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\)"|"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f]+?"\)"|[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f`!()\[\]{};:\'\"\.,<>?«»“”‘’])|([A-Za-z0-9]+([.\-][A-Za-z0-9]+)*[.]{tlds}"/"?));
email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
"\u0000" { return END; }
{space}+ { return WHITESPACE; }
{email} { return EMAIL; }
{url} { return URL; }
{ellipsis} { return ELLIPSIS; }
{us_phone_number} { return US_PHONE; }
{international_phone_number} { return INTL_PHONE; }
{multi_punct_abbreviation} { return ABBREVIATION; }
{hyphen_plus_abbreviation} { return ABBREVIATION; }
{abbreviation} { return ABBREVIATION; }
{numeric} { return NUMERIC; }
{apos_word} { return WORD; }
{ideographic_symbol} { return IDEOGRAM; }
{ideographic_chars} { return IDEOGRAPHIC_CHAR; }
{ideographic_numeric_chars} { return IDEOGRAPHIC_NUMBER; }
{word} { return WORD; }
{katakana} { return WORD; }
{any_word} { return WORD; }
@@ -158,14 +177,14 @@ abbreviation = ({word})"\.";
"," { return COMMA; }
":" { return COLON; }
";" { return SEMICOLON; }
"(" { return LPAREN; }
")" { return RPAREN; }
{punct_open_chars} { return PUNCT_OPEN; }
{punct_close_chars} { return PUNCT_CLOSE; }
"/" { return SLASH; }
"\\" { return BACKSLASH; }
"{" { return LBSQUARE; }
"}" { return RBSQUARE; }
"\"" { return DOUBLE_QUOTE; }
"'" { return SINGLE_QUOTE; }
{punct_initial_quote_chars} { return OPEN_QUOTE; }
{punct_final_quote_chars} { return CLOSE_QUOTE; }
"&" { return AMPERSAND; }
">" { return GREATER_THAN; }
"<" { return LESS_THAN; }
@@ -192,24 +211,29 @@ scanner_t scanner_from_string(const char *input) {
return scanner;
}
// Tokenize `input`: call scan_token() repeatedly until it returns END and
// record the offset, length and type of every token that is not WHITESPACE.
// Per the existing comments below, the caller is responsible for freeing the result.
// NOTE(review): this span looks like a diff rendered without its +/- markers, so the
// tokenized_string_t lines and the token_array lines appear to be the before/after
// versions of the same statements — confirm against the real file.
tokenized_string_t *tokenize(const char *input) {
token_array *tokenize(const char *input) {
size_t token_start, token_length;
int token_type;
uint16_t token_type;
scanner_t scanner = scanner_from_string(input);
tokenized_string_t *response = tokenized_string_new();
token_array *tokens = token_array_new();
// The scanner's "\u0000" rule returns END, so the loop stops at the null byte.
while ( ( token_type = scan_token(&scanner)) != END ) {
// Byte offset of this token from the start of the scanner's source buffer.
token_start = scanner.start - scanner.src;
// scan_token() sets start to the cursor on entry, so this is the span it consumed.
token_length = scanner.cursor - scanner.start;
// Whitespace is scanned but never recorded as a token.
if (token_type != WHITESPACE) {
// Caller frees the tokens response
tokenized_string_add_token(response, input, token_length, (uint16_t)token_type, (uint64_t)token_start);
// Caller frees
token_t token;
token.offset = token_start;
token.len = token_length;
token.type = token_type;
token_array_push(tokens, token);
}
}
return response;
return tokens;
}

View File

@@ -7,23 +7,26 @@ extern "C" {
// Doing these as #defines so we can duplicate the values exactly in Python
#define END 0 // Null byte
#define END 0 // Null byte
// Word types
#define WORD 1 // Any letter-only word (includes all unicode letters)
#define ABBREVIATION 2 // Loose abbreviations (ending in ".")
#define IDEOGRAM 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
#define PHRASE 4 // Not part of the first stage tokenizer, but may be used after phrase parsing
#define WORD 1 // Any letter-only word (includes all unicode letters)
#define ABBREVIATION 2 // Loose abbreviations (ending in ".")
#define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
#define PHRASE 4 // Not part of the first stage tokenizer, but may be used after phrase parsing
// Special tokens
#define EMAIL 20 // Make sure emails are tokenized altogether
#define URL 21 // Make sure urls are tokenized altogether
#define US_PHONE 22 // US phone number (with or without country code)
#define INTL_PHONE 23 // A non-US phone number (must have country code)
// Numbers and numeric types
#define NUMBER 50 // All digits
#define NUMERIC 51 // Any sequence containing a digit
#define ORDINAL 52 // 1st, 2nd, etc.
#define NUMERIC_RANGE 53 // 2-3, Queens addresses, US ZIP+4 codes
#define ORDINAL_RANGE 54 // 1-2nd, 1st-2nd
#define ROMAN_NUMERAL 55 // II, III, VI, etc.
#define US_PHONE 56 // US phone number (with or without country code)
#define INTL_PHONE 57 // A non-US phone number (must have country code)
#define NUMERIC 50 // Any sequence containing a digit
#define ORDINAL 51 // 1st, 2nd, 1er, 1 etc.
#define ROMAN_NUMERAL 52 // II, III, VI, etc.
#define IDEOGRAPHIC_NUMBER 53 // All numeric ideographic characters, includes e.g. Han numbers and chars like "²"
// Punctuation types, may separate a phrase
#define PERIOD 100
@@ -40,16 +43,12 @@ extern "C" {
#define DASH 111
#define BREAKING_DASH 112
#define HYPHEN 113
#define LPAREN 114
#define RPAREN 115
#define LBSQUARE 116
#define RBSQUARE 117
#define DOUBLE_QUOTE 118
#define SINGLE_QUOTE 119
#define LEFT_DOUBLE_QUOTE 120
#define RIGHT_DOUBLE_QUOTE 121
#define LEFT_SINGLE_QUOTE 122
#define RIGHT_SINGLE_QUOTE 123
#define PUNCT_OPEN 114
#define PUNCT_CLOSE 115
#define DOUBLE_QUOTE 119
#define SINGLE_QUOTE 120
#define OPEN_QUOTE 121
#define CLOSE_QUOTE 122
#define SLASH 124
#define BACKSLASH 125
#define GREATER_THAN 126
@@ -58,8 +57,7 @@ extern "C" {
// Non-letters and whitespace
#define OTHER 200
#define WHITESPACE 300
#define NEWLINE 301
#define NEWLINE 301
#ifdef __cplusplus
}