From aa39c45b8762c4712d40400a73ba2e1ec415df2c Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 4 Oct 2015 18:25:44 -0400 Subject: [PATCH] [tokenization] skipping control characters in tokenization, comes up in OSM surprisingly --- src/scanner.re | 5 ++++- src/token_types.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scanner.re b/src/scanner.re index d090c431..593c5a51 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -154,8 +154,10 @@ url = ('http''s'?":"("/"{1,3}|[A-Za-z0-9%]))([^\u0000 \t\u00A0\u2000-\u200A\u300 email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3}); +invalid_chars = ({control_chars}|{other_format_chars}|{other_private_use_chars}); "\u0000" { return END; } +{invalid_chars} { return INVALID_CHAR; } {space}+ { return WHITESPACE; } {email} { return EMAIL; } @@ -205,6 +207,7 @@ email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3}); "#" { return POUND; } {other_non_breaking_dash} { return DASH; } {breaking_dash} { return BREAKING_DASH; } +{other_surrogate_chars} { return INVALID_CHAR; } {any} { return OTHER; } */ @@ -233,7 +236,7 @@ void tokenize_add_tokens(token_array *tokens, const char *input, size_t len, boo token_start = scanner.start - scanner.src; token_length = scanner.cursor - scanner.start; - if (token_type == WHITESPACE && !keep_whitespace) { + if ((token_type == WHITESPACE && !keep_whitespace) || (token_type == INVALID_CHAR)) { continue; } diff --git a/src/token_types.h b/src/token_types.h index 8ae0c144..c1d2d02c 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -60,6 +60,6 @@ #define WHITESPACE 300 #define NEWLINE 301 - +#define INVALID_CHAR 500 #endif