From f794ef72222dace9d67aade57c821f26e4bfe9ef Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 17 Mar 2015 18:38:30 -0400 Subject: [PATCH] [tokenization] Exposing some of the scanner's methods in header for use in the Python scanner so it can avoid the additional allocation --- src/scanner.c | 18 +++++++++++------- src/scanner.h | 8 ++++++++ src/scanner.re | 14 ++++++++++---- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/scanner.c b/src/scanner.c index 82a8dcdf..53b5d27d 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -6,10 +6,6 @@ #include "scanner.h" -typedef struct scanner { - unsigned char *src, *cursor, *start, *end; -} scanner_t; - int scan_token(scanner_t *s) { s->start = s->cursor; @@ -264787,10 +264783,8 @@ yy17828: } -tokenized_string_t *tokenize(const char *input) { - size_t token_start, token_length; - int token_type; +scanner_t scanner_from_string(const char *input) { unsigned char *s = (unsigned char *)input; scanner_t scanner; @@ -264799,6 +264793,15 @@ tokenized_string_t *tokenize(const char *input) { scanner.start = s; scanner.end = s + strlen(input); + return scanner; +} + +tokenized_string_t *tokenize(const char *input) { + size_t token_start, token_length; + int token_type; + + scanner_t scanner = scanner_from_string(input); + tokenized_string_t *response = tokenized_string_new(); while ( ( token_type = scan_token(&scanner)) != END ) { @@ -264814,3 +264817,4 @@ tokenized_string_t *tokenize(const char *input) { return response; } + diff --git a/src/scanner.h b/src/scanner.h index a581b9b9..d62a7483 100644 --- a/src/scanner.h +++ b/src/scanner.h @@ -8,6 +8,14 @@ extern "C" { #include "token_types.h" #include "tokens.h" +typedef struct scanner { + unsigned char *src, *cursor, *start, *end; +} scanner_t; + +int scan_token(scanner_t *s); + +inline scanner_t scanner_from_string(const char *input); + tokenized_string_t *tokenize(const char *str); diff --git a/src/scanner.re b/src/scanner.re index a7fdbde4..ad86cf1f 100644 --- a/src/scanner.re +++ b/src/scanner.re @@ -184,10 +184,7 @@ abbreviation = ({word})"\."; } -tokenized_string_t *tokenize(const char *input) { - size_t token_start, token_length; - int token_type; - +scanner_t scanner_from_string(const char *input) { unsigned char *s = (unsigned char *)input; scanner_t scanner; @@ -196,6 +193,15 @@ tokenized_string_t *tokenize(const char *input) { scanner.start = s; scanner.end = s + strlen(input); + return scanner; +} + +tokenized_string_t *tokenize(const char *input) { + size_t token_start, token_length; + int token_type; + + scanner_t scanner = scanner_from_string(input); + tokenized_string_t *response = tokenized_string_new(); while ( ( token_type = scan_token(&scanner)) != END ) {