From cc0401a8d1a1ede1f15e609eb81a043eb6e8fa39 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 28 Jun 2015 19:37:53 -0400 Subject: [PATCH] [utf8] Adding a boolean struct member for string_script_t return values, set to true if the string is ASCII (no transliteration needed, should be frequent for English addresses) --- src/unicode_scripts.c | 8 +++++++- src/unicode_scripts.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/unicode_scripts.c b/src/unicode_scripts.c index d9e2306c..69c1e719 100644 --- a/src/unicode_scripts.c +++ b/src/unicode_scripts.c @@ -2,6 +2,8 @@ #include "unicode_scripts_data.c" +#define MAX_ASCII 128 + inline script_t get_char_script(uint32_t ch) { if (ch > NUM_CODEPOINTS - 1) return SCRIPT_UNKNOWN; return char_scripts[ch]; @@ -21,6 +23,8 @@ string_script_t get_string_script(char *str, size_t len) { size_t script_len = 0; size_t idx = 0; + bool is_ascii = true; + while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, -1, &ch); @@ -36,6 +40,8 @@ string_script_t get_string_script(char *str, size_t len) { break; } + is_ascii = is_ascii && ch < MAX_ASCII; + ptr += char_len; idx += char_len; script_len += char_len; @@ -46,5 +52,5 @@ string_script_t get_string_script(char *str, size_t len) { } - return (string_script_t) {last_script, script_len}; + return (string_script_t) {last_script, script_len, is_ascii}; } \ No newline at end of file diff --git a/src/unicode_scripts.h b/src/unicode_scripts.h index e290f012..ba797eea 100644 --- a/src/unicode_scripts.h +++ b/src/unicode_scripts.h @@ -18,6 +18,7 @@ typedef struct script_languages { typedef struct string_script { script_t script; size_t len; + bool ascii; } string_script_t; script_t get_char_script(uint32_t ch);