From 5e71a9d805d6f54fae728919b88f93fd12a03bd7 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 24 Jun 2015 13:29:40 -0500 Subject: [PATCH] [utf8] Adding method to get the script of a string and the length of the span (rolls Common script up with the previous script) --- src/unicode_scripts.c | 42 +++++++++++++++++++++++++++++++++++++++--- src/unicode_scripts.h | 3 +++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/unicode_scripts.c b/src/unicode_scripts.c index 878879ec..d7709b3f 100644 --- a/src/unicode_scripts.c +++ b/src/unicode_scripts.c @@ -2,11 +2,47 @@ #include "unicode_scripts_data.c" -script_t get_char_script(uint32_t ch) { - if (ch > NUM_CHARS - 1) return SCRIPT_UNKNOWN; +inline script_t get_char_script(uint32_t ch) { + if (ch > NUM_CODEPOINTS - 1) return SCRIPT_UNKNOWN; return char_scripts[ch]; } -script_languages_t get_script_languages(script_t script) { +inline script_languages_t get_script_languages(script_t script) { return script_languages[script]; +} + +script_t string_script(char *str, size_t *len) { + int32_t ch; + script_t last_script = SCRIPT_UNKNOWN; + script_t script = SCRIPT_UNKNOWN; + + uint8_t *ptr = (uint8_t *)str; + + *len = 0; + + while (1) { + ssize_t char_len = utf8proc_iterate(ptr, -1, &ch); + + if (ch == 0) break; + + script = get_char_script((uint32_t)ch); + + if (script == SCRIPT_COMMON) { + script = last_script; + } + + if (last_script != script && last_script != SCRIPT_UNKNOWN) { + break; + } + + ptr += char_len; + *len += char_len; + + if (script != SCRIPT_UNKNOWN || script != SCRIPT_COMMON) { + last_script = script; + } + + } + + return last_script; } \ No newline at end of file diff --git a/src/unicode_scripts.h b/src/unicode_scripts.h index 17919758..16ef2439 100644 --- a/src/unicode_scripts.h +++ b/src/unicode_scripts.h @@ -2,6 +2,7 @@ #define UNICODE_SCRIPTS_H #include +#include "utf8proc/utf8proc.h" #include "unicode_script_types.h" typedef struct script_code { @@ -17,4 +18,6 @@ typedef struct 
script_languages { script_t get_char_script(uint32_t ch); script_languages_t get_script_languages(script_t script); +script_t string_script(char *str, size_t *len); + #endif