From 25ae5bed33ee23089b184627238c4645a810d3ba Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 11 Jan 2016 16:39:15 -0500 Subject: [PATCH] [unicode] Adding SCRIPT_INHERITED as a common script so diacritics like COMBINING CEDILLA don't break the current script and produce false word breaks --- src/unicode_scripts.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/unicode_scripts.c b/src/unicode_scripts.c index 8bd1ad90..c52f8fe9 100644 --- a/src/unicode_scripts.c +++ b/src/unicode_scripts.c @@ -13,6 +13,10 @@ inline script_languages_t get_script_languages(script_t script) { return script_languages[script]; } +inline bool is_common_script(script_t script) { + return script == SCRIPT_COMMON || script == SCRIPT_INHERITED; +} + string_script_t get_string_script(char *str, size_t len) { int32_t ch; script_t last_script = SCRIPT_UNKNOWN; @@ -32,18 +36,18 @@ string_script_t get_string_script(char *str, size_t len) { script = get_char_script((uint32_t)ch); - if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) { + if (is_common_script(script) && last_script != SCRIPT_UNKNOWN) { script = last_script; } - if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) { + if (last_script != script && last_script != SCRIPT_UNKNOWN && !is_common_script(last_script)) { if (script_len < len) { while (true) { char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch); if (ch == 0) break; script = get_char_script((uint32_t)ch); - if (script != SCRIPT_COMMON) { + if (!is_common_script(script)) { break; }