From 88bd0cd158d660796747a451a4d14507e06100f0 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 23 Sep 2015 04:06:34 -0400 Subject: [PATCH] [unicode] better segmentation on script breaks --- src/unicode_scripts.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/unicode_scripts.c b/src/unicode_scripts.c index ddbba825..8bd1ad90 100644 --- a/src/unicode_scripts.c +++ b/src/unicode_scripts.c @@ -37,6 +37,22 @@ string_script_t get_string_script(char *str, size_t len) { } if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) { + if (script_len < len) { + while (true) { + char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch); + if (ch == 0) break; + + script = get_char_script((uint32_t)ch); + if (script != SCRIPT_COMMON) { + break; + } + + script_len -= char_len; + ptr -= char_len; + idx -= char_len; + } + } + break; }