[utf8] Adding method to get the script of a string and the length of the span (rolls Common script up with the previous script)

This commit is contained in:
Al
2015-06-24 13:29:40 -05:00
parent 85348e1178
commit 5e71a9d805
2 changed files with 42 additions and 3 deletions

View File

@@ -2,11 +2,47 @@
#include "unicode_scripts_data.c" #include "unicode_scripts_data.c"
/* Map a code point to its script by indexing the char_scripts table.
 * Code points above the table's last index yield SCRIPT_UNKNOWN.
 * (Side-by-side diff: the left column is the previous text, the right column
 * the new text, which adds `inline` and bounds against NUM_CODEPOINTS
 * instead of NUM_CHARS.) */
script_t get_char_script(uint32_t ch) { inline script_t get_char_script(uint32_t ch) {
if (ch > NUM_CHARS - 1) return SCRIPT_UNKNOWN; if (ch > NUM_CODEPOINTS - 1) return SCRIPT_UNKNOWN;
return char_scripts[ch]; return char_scripts[ch];
} }
/* Return the script_languages table entry for `script`.
 * The value is used directly as the array index; no range check is performed
 * here. (Side-by-side diff: the right column adds `inline`.) */
script_languages_t get_script_languages(script_t script) { inline script_languages_t get_script_languages(script_t script) {
return script_languages[script]; return script_languages[script];
}
/*
 * Find the script of the leading single-script run of a UTF-8 string.
 *
 * Code points are decoded from the start of `str` for as long as they belong
 * to one script. A SCRIPT_COMMON code point is folded into the script that
 * preceded it, so it never ends a run by itself. Code points seen before any
 * script has been established (SCRIPT_UNKNOWN, or SCRIPT_COMMON at the very
 * start) are counted as part of the run.
 *
 * Scanning stops at the NUL terminator, at the first code point whose script
 * differs from the run's script, or at the first byte sequence that
 * utf8proc_iterate fails to decode.
 *
 * str: NUL-terminated UTF-8 input; NULL is treated as an empty string.
 * len: output, set to the number of bytes covered by the run (0 if empty).
 *
 * Returns the script of the run, or SCRIPT_UNKNOWN if none was established.
 */
script_t string_script(char *str, size_t *len) {
    int32_t ch = 0;
    script_t last_script = SCRIPT_UNKNOWN;
    script_t script = SCRIPT_UNKNOWN;
    uint8_t *ptr = (uint8_t *)str;

    *len = 0;

    if (str == NULL) return SCRIPT_UNKNOWN;

    while (1) {
        ssize_t char_len = utf8proc_iterate(ptr, -1, &ch);

        /* A non-positive length means the bytes could not be decoded;
         * advancing by it would walk the cursor backwards and wrap *len. */
        if (char_len <= 0 || ch == 0) break;

        script = get_char_script((uint32_t)ch);

        /* Common characters inherit the script of whatever came before. */
        if (script == SCRIPT_COMMON) {
            script = last_script;
        }

        /* A different script after one has been established ends the run;
         * the offending code point is not counted. */
        if (last_script != script && last_script != SCRIPT_UNKNOWN) {
            break;
        }

        ptr += char_len;
        *len += (size_t)char_len;

        /* Only a concrete script may become the run's script. */
        if (script != SCRIPT_UNKNOWN && script != SCRIPT_COMMON) {
            last_script = script;
        }
    }

    return last_script;
}

View File

@@ -2,6 +2,7 @@
#define UNICODE_SCRIPTS_H #define UNICODE_SCRIPTS_H
#include <stdlib.h> #include <stdlib.h>
#include "utf8proc/utf8proc.h"
#include "unicode_script_types.h" #include "unicode_script_types.h"
typedef struct script_code { typedef struct script_code {
@@ -17,4 +18,6 @@ typedef struct script_languages {
script_t get_char_script(uint32_t ch); script_t get_char_script(uint32_t ch);
script_languages_t get_script_languages(script_t script); script_languages_t get_script_languages(script_t script);
script_t string_script(char *str, size_t *len);
#endif #endif