[utf8] Adding method to get the script of a string and the length of the span (rolls Common script up with the previous script)
This commit is contained in:
@@ -2,11 +2,47 @@
|
||||
|
||||
#include "unicode_scripts_data.c"
|
||||
|
||||
script_t get_char_script(uint32_t ch) {
|
||||
if (ch > NUM_CHARS - 1) return SCRIPT_UNKNOWN;
|
||||
inline script_t get_char_script(uint32_t ch) {
|
||||
if (ch > NUM_CODEPOINTS - 1) return SCRIPT_UNKNOWN;
|
||||
return char_scripts[ch];
|
||||
}
|
||||
|
||||
script_languages_t get_script_languages(script_t script) {
|
||||
/*
 * Fetch the script_languages table entry for a script.
 *
 * script: used directly as the index into the script_languages table;
 *         no range check is performed here.
 *
 * Returns the table entry by value.
 */
inline script_languages_t get_script_languages(script_t script) {
    script_languages_t entry = script_languages[script];

    return entry;
}
|
||||
|
||||
script_t string_script(char *str, size_t *len) {
|
||||
int32_t ch;
|
||||
script_t last_script = SCRIPT_UNKNOWN;
|
||||
script_t script = SCRIPT_UNKNOWN;
|
||||
|
||||
uint8_t *ptr = (uint8_t *)str;
|
||||
|
||||
*len = 0;
|
||||
|
||||
while (1) {
|
||||
ssize_t char_len = utf8proc_iterate(ptr, -1, &ch);
|
||||
|
||||
if (ch == 0) break;
|
||||
|
||||
script = get_char_script((uint32_t)ch);
|
||||
|
||||
if (script == SCRIPT_COMMON) {
|
||||
script = last_script;
|
||||
}
|
||||
|
||||
if (last_script != script && last_script != SCRIPT_UNKNOWN) {
|
||||
break;
|
||||
}
|
||||
|
||||
ptr += char_len;
|
||||
*len += char_len;
|
||||
|
||||
if (script != SCRIPT_UNKNOWN || script != SCRIPT_COMMON) {
|
||||
last_script = script;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return last_script;
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
#define UNICODE_SCRIPTS_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "utf8proc/utf8proc.h"
|
||||
#include "unicode_script_types.h"
|
||||
|
||||
typedef struct script_code {
|
||||
@@ -17,4 +18,6 @@ typedef struct script_languages {
|
||||
/* Look up the script of one code point; the definition returns SCRIPT_UNKNOWN for values past the end of its table. */
script_t get_char_script(uint32_t ch);
|
||||
/* Return the script_languages table entry for the given script. */
script_languages_t get_script_languages(script_t script);
|
||||
|
||||
/* Return the script of the leading run of str and store the run's length in bytes in *len. */
script_t string_script(char *str, size_t *len);
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user