[utf8] Adding method to get the script of a string and the length of the span (rolls Common script up with the previous script)
This commit is contained in:
@@ -2,11 +2,47 @@
|
|||||||
|
|
||||||
#include "unicode_scripts_data.c"
|
#include "unicode_scripts_data.c"
|
||||||
|
|
||||||
/*
 * get_char_script: look up the script of code point `ch` in the char_scripts
 * table. A value above the last table index returns SCRIPT_UNKNOWN instead of
 * indexing the table.
 *
 * The rows below come from a side-by-side diff view; each pair appears to be
 * the earlier revision first (bound NUM_CHARS) and the later revision second
 * (adds `inline`, bound NUM_CODEPOINTS).
 * NOTE(review): the bound is written as `N - 1`; presumably N is the length
 * of char_scripts and is non-zero -- confirm in unicode_scripts_data.c.
 */
script_t get_char_script(uint32_t ch) {
|
inline script_t get_char_script(uint32_t ch) {
|
||||||
if (ch > NUM_CHARS - 1) return SCRIPT_UNKNOWN;
|
if (ch > NUM_CODEPOINTS - 1) return SCRIPT_UNKNOWN;
|
||||||
return char_scripts[ch];
|
return char_scripts[ch];
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * get_script_languages: return the script_languages table entry for `script`.
 * The value is returned by copy (script_languages_t, not a pointer).
 *
 * No range check is performed on `script` before it is used as an index.
 * NOTE(review): presumably every script_t value is a valid index into
 * script_languages -- confirm against the generated data file.
 *
 * The rows below come from a side-by-side diff view; the second signature row
 * is the one that adds `inline`.
 */
script_languages_t get_script_languages(script_t script) {
|
inline script_languages_t get_script_languages(script_t script) {
|
||||||
return script_languages[script];
|
return script_languages[script];
|
||||||
|
}
|
||||||
|
|
||||||
|
script_t string_script(char *str, size_t *len) {
|
||||||
|
int32_t ch;
|
||||||
|
script_t last_script = SCRIPT_UNKNOWN;
|
||||||
|
script_t script = SCRIPT_UNKNOWN;
|
||||||
|
|
||||||
|
uint8_t *ptr = (uint8_t *)str;
|
||||||
|
|
||||||
|
*len = 0;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
ssize_t char_len = utf8proc_iterate(ptr, -1, &ch);
|
||||||
|
|
||||||
|
if (ch == 0) break;
|
||||||
|
|
||||||
|
script = get_char_script((uint32_t)ch);
|
||||||
|
|
||||||
|
if (script == SCRIPT_COMMON) {
|
||||||
|
script = last_script;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last_script != script && last_script != SCRIPT_UNKNOWN) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ptr += char_len;
|
||||||
|
*len += char_len;
|
||||||
|
|
||||||
|
if (script != SCRIPT_UNKNOWN || script != SCRIPT_COMMON) {
|
||||||
|
last_script = script;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return last_script;
|
||||||
}
|
}
|
||||||
@@ -2,6 +2,7 @@
|
|||||||
#define UNICODE_SCRIPTS_H
|
#define UNICODE_SCRIPTS_H
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include "utf8proc/utf8proc.h"
|
||||||
#include "unicode_script_types.h"
|
#include "unicode_script_types.h"
|
||||||
|
|
||||||
typedef struct script_code {
|
typedef struct script_code {
|
||||||
@@ -17,4 +18,6 @@ typedef struct script_languages {
|
|||||||
script_t get_char_script(uint32_t ch);
|
script_t get_char_script(uint32_t ch);
|
||||||
script_languages_t get_script_languages(script_t script);
|
script_languages_t get_script_languages(script_t script);
|
||||||
|
|
||||||
|
script_t string_script(char *str, size_t *len);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user