Files
libpostal/src/unicode_scripts.c
2015-07-25 13:49:42 -04:00

56 lines
1.3 KiB
C

#include "unicode_scripts.h"
#include "unicode_scripts_data.c"
/* Exclusive upper bound used for the ASCII test: a code point counts as ASCII when it is below this value. */
#define MAX_ASCII 128
/*
 * Look up the script assigned to a single code point.
 *
 * Any code point greater than NUM_CODEPOINTS - 1 is reported as
 * SCRIPT_UNKNOWN; otherwise the value stored in char_scripts at that
 * index is returned.
 */
inline script_t get_char_script(uint32_t ch) {
    return (ch > NUM_CODEPOINTS - 1) ? SCRIPT_UNKNOWN : char_scripts[ch];
}
/*
 * Fetch the script_languages table entry for the given script.
 *
 * The script value is used directly as the table index; no range check
 * is performed here, so the caller must pass a valid script.
 */
inline script_languages_t get_script_languages(script_t script) {
    script_languages_t entry = script_languages[script];
    return entry;
}
/*
 * Identify the script of the leading run of characters in a UTF-8 buffer.
 *
 * Characters are decoded one at a time starting at str. The scan stops at
 * the first of:
 *   - the end of the buffer (len bytes consumed),
 *   - a NUL code point,
 *   - a byte sequence utf8proc_iterate cannot decode (malformed or
 *     truncated UTF-8),
 *   - a character whose script differs from the script already established
 *     for the run (this includes SCRIPT_UNKNOWN characters following a
 *     known script).
 *
 * SCRIPT_COMMON characters take on the script of the characters before them
 * once any script has been seen, so they never terminate a run on their own.
 *
 * Parameters:
 *   str - start of the UTF-8 data; at most len bytes are examined.
 *   len - number of bytes available at str.
 *
 * Returns a string_script_t initialized, in order, with: the script of the
 * run (SCRIPT_UNKNOWN if none was determined), the number of bytes the run
 * occupies, and whether every code point in the run is below MAX_ASCII.
 */
string_script_t get_string_script(char *str, size_t len) {
    int32_t ch;
    script_t last_script = SCRIPT_UNKNOWN;
    script_t script = SCRIPT_UNKNOWN;

    uint8_t *ptr = (uint8_t *)str;

    /* Bytes consumed so far; doubles as the offset of the next character. */
    size_t script_len = 0;

    bool is_ascii = true;

    while (script_len < len) {
        /* Limit the decoder to the bytes that remain so a sequence truncated
         * at the end of the buffer is rejected instead of read past it. */
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - script_len), &ch);

        /* A non-positive length signals a decoding error; advancing by it
         * would move the pointer backwards and wrap the byte counter. */
        if (char_len <= 0 || ch == 0) break;

        script = get_char_script((uint32_t)ch);

        /* Common characters (punctuation, digits, ...) inherit the script
         * of what came before them. */
        if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) {
            script = last_script;
        }

        /* A concrete script is already established and this character does
         * not belong to it: the run ends before this character. */
        if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) {
            break;
        }

        is_ascii = is_ascii && ch < MAX_ASCII;

        ptr += char_len;
        script_len += (size_t)char_len;

        /* Unknown characters never overwrite an already observed script. */
        if (script != SCRIPT_UNKNOWN) {
            last_script = script;
        }
    }

    return (string_script_t) {last_script, script_len, is_ascii};
}