[unicode] Adding wide char support for language disambiguation (comes up in venue names), despite the likelihood of running on a narrow Python build. Rolling back common script chars at a script break, so in the case of e.g. Cyrillic name (Latin name), the segmentation is done at the space before the paren.
This commit is contained in:
@@ -159,17 +159,20 @@ MAX_ASCII = 127
|
|||||||
|
|
||||||
def get_string_script(s):
|
def get_string_script(s):
|
||||||
s = safe_decode(s)
|
s = safe_decode(s)
|
||||||
|
str_len = len(s)
|
||||||
script = last_script = UNKNOWN_SCRIPT
|
script = last_script = UNKNOWN_SCRIPT
|
||||||
is_ascii = True
|
is_ascii = True
|
||||||
script_len = 0
|
script_len = 0
|
||||||
for c in s:
|
for c in wide_iter(s):
|
||||||
if (ord(c)) < len(char_scripts):
|
script = char_scripts[wide_ord(c)]
|
||||||
script = char_scripts[ord(c)]
|
|
||||||
else:
|
|
||||||
script = UNKNOWN_SCRIPT
|
|
||||||
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
|
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
|
||||||
script = last_script
|
script = last_script
|
||||||
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
|
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
|
||||||
|
if (script_len < str_len):
|
||||||
|
for c in reversed(list(wide_iter(s[:script_len]))):
|
||||||
|
if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
|
||||||
|
script_len -= 1
|
||||||
break
|
break
|
||||||
is_ascii = is_ascii and ord(c) <= MAX_ASCII
|
is_ascii = is_ascii and ord(c) <= MAX_ASCII
|
||||||
script_len += 1
|
script_len += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user