[unicode] Adding wide char support for language disambiguation (comes up in venue names), despite the likelihood of running on a narrow Python build. Rolling back common-script chars at a script break, so in the case of e.g. "Cyrillic name (Latin name)", the segmentation is done at the space before the paren.

This commit is contained in:
Al
2015-09-23 00:37:53 -04:00
parent 19e5457a0f
commit 8562c7a5cb

View File

@@ -159,17 +159,20 @@ MAX_ASCII = 127
def get_string_script(s): def get_string_script(s):
s = safe_decode(s) s = safe_decode(s)
str_len = len(s)
script = last_script = UNKNOWN_SCRIPT script = last_script = UNKNOWN_SCRIPT
is_ascii = True is_ascii = True
script_len = 0 script_len = 0
for c in s: for c in wide_iter(s):
if (ord(c)) < len(char_scripts): script = char_scripts[wide_ord(c)]
script = char_scripts[ord(c)]
else:
script = UNKNOWN_SCRIPT
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT: if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
script = last_script script = last_script
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT: if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
if (script_len < str_len):
for c in reversed(list(wide_iter(s[:script_len]))):
if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
script_len -= 1
break break
is_ascii = is_ascii and ord(c) <= MAX_ASCII is_ascii = is_ascii and ord(c) <= MAX_ASCII
script_len += 1 script_len += 1