[unicode] Adding wide char support for language disambiguation (comes up in venue names), despite the likelihood of running on a narrow Python build. Rolling back common script chars at a script break, so in the case of e.g. Cyrillic name (Latin name), the segmentation is done at the space before the paren.

2015-09-23 00:37:53 -04:00
parent 19e5457a0f
commit 8562c7a5cb
1 changed files with 8 additions and 5 deletions
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -159,17 +159,20 @@ MAX_ASCII = 127

 def get_string_script(s):
    s = safe_decode(s)
+    str_len = len(s)
    script = last_script = UNKNOWN_SCRIPT
    is_ascii = True
    script_len = 0
-    for c in s:
-        if (ord(c)) < len(char_scripts):
-            script = char_scripts[ord(c)]
-        else:
-            script = UNKNOWN_SCRIPT
+    for c in wide_iter(s):
+        script = char_scripts[wide_ord(c)]
+
        if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
            script = last_script
        if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
+            if (script_len < str_len):
+                for c in reversed(list(wide_iter(s[:script_len]))):
+                    if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
+                        script_len -= 1
            break
        is_ascii = is_ascii and ord(c) <= MAX_ASCII
        script_len += 1