From 8562c7a5cbcd2a199c887fc28d7fbdd135dc7646 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 23 Sep 2015 00:37:53 -0400 Subject: [PATCH] [unicode] Adding wide char support for language disambiguation (comes up in venue names), despite the likelihood of running on a narrow Python build. Rolling back common script chars at a script break, so in the case of e.g. Cyrillic name (Latin name), the segmentation is done at the space before the paren. --- scripts/geodata/language_id/disambiguation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index c01e7fb5..d87f3431 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -159,17 +159,20 @@ MAX_ASCII = 127 def get_string_script(s): s = safe_decode(s) + str_len = len(s) script = last_script = UNKNOWN_SCRIPT is_ascii = True script_len = 0 - for c in s: - if (ord(c)) < len(char_scripts): - script = char_scripts[ord(c)] - else: - script = UNKNOWN_SCRIPT + for c in wide_iter(s): + script = char_scripts[wide_ord(c)] + if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT: script = last_script if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT: + if (script_len < str_len): + for c in reversed(list(wide_iter(s[:script_len]))): + if char_scripts[wide_ord(c)] == COMMON_SCRIPT: + script_len -= 1 break is_ascii = is_ascii and ord(c) <= MAX_ASCII script_len += 1