# Origin: commit 7e057b0fb87502ef093dacce34fd007514e1e4a8
# Author: Al, Wed, 23 Sep 2015 00:42:48 -0400
# Subject: [utils] basic functions for wide char support for narrow Python
#          builds (unichr, ord, unicode iteration)
# Path: scripts/geodata/string_utils.py
"""Code-point helpers that also work on narrow Python builds.

On a narrow build, code points above ``sys.maxunicode`` cannot be held in
a single string element. The helpers here create, decode and iterate over
such characters as two-element surrogate pairs.
"""
import sys
from encoding import safe_decode

# Number of Unicode code points (0x0 through 0x10FFFF inclusive).
NUM_CODEPOINTS = 0x10FFFF + 1

# UTF-16 surrogate ranges and the first code point that needs a pair.
_HIGH_SURROGATE_MIN = 0xD800
_HIGH_SURROGATE_MAX = 0xDBFF
_LOW_SURROGATE_MIN = 0xDC00
_LOW_SURROGATE_MAX = 0xDFFF
_SUPPLEMENTARY_BASE = 0x10000


def _is_high_surrogate(c):
    """Return True if the single element *c* is a high (lead) surrogate."""
    return _HIGH_SURROGATE_MIN <= ord(c) <= _HIGH_SURROGATE_MAX


def _is_low_surrogate(c):
    """Return True if the single element *c* is a low (trail) surrogate."""
    return _LOW_SURROGATE_MIN <= ord(c) <= _LOW_SURROGATE_MAX


def wide_unichr(i):
    """Return the unicode string for code point *i*.

    Code points up to ``sys.maxunicode`` go straight through ``unichr``.
    Larger ones are built by decoding an 8-hex-digit unicode escape; on a
    narrow build this is expected to produce a two-element surrogate pair.
    """
    if i <= sys.maxunicode:
        return unichr(i)
    # The backslash is doubled so the literal is a plain backslash + "U" no
    # matter how the interpreter treats unknown escapes in this string type;
    # the 'unicode-escape' codec then interprets the full escape.
    return '\\U{0:08x}'.format(i).decode('unicode-escape')


def wide_ord(c):
    """Return the code point of *c*, or None if *c* is not one code point.

    *c* may be a single element (handled by ``ord``) or a two-element
    surrogate pair. Any other length, or a two-element value that is not a
    high surrogate followed by a low surrogate, gives None.
    """
    size = len(c)
    if size == 1:
        return ord(c)

    if size == 2:
        high, low = c
        # Only a well-formed pair encodes a code point; applying the
        # formula to arbitrary two-character input would return garbage.
        if _is_high_surrogate(high) and _is_low_surrogate(low):
            return (((ord(high) - _HIGH_SURROGATE_MIN) * 0x400) +
                    (ord(low) - _LOW_SURROGATE_MIN) + _SUPPLEMENTARY_BASE)

    return None


def wide_iter(s):
    """Yield the code points of *s* one at a time, as strings.

    *s* is first passed through ``safe_decode`` (from the project's
    ``encoding`` module; presumably it returns a unicode string -- confirm
    against that helper). A high surrogate immediately followed by a low
    surrogate is yielded together as one two-element string; every other
    element, including an unpaired surrogate, is yielded on its own.
    """
    s = safe_decode(s)
    length = len(s)
    pos = 0
    while pos < length:
        c = s[pos]
        nxt = pos + 1
        # Pair up only when the next element really is the trailing half,
        # so a stray high surrogate never swallows the character after it.
        if (_is_high_surrogate(c) and nxt < length and
                _is_low_surrogate(s[nxt])):
            yield s[pos:pos + 2]
            pos += 2
        else:
            yield c
            pos += 1