[cldr] simple Python scanner for creating dynamic scanners for CLDR rule parsing

2015-04-14 15:49:24 -04:00
parent efdcbc9eef
commit 5fa03587fb
1 changed files with 37 additions and 0 deletions
--- a/scripts/geodata/i18n/scanner.py
+++ b/scripts/geodata/i18n/scanner.py
@@ -0,0 +1,37 @@
+import re
+import os
+import sys
+
+this_dir = os.path.realpath(os.path.dirname(__file__))  # scripts/geodata/i18n
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))  # scripts/ root, independent of cwd
+
+from geodata.encoding import safe_decode
+
+class Scanner(object):
+    '''
+    Simple regex scanner over a lexicon of (regex, response) pairs, used to
+    parse CLDR files without compiling a C scanner; only C ones run at runtime.
+    '''
+
+    def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
+        self.lexicon = lexicon
+        regexes, self.responses = zip(*lexicon)
+        patterns = [safe_decode(r) for r in regexes]
+        self.regex = re.compile(u'|'.join([u'({})'.format(p) for p in patterns]), flags)
+        # Capturing groups inside a rule shift the numbering of every later
+        # rule, so key each response by the number of its rule's wrapper group.
+        self._by_group, group = {}, 1
+        for pattern, response in zip(patterns, self.responses):
+            self._by_group[group] = response
+            group += 1 + re.compile(pattern, flags).groups
+
+    def scan(self, s):
+        '''Yield (token, response) per match in s. A callable response is called
+        with (match, token) and returns (response, token) pairs, or None.'''
+        for match in self.regex.finditer(safe_decode(s)):
+            token, response = match.group(match.lastindex), self._by_group[match.lastindex]
+            if not callable(response):
+                yield (token, response)
+            else:
+                for response, token in response(match, token) or ():
+                    yield (token, response)