From 5fa03587fbc1366d3e524a97c7b29c058db0eb1e Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 14 Apr 2015 15:49:24 -0400
Subject: [PATCH] [cldr] simple Python scanner for creating dynamic scanners for CLDR rule parsing

---
 scripts/geodata/i18n/scanner.py | 73 +++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 scripts/geodata/i18n/scanner.py

diff --git a/scripts/geodata/i18n/scanner.py b/scripts/geodata/i18n/scanner.py
new file mode 100644
--- /dev/null
+++ b/scripts/geodata/i18n/scanner.py
@@ -0,0 +1,73 @@
+import re
+import os
+import sys
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+# Resolve the import root (two levels up, i.e. scripts/) relative to this file
+# rather than the current working directory, so the import below works no
+# matter where the interpreter was started.
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.encoding import safe_decode
+
+
+class Scanner(object):
+    '''
+    Simple scanner implementation in Python using regular expression groups.
+    Used to create dynamic lexicons for parsing various CLDR files
+    without compiling a C scanner. Only C scanners are used at runtime.
+
+    A lexicon is a sequence of (regex, response) pairs. The regexes are
+    joined into one alternation, so earlier rules win when several could
+    match at the same position. A response is either a plain value, which
+    is yielded next to the matched text, or a callable taking (match, token)
+    and returning None or an iterable of (response, token) pairs.
+    '''
+
+    def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
+        '''
+        Compile the lexicon into a single regular expression.
+
+        lexicon: non-empty sequence of (regex, response) pairs
+        flags: re module flags applied to the combined expression
+        '''
+        self.lexicon = lexicon
+
+        regexes, responses = zip(*lexicon)
+        patterns = [safe_decode(r) for r in regexes]
+
+        self.regex = re.compile(u'|'.join([u'({})'.format(p) for p in patterns]), flags)
+        self.responses = responses
+
+        # Each rule is wrapped in its own group, but a rule may also contain
+        # capturing groups of its own, which shifts the numbering of every
+        # wrapper after it. Record which group number wraps which rule so
+        # scan() can map match.lastindex back to the right response.
+        self.group_rules = {}
+        group = 1
+        for rule, pattern in enumerate(patterns):
+            self.group_rules[group] = rule
+            group += 1 + re.compile(pattern, flags).groups
+
+    def scan(self, s):
+        '''
+        Generate (token, response) tuples for each lexicon match in s.
+
+        Text that matches none of the rules is skipped, since the matches
+        come from finditer.
+        '''
+        for match in self.regex.finditer(safe_decode(s)):
+            # The wrapping group of the winning rule is the last to close,
+            # so lastindex identifies the rule even with nested groups.
+            group = match.lastindex
+            response = self.responses[self.group_rules[group]]
+            token = match.group(group)
+            if not callable(response):
+                yield (token, response)
+                continue
+
+            results = response(match, token)
+            if results is not None:
+                # Callables return (response, token); emit (token, response)
+                for result_response, result_token in results:
+                    yield (result_token, result_response)