Files
libpostal/scripts/geodata/i18n/scanner.py

38 lines
1.2 KiB
Python

import re
import os
import sys
# Add the directory two levels above this file to the import path so the
# ``geodata`` import below can be resolved. The path is anchored at this
# file's location rather than the current working directory, so the import
# does not depend on where the interpreter was started from.
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode
class Scanner(object):
    '''
    Simple scanner implementation in Python using regular expression groups.
    Used to create dynamic lexicons for parsing various CLDR files
    without compiling a C scanner. Only C scanners are used at runtime

    The lexicon is a sequence of (regex, response) pairs. Each regex is
    wrapped in its own capturing group and all of them are joined into one
    alternation, so at any position the earliest lexicon entry that can
    match is the one that is reported.
    '''

    def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
        '''
        Compile the lexicon into a single regular expression.

        :param lexicon: non-empty sequence of (regex, response) pairs. A
            response is either a plain value, which is yielded next to the
            matched text, or a callable invoked as response(match, token).
        :param flags: ``re`` flags used to compile the combined pattern.
        '''
        self.lexicon = lexicon
        regexes, responses = zip(*lexicon)
        patterns = [safe_decode(r) for r in regexes]
        self.regex = re.compile(u'|'.join(u'({})'.format(p) for p in patterns), flags)
        self.responses = responses

        # A pattern may contain capturing groups of its own, which shifts
        # the group number of every wrapper group after it. Record the
        # wrapper group number of each pattern so scan() can translate
        # match.lastindex back to a position in self.responses.
        self._group_to_response = {}
        group_number = 1
        for position, pattern in enumerate(patterns):
            self._group_to_response[group_number] = position
            try:
                inner_groups = re.compile(pattern, flags).groups
            except re.error:
                # The pattern only compiles as part of the combined regex
                # (e.g. it refers to a group outside itself); fall back to
                # assuming it adds no groups of its own.
                inner_groups = 0
            group_number += 1 + inner_groups

    def scan(self, s):
        '''
        Generate (token, response) pairs for each lexicon match found in s.

        Text that no lexicon pattern matches is skipped. For a non-callable
        response the matched text and the response are yielded directly.
        A callable response is invoked as response(match, token); it may
        return None to emit nothing, or an iterable of (response, token)
        pairs, each of which is yielded as (token, response).

        :param s: input to scan; it is passed through safe_decode first.
        '''
        for match in self.regex.finditer(safe_decode(s)):
            # The wrapper group is the last group to close in a match, so
            # lastindex identifies which lexicon entry matched.
            group_number = match.lastindex
            response = self.responses[self._group_to_response[group_number]]
            token = match.group(group_number)

            if not callable(response):
                yield (token, response)
                continue

            results = response(match, token)
            if results is None:
                continue
            for sub_response, sub_token in results:
                yield (sub_token, sub_response)