Initial fork commit
0
scripts/geodata/i18n/__init__.py
Normal file
139
scripts/geodata/i18n/cldr_languages.py
Normal file
@@ -0,0 +1,139 @@
import argparse
import csv
import os
import requests

from collections import Counter

from cStringIO import StringIO
from lxml import etree

from unicode_paths import CLDR_DIR

this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                     'resources', 'language', 'countries')

CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
                                      'supplementalData.xml')

ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'

ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'

REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')


def write_country_official_languages_file(xml, out_dir):
    lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
    lang_writer = csv.writer(lang_file, delimiter='\t')

    def get_population_pct(lang):
        return int(lang.attrib.get('populationPercent', 0))

    lang_scripts = {}
    for lang in xml.xpath('//languageData/language'):
        language_code = lang.attrib['type'].lower()
        scripts = lang.get('scripts')
        if not scripts:
            continue
        territories = lang.get('territories')
        if (language_code, None) not in lang_scripts:
            lang_scripts[(language_code, None)] = scripts

        if not territories:
            continue
        for territory in territories.strip().split():
            lang_scripts[(language_code, territory.lower())] = scripts

    for territory in xml.xpath('//territoryInfo/territory'):
        country_code = territory.attrib['type'].lower()
        if country_code == UNKNOWN_COUNTRY:
            continue
        langs = territory.xpath('languagePopulation')
        languages = Counter()
        official = set()
        regional = set()
        for lang in langs:
            language = lang.attrib['type'].lower().split('_')[0]
            official_status = lang.attrib.get('officialStatus')
            languages[language] += float(lang.attrib['populationPercent'])
            if official_status and official_status != REGIONAL:
                official.add(language)
            elif official_status == REGIONAL:
                regional.add(language)

        if official:
            languages = Counter({l: c for l, c in languages.iteritems()
                                 if l in official or l in regional})
        else:
            languages = Counter({l: c for l, c in languages.most_common(1)})

        for lang, pct in languages.most_common():
            if lang in UNKNOWN_LANGUAGES:
                continue

            script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))

            lang_writer.writerow((country_code, lang, script.replace(' ', ','),
                                  str(min(pct, 100.0)), str(int(lang in official))))

RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'


def write_languages_file(langs, macro, out_dir):
    lang_file = open(os.path.join(out_dir, 'iso_languages.tsv'), 'w')
    writer = csv.writer(lang_file, delimiter='\t')
    writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
                     'ISO 639-1', 'type', 'macro'))

    macro_reader = csv.reader(StringIO(macro), delimiter='\t')
    headers = macro_reader.next()
    assert len(headers) == 3
    macros = {minor_code: macro_code for (macro_code, minor_code, status)
              in macro_reader if status != RETIRED}

    lang_reader = csv.reader(StringIO(langs), delimiter='\t')
    headers = lang_reader.next()
    assert headers[:6] == ['Id', 'Part2B', 'Part2T',
                           'Part1', 'Scope', 'Language_Type']

    for line in lang_reader:
        iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
        macro = macros.get(iso639_3, '')
        # Only living languages that are either individual or macro
        if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
            writer.writerow((iso639_3, iso639_2b, iso639_2t,
                             iso639_1, scope, macro))


def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
    response = requests.get(ISO_639_3)
    langs = response.content

    response = requests.get(ISO_MACROLANGUAGES)
    macro = response.content
    write_languages_file(langs, macro, out_dir)

    supplemental = open(CLDR_SUPPLEMENTAL_DATA)
    xml = etree.parse(supplemental)
    write_country_official_languages_file(xml, out_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--out',
                        default=DEFAULT_LANGUAGES_DIR,
                        help='Out directory')
    args = parser.parse_args()

    fetch_cldr_languages(args.out)
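The TSV written by write_country_official_languages_file drives the language lookups in languages.py below. A minimal sketch of reading those rows back (column layout taken from the writerow call above; the path is relative to the repo root and the values are illustrative):

import csv

def read_country_languages(path='resources/language/countries/country_language.tsv'):
    # Columns: country code, language, comma-joined scripts, population pct, is_official flag
    for country, lang, scripts, pct, is_official in csv.reader(open(path), delimiter='\t'):
        yield country, lang, scripts.split(','), float(pct), bool(int(is_official))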
30
scripts/geodata/i18n/download_cldr.py
Normal file
@@ -0,0 +1,30 @@
import os
import shutil
import subprocess
import sys
import tempfile

from unicode_paths import CLDR_DIR
from geodata.file_utils import ensure_dir

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'


def download_cldr(temp_dir=None):
    if os.path.exists(CLDR_DIR):
        shutil.rmtree(CLDR_DIR)
    ensure_dir(CLDR_DIR)

    if not temp_dir:
        temp_dir = tempfile.gettempdir()

    cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])

    subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
    subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])

if __name__ == '__main__':
    download_cldr(*sys.argv[1:])
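A small usage sketch (the staging directory path is illustrative; wget and unzip must be on the PATH, as the subprocess calls above assume):

from download_cldr import download_cldr

# Re-fetch CLDR core data into resources/unicode/cldr, staging the zip in a chosen directory
download_cldr(temp_dir='/tmp/cldr_staging')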
37
scripts/geodata/i18n/google.py
Normal file
@@ -0,0 +1,37 @@
import re
import requests
import six.moves.urllib_parse as urlparse
import ujson

requests.models.json = ujson


GOOGLE_I18N_API = 'http://i18napis.appspot.com'
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')


class GoogleI18N(object):
    '''
    Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
    and caches it in a dictionary keyed by country. These requests are
    lightweight, so at most ~250 requests (one per country code) will be
    made in a given run of a program.
    '''
    def __init__(self):
        self.responses = {}

    def get(self, country_code):
        ret = self.responses.get(country_code.lower())

        if ret is None:
            url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
            response = requests.get(url)
            if response.ok:
                ret = response.json()
                self.responses[country_code.lower()] = ret
            else:
                self.responses[country_code.lower()] = {}
        return ret


google_i18n = GoogleI18N()
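A usage sketch of the shared instance (assumes the module is importable as google from scripts/geodata/i18n):

from google import google_i18n

gb = google_i18n.get('gb')         # first call fetches http://i18napis.appspot.com/address/data/GB
gb_cached = google_i18n.get('GB')  # lookups are case-insensitive; served from the in-memory cache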
86
scripts/geodata/i18n/languages.py
Normal file
@@ -0,0 +1,86 @@
import os
import csv
import sys

from collections import defaultdict, OrderedDict

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.csv_utils import unicode_csv_reader

LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                             'resources', 'language')

country_languages = defaultdict(OrderedDict)
# Only official and de facto official, no official_regional
official_languages = defaultdict(OrderedDict)

regional_languages = defaultdict(OrderedDict)
road_language_overrides = defaultdict(OrderedDict)

languages = set()
all_languages = languages

osm_admin1_ids = set()

languages_initialized = False


def init_languages(languages_dir=LANGUAGES_DIR):
    global languages_initialized
    if languages_initialized:
        return
    path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
    if not os.path.exists(path):
        raise ValueError('File does not exist: {}'.format(path))

    for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
        country_languages[country][lang] = int(is_official)
        languages.add(lang)

    for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
        if int(is_official) or len(country_languages[country]) == 1:
            official_languages[country][lang] = 1

    path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
    for country, lang, default in csv.reader(open(path), delimiter='\t'):
        road_language_overrides[country][lang] = int(default)
        if lang not in languages:
            languages.add(lang)

    path = os.path.join(languages_dir, 'regional', 'adm1.tsv')

    for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
        if key == 'osm':
            osm_admin1_ids.add(tuple(value.split(':')))
        for lang in langs.split(','):
            regional_languages[(country, key, value)][lang] = int(default)
            if lang not in country_languages[country]:
                country_languages[country][lang] = 0
            if lang not in languages:
                languages.add(lang)

    languages_initialized = True


init_languages()


def get_country_languages(country, official=True, overrides=True):
    if official:
        languages = official_languages[country]
    else:
        languages = country_languages[country]

    if overrides:
        road_overrides = road_language_overrides.get(country)
        if road_overrides and road_overrides.values()[0]:
            languages = road_overrides
        elif road_overrides:
            languages.update(road_overrides)
    return languages


def get_regional_languages(country, key, value):
    return regional_languages.get((country, key, value), OrderedDict())
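A sketch of the lookup helpers above (the country codes and the admin-1 identifier are illustrative, not taken from the data files):

from languages import get_country_languages, get_regional_languages

official = get_country_languages('fr')                    # OrderedDict of official languages
spoken = get_country_languages('fr', official=False)      # includes non-official languages
regional = get_regional_languages('es', 'osm', '349044')  # empty OrderedDict if no entry exists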
5
scripts/geodata/i18n/normalize.py
Normal file
@@ -0,0 +1,5 @@
import unicodedata


def strip_accents(s):
    return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
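For example, NFD decomposition splits off combining marks (category Mn), which the list comprehension then drops:

from normalize import strip_accents

assert strip_accents(u'caf\xe9') == u'cafe'
assert strip_accents(u'S\xe3o Paulo') == u'Sao Paulo'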
37
scripts/geodata/i18n/scanner.py
Normal file
@@ -0,0 +1,37 @@
import re
import os
import sys

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.encoding import safe_decode

class Scanner(object):
    '''
    Simple scanner implementation in Python using regular expression groups.
    Used to create dynamic lexicons for parsing various CLDR files
    without compiling a C scanner. Only C scanners are used at runtime
    '''

    def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
        self.lexicon = lexicon

        regexes, responses = zip(*lexicon)

        self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
        self.responses = responses

    def scan(self, s):

        for match in self.regex.finditer(safe_decode(s)):
            i = match.lastindex
            response = self.responses[i - 1]
            token = match.group(i)
            if not callable(response):
                yield (token, response)
            else:
                responses = response(match, token)
                if responses is not None:
                    for response, token in responses:
                        yield (token, response)
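A minimal sketch of driving Scanner with a hypothetical two-rule lexicon (responses can also be callables, as scan above shows):

from scanner import Scanner

scanner = Scanner([
    ('[0-9]+', 'NUMBER'),
    ('[a-z]+', 'WORD'),
])

print list(scanner.scan(u'42 main st'))
# [(u'42', 'NUMBER'), (u'main', 'WORD'), (u'st', 'WORD')]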
1680
scripts/geodata/i18n/transliteration_rules.py
Normal file
File diff suppressed because one or more lines are too long
273
scripts/geodata/i18n/unicode_data.py
Normal file
@@ -0,0 +1,273 @@
'''
unicode_data.py
---------------

Python's unicodedata module uses an outdated spec (Unicode 5.2) and since
e.g. unicode categories are used in tokenization, we'd like to keep this
as up-to-date as possible with the latest standard.
'''
import csv
import os
import sys
from collections import defaultdict, namedtuple

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.file_utils import download_file
from geodata.string_utils import wide_unichr, wide_ord

from unicode_properties import *

from unicode_paths import UNICODE_DATA_DIR

UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'

UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')

unicode_categories = defaultdict(list)
unicode_blocks = defaultdict(list)
unicode_combining_classes = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)
unicode_properties = {}

unicode_script_ids = {}

unicode_blocks = {}
unicode_category_aliases = {}
unicode_property_aliases = {}
unicode_property_value_aliases = {}
unicode_word_breaks = {}


# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
UNIDATA_FIELDS = [
    'code',
    'name',
    'category',
    'combining',
    'bidi_category',
    'decomp_mapping',
    'decimal_value',
    'digit_value',
    'numeric_value',
    'mirrored',
    'unicode_1_name',
    'comment',
    'upper_mapping',
    'lower_mapping',
    'title_mapping',
]

UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))


def parse_unicode_data():
    '''
    Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
    '''
    if not os.path.exists(LOCAL_UNIDATA_FILE):
        download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
    unidata_file = open(LOCAL_UNIDATA_FILE)

    for line in csv.reader(unidata_file, delimiter=';'):
        yield UnicodeDataRow(*line)


def iter_unicode_combining_classes():
    return unicode_combining_classes.iteritems()


def iter_unicode_categories():
    return unicode_categories.iteritems()


def get_unicode_category(cat):
    return unicode_categories[cat]


def get_unicode_combining_class(c):
    return unicode_combining_classes[c]


def get_unicode_categories():
    '''
    Build dict of unicode categories e.g.

    {
        'Lu': ['A', 'B', 'C', ...]
        'Ll': ['a', 'b', 'c', ...]
    }
    '''
    categories = defaultdict(list)
    for row in parse_unicode_data():
        categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
    return dict(categories)


def get_unicode_combining_classes():
    '''
    Build dict of unicode combining classes e.g.

    {
        '0': ['\x00', '\x01', '\x02', ...]
    }
    '''
    combining_classes = defaultdict(list)
    for row in parse_unicode_data():
        combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
    return dict(combining_classes)

unicode_category_aliases = {
    'letter': 'L',
    'lower': 'Ll',
    'lowercase': 'Ll',
    'lowercaseletter': 'Ll',
    'upper': 'Lu',
    'uppercase': 'Lu',
    'uppercaseletter': 'Lu',
    'title': 'Lt',
    'nonspacing mark': 'Mn',
    'mark': 'M',
}

COMBINING_CLASS_PROP = 'canonical_combining_class'
BLOCK_PROP = 'block'
GENERAL_CATEGORY_PROP = 'general_category'
SCRIPT_PROP = 'script'
WORD_BREAK_PROP = 'word_break'


def init_unicode_categories():
    '''
    Initialize module-level dictionaries
    '''
    global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
    global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
    global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks

    unicode_categories.update(get_unicode_categories())
    unicode_combining_classes.update(get_unicode_combining_classes())

    for key in unicode_categories.keys():
        unicode_general_categories[key[0]].extend(unicode_categories[key])

    script_chars = get_chars_by_script()
    for i, script in enumerate(script_chars):
        if script:
            unicode_scripts[script.lower()].append(wide_unichr(i))

    unicode_scripts = dict(unicode_scripts)

    unicode_script_ids.update(build_master_scripts_list(script_chars))

    unicode_blocks.update(get_unicode_blocks())
    unicode_properties.update(get_unicode_properties())
    unicode_property_aliases.update(get_property_aliases())

    unicode_word_breaks.update(get_word_break_properties())

    for key, value in get_property_value_aliases().iteritems():
        key = unicode_property_aliases.get(key, key)
        if key == GENERAL_CATEGORY_PROP:
            for k, v in value.iteritems():
                k = k.lower()
                unicode_category_aliases[k] = v
                if '_' in k:
                    unicode_category_aliases[k.replace('_', '')] = v

        unicode_property_value_aliases[key] = value


regex_chars = re.compile('([\[\]\{\}\-\^])')


def replace_regex_chars(s):
    return regex_chars.sub(r'\\\1', s)


def format_regex_char(i):
    c = wide_unichr(i)
    return replace_regex_chars(c.encode('unicode-escape'))


def make_char_set_regex(chars):
    '''
    Build a regex character set from a list of characters
    '''
    group_start = None
    group_end = None
    last_ord = -2

    ords = map(wide_ord, chars)
    ords.sort()

    ords.append(None)

    groups = []

    for i, o in enumerate(ords):
        if o is not None and o == last_ord + 1:
            group_end = o
        elif group_start is not None and group_end is not None:
            groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
            group_end = None
            group_start = o
        elif group_start is not None and group_end is None:
            groups.append(format_regex_char(group_start))
            group_start = o
        else:
            group_start = o

        last_ord = o

    return u'[{}]'.format(u''.join(groups))


name_category = [
    ('control_chars', 'Cc'),
    ('other_format_chars', 'Cf'),
    ('other_not_assigned_chars', 'Cn'),
    ('other_private_use_chars', 'Co'),
    ('other_surrogate_chars', 'Cs'),
    ('letter_lower_chars', 'Ll'),
    ('letter_modifier_chars', 'Lm'),
    ('letter_other_chars', 'Lo'),
    ('letter_title_chars', 'Lt'),
    ('letter_upper_chars', 'Lu'),
    ('mark_spacing_combining_chars', 'Mc'),
    ('mark_enclosing_chars', 'Me'),
    ('mark_nonspacing_chars', 'Mn'),
    ('number_or_digit_chars', 'Nd'),
    ('number_letter_chars', 'Nl'),
    ('number_other_chars', 'No'),
    ('punct_connector_chars', 'Pc'),
    ('punct_dash_chars', 'Pd'),
    ('punct_close_chars', 'Pe'),
    ('punct_final_quote_chars', 'Pf'),
    ('punct_initial_quote_chars', 'Pi'),
    ('punct_other_chars', 'Po'),
    ('punct_open_chars', 'Ps'),
    ('currency_symbol_chars', 'Sc'),
    ('symbol_modifier_chars', 'Sk'),
    ('symbol_math_chars', 'Sm'),
    ('symbol_other_chars', 'So'),
    ('separator_line_chars', 'Zl'),
    ('separator_paragraph_chars', 'Zp'),
    ('space', 'Zs'),
]


def main():
    init_unicode_categories()
    for name, cat in name_category:
        if cat not in unicode_categories:
            continue
        chars = unicode_categories[cat]
        print u'{} = {};'.format(name, make_char_set_regex(chars))


if __name__ == '__main__':
    main()
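Typical use is to populate the module-level tables once and then index them; a sketch (the first run downloads UnicodeData.txt and the property files):

import unicode_data

unicode_data.init_unicode_categories()
uppercase = unicode_data.unicode_categories['Lu']   # every Lu codepoint, as characters
latin = unicode_data.unicode_scripts['latin']       # characters whose script property is Latin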
11
scripts/geodata/i18n/unicode_paths.py
Normal file
@@ -0,0 +1,11 @@
import os
import sys

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')

UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')

CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')
463
scripts/geodata/i18n/unicode_properties.py
Normal file
@@ -0,0 +1,463 @@
'''
unicode_properties.py

This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''

import csv
import os
import requests
import re
import sys
import tempfile
import subprocess

from cStringIO import StringIO

from collections import OrderedDict, defaultdict
from itertools import islice

from lxml import etree

from operator import itemgetter

from zipfile import ZipFile

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr

from cldr_languages import *
from download_cldr import download_cldr
from languages import get_country_languages
from unicode_paths import UNICODE_DATA_DIR
from word_breaks import script_regex, regex_char_range

SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')

SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')

BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')

PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')

WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')

SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'

SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'

ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'

scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H

#include <stdlib.h>

#define NUM_CODEPOINTS {num_codepoints}
#define MAX_LANGS {max_langs}

typedef enum {{
    {script_enum}
    NUM_SCRIPTS
}} script_t;

#endif
'''

scripts_c_data_template = u'''
script_t char_scripts[] = {{
    {char_scripts}
}};

script_code_t script_codes[] = {{
    {script_codes}
}};

script_languages_t script_languages[] = {{
    {script_languages}
}};
'''

script_code_template = '{{SCRIPT_{name}, "{code}"}}'

script_language_template = '{{{num_langs}, {languages}}}'


def unicode_to_integer(u):
    return int('0x{}'.format(u), 16)


def script_name_constant(i, u):
    return u'SCRIPT_{} = {}'.format(u.upper(), i)


UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'


def parse_char_range(r):
    return [unicode_to_integer(u) for u in r.split('..')]


def get_chars_by_script():
    scripts_file = open(LOCAL_SCRIPTS_FILE)
    scripts = [None] * NUM_CODEPOINTS

    # Lines look like:
    # 0041..005A    ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    for char_range, script, char_class in script_regex.findall(scripts_file.read()):
        script_range = parse_char_range(char_range)
        if len(script_range) == 2:
            for i in xrange(script_range[0], script_range[1] + 1):
                scripts[i] = script
        elif script_range:
            scripts[script_range[0]] = script

    return scripts


COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'


def parse_file(f):
    for line in f:
        line = line.split(COMMENT_CHAR)[0].strip()
        if not line:
            continue
        tokens = line.split(DELIMITER_CHAR)
        if tokens:
            yield [t.strip() for t in tokens]


def get_property_aliases():
    prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)

    aliases = {}

    for line in parse_file(prop_aliases_file):
        prop = line[1]
        prop_aliases = [line[0]] + line[2:]

        for alias in prop_aliases:
            aliases[alias.lower()] = prop.lower()

    return aliases


def get_property_value_aliases():
    prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)

    value_aliases = defaultdict(dict)

    for line in parse_file(prop_value_aliases_file):
        prop = line[0]
        if prop not in ('ccc', 'gc'):
            value = line[2]
            aliases = [line[1]] + line[3:]
        else:
            value = line[1]
            aliases = line[2:]

        for alias in aliases:
            value_aliases[prop.lower()][alias] = value

    return dict(value_aliases)


def get_unicode_blocks():
    blocks_file = open(LOCAL_BLOCKS_FILE)

    blocks = defaultdict(list)

    for line in parse_file(blocks_file):
        char_range, block = line
        char_range = parse_char_range(char_range)

        if len(char_range) == 2:
            for i in xrange(char_range[0], char_range[1] + 1):
                blocks[block.lower()].append(wide_unichr(i))
        elif char_range:
            blocks[block.lower()].append(wide_unichr(char_range[0]))

    return dict(blocks)


def get_unicode_properties():
    props_file = open(LOCAL_PROPS_FILE)

    props = defaultdict(list)

    for line in parse_file(props_file):
        char_range, prop = line

        char_range = parse_char_range(char_range)

        if len(char_range) == 2:
            for i in xrange(char_range[0], char_range[1] + 1):
                props[prop.lower()].append(wide_unichr(i))
        elif char_range:
            props[prop.lower()].append(wide_unichr(char_range[0]))

    derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
    for line in parse_file(derived_props_file):
        char_range, prop = line
        char_range = parse_char_range(char_range)

        if len(char_range) == 2:
            for i in xrange(char_range[0], char_range[1] + 1):
                props[prop.lower()].append(wide_unichr(i))
        elif char_range:
            props[prop.lower()].append(wide_unichr(char_range[0]))

    return dict(props)


def get_word_break_properties():
    props_file = open(LOCAL_WORD_BREAKS_FILE)

    props = defaultdict(list)

    for line in parse_file(props_file):
        char_range, prop = line

        char_range = parse_char_range(char_range)

        if len(char_range) == 2:
            for i in xrange(char_range[0], char_range[1] + 1):
                props[prop].append(wide_unichr(i))
        elif char_range:
            props[prop].append(wide_unichr(char_range[0]))

    return dict(props)


def build_master_scripts_list(chars):
    all_scripts = OrderedDict.fromkeys(filter(bool, chars))

    for i, script in enumerate(all_scripts.keys()):
        all_scripts[script] = i + 1

    # Unknown script for all characters not covered
    all_scripts[UNKNOWN_SCRIPT] = 0

    return all_scripts


SCRIPT_ALIASES_SUPPLEMENTAL = {
    'Hant': 'Han',
    'Hans': 'Han'
}


def get_script_codes(all_scripts):

    if not os.path.exists(LOCAL_ISO_15924_FILE):
        temp_dir = tempfile.gettempdir()

        script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])

        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        zf = ZipFile(StringIO(script_codes_response.content))
        iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]

        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
                                         if line.strip() and not line.strip().startswith('#')])

        f = open(LOCAL_ISO_15924_FILE, 'w')
        f.write(safe_encode(temp_iso15924_file))
        f.close()

    script_codes_file = open(LOCAL_ISO_15924_FILE)

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
    for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
        if name in all_scripts:
            script_codes[code] = name
            seen_scripts.add(name)
        else:
            normalized_name = name.split('(')[0].strip()
            if normalized_name in all_scripts and normalized_name not in seen_scripts:
                script_codes[code] = normalized_name
                seen_scripts.add(normalized_name)

    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']

    for code, script in script_aliases.iteritems():
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script

    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)

    return script_codes


SCRIPT_CODE_ALIASES = {
    'Jpan': ['Hani', 'Hira', 'Kana'],
    'Kore': ['Hang', 'Han']
}


def extract_language_scripts(xml):
    language_scripts = defaultdict(list)

    for lang in xml.xpath('//languageData/language'):
        language_code = lang.attrib['type'].lower()
        scripts = lang.get('scripts')
        if not scripts:
            continue
        for script in scripts.split():
            script_aliases = SCRIPT_CODE_ALIASES.get(script)
            if not script_aliases:
                language_scripts[language_code].append(script)
            else:
                language_scripts[language_code].extend(script_aliases)

    return language_scripts


def batch_iter(iterable, batch_size):
    source_iter = iter(iterable)
    while True:
        batch = list(islice(source_iter, batch_size))
        if len(batch) > 0:
            yield batch
        else:
            return


def get_script_languages():
    # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
    # to identify the language. We keep track of those single language scripts to inform
    # the language classifier

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)

    cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
    cldr_xml = etree.parse(cldr_supplemental_data)
    language_scripts = extract_language_scripts(cldr_xml)

    country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
    if not os.path.exists(country_languages_path):
        fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)

    country_language_file = open(country_languages_path)
    country_language_reader = csv.reader(country_language_file, delimiter='\t')

    countries = set([country for country, lang, script, pct, is_official
                     in country_language_reader])

    spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))

    script_code_languages = defaultdict(list)
    for language, scripts in language_scripts.iteritems():
        if language not in spoken_languages:
            continue
        for script in scripts:
            script_code_languages[script].append(language)

    script_languages = defaultdict(list)

    for script_code, script_name in script_codes.iteritems():
        langs = script_code_languages.get(script_code, [])
        script_languages[script_name].extend(langs)

    for name in all_scripts.iterkeys():
        script_languages.setdefault(name, [])

    return script_languages


def main(out_dir=SRC_DIR):
    # Output is a C header and data file, see templates
    out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
    out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')

    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)

    script_languages = get_script_languages()

    max_langs = 0

    for script, langs in script_languages.iteritems():
        num_langs = len(langs)
        if num_langs > max_langs:
            max_langs = num_langs

    # Generate C header and constants

    script_enum = u'''
    '''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])

    out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
                                                    max_langs=max_langs,
                                                    script_enum=script_enum))
    out_header.close()

    # Generate C data file

    char_scripts_data = u''',
    '''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])

    script_codes_data = u''',
    '''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])

    sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]

    script_language_data = u''',
    '''.join([script_language_template.format(num_langs=len(langs),
                                              languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
              for langs in sorted_lang_scripts])

    out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
                                                  char_scripts=char_scripts_data,
                                                  script_codes=script_codes_data,
                                                  script_languages=script_language_data))
    out_file.close()


if __name__ == '__main__':
    main(*sys.argv[1:])
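A worked example of the range parsing the generators above rely on: a Scripts.txt entry such as '0041..005A ; Latin' expands as follows (interpreter sketch):

from unicode_properties import unicode_to_integer, parse_char_range

assert unicode_to_integer('0041') == 0x41
assert parse_char_range('0041..005A') == [0x41, 0x5a]   # A..Z, an inclusive range
assert parse_char_range('00AA') == [0xaa]               # single-codepoint entry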
140
scripts/geodata/i18n/word_breaks.py
Normal file
@@ -0,0 +1,140 @@
'''
word_breaks.py

This script is used to automatically build ranges of unicode characters
from the unicode spec's word break properties. These ranges help us
build a tokenizer that does the right thing in every language with regard
to word segmentation. The lines output by this script can be pasted
into scanner.re before compilation.
'''

import requests
from collections import defaultdict
import re

# Operate on WordBreakProperty.txt file
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')

# Operate on Scripts.txt file
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')

script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')

WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'

ideographic_scripts = set([
    'han',
    'hiragana',
    'hangul',
    'tibetan',
    'thai',
    'lao',
    'javanese',
    'balinese',
    'yi',
])


def regex_char_range(match):
    r = match.split('..')
    # Wide version
    return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])


def get_letter_range(text, *regexes):
    char_ranges = []
    for line in text.split('\n'):
        for regex in regexes:
            m = regex.match(line)
            if m:
                char_ranges.append(regex_char_range(m.group(1)))
    return char_ranges


def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
    char_ranges = []
    for char_range, script, char_class in script_regex.findall(text):
        if script.lower() in scripts and char_class_regex.match(char_class):
            char_ranges.append(regex_char_range(char_range))
    return char_ranges


def get_char_class(text, char_class_regex):
    char_ranges = []
    for char_range, script, char_class in script_regex.findall(text):
        if char_class_regex.match(char_class):
            char_ranges.append(regex_char_range(char_range))
    return char_ranges


hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')


def get_hangul_syllable_ranges(text):
    char_ranges = defaultdict(list)
    for line in text.split('\n'):
        m = hangul_syllable_type_regex.match(line)
        if m:
            char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
    return dict(char_ranges)


name_funcs = [
    ('hebrew_letter_chars', hebrew_letter_regex),
    ('format_chars', format_regex),
    ('extend_chars', extend_regex),
    ('katakana_chars', katakana_regex),
    ('letter_other_alpha_chars', other_alpha_letter_regex),
    ('mid_letter_chars', mid_letter_regex),
    ('mid_number_chars', mid_number_regex),
    ('mid_num_letter_chars', mid_num_letter_regex),
    ('numeric_chars', numeric_regex),
    ('extend_num_letter_chars', extend_num_letter_regex),
]

IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'

numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L*', re.I)


def main():
    ''' Insert these lines into scanner.re '''
    response = requests.get(WORD_BREAK_PROPERTIES_URL)

    if response.ok:
        for name, reg in name_funcs:
            s = get_letter_range(response.content, reg)
            print '{} = [{}];'.format(name, ''.join(s))

    response = requests.get(HANGUL_SYLLABLE_TYPES_URL)

    if response.ok:
        syllable_ranges = get_hangul_syllable_ranges(response.content)
        for name, ranges in syllable_ranges.iteritems():
            print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))

    response = requests.get(SCRIPTS_URL)
    if response.ok:
        s = ''.join(get_char_class(response.content, numbers_regex))

        print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))

        s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
        print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))


if __name__ == '__main__':
    main()
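A worked example of regex_char_range, which turns UCD codepoint ranges into the wide escape syntax pasted into scanner.re:

from word_breaks import regex_char_range

print regex_char_range('0041..005A')   # \u0041-\u005a
print regex_char_range('1F300')        # \U0001f300 (codepoints above 0xFFFF use the 8-digit form)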