Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,139 @@
import argparse
import csv
import os
import requests
from collections import Counter
from cStringIO import StringIO
from lxml import etree
from unicode_paths import CLDR_DIR
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language', 'countries')
CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
'supplementalData.xml')
ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'
REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')
def write_country_official_languages_file(xml, out_dir):
lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
lang_writer = csv.writer(lang_file, delimiter='\t')
def get_population_pct(lang):
return int(lang.attrib.get('populationPercent', 0))
lang_scripts = {}
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
territories = lang.get('territories')
if (language_code, None) not in lang_scripts:
lang_scripts[(language_code, None)] = scripts
if not territories:
continue
for territory in territories.strip().split():
lang_scripts[(language_code, territory.lower())] = scripts
for territory in xml.xpath('//territoryInfo/territory'):
country_code = territory.attrib['type'].lower()
if country_code == UNKNOWN_COUNTRY:
continue
langs = territory.xpath('languagePopulation')
languages = Counter()
official = set()
regional = set()
for lang in langs:
language = lang.attrib['type'].lower().split('_')[0]
official_status = lang.attrib.get('officialStatus')
languages[language] += float(lang.attrib['populationPercent'])
if official_status and official_status != REGIONAL:
official.add(language)
elif official_status == REGIONAL:
regional.add(language)
if official:
languages = Counter({l: c for l, c in languages.iteritems()
if l in official or l in regional})
else:
languages = Counter({l: c for l, c in languages.most_common(1)})
for lang, pct in languages.most_common():
if lang in UNKNOWN_LANGUAGES:
continue
script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))
lang_writer.writerow((country_code, lang, script.replace(' ', ','),
str(min(pct, 100.0)), str(int(lang in official))))
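# Illustrative only: rows in country_language.tsv have the columns
# (country, language, scripts, population_pct, is_official); the example
# rows below are hypothetical, not taken from CLDR:
#   fr  fr  Latn  100.0  1
#   ca  en  Latn  85.0   1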
RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'
def write_languages_file(langs, macro, out_dir):
lang_file = open(os.path.join(out_dir, ISO_LANGUAGES_FILENAME), 'w')
writer = csv.writer(lang_file, delimiter='\t')
writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
'ISO 639-1', 'type', 'macro'))
macro_reader = csv.reader(StringIO(macro), delimiter='\t')
headers = macro_reader.next()
assert len(headers) == 3
macros = {minor_code: macro_code for (macro_code, minor_code, status)
in macro_reader if status != RETIRED}
lang_reader = csv.reader(StringIO(langs), delimiter='\t')
headers = lang_reader.next()
assert headers[:6] == ['Id', 'Part2B', 'Part2T',
'Part1', 'Scope', 'Language_Type']
for line in lang_reader:
iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
macro = macros.get(iso639_3, '')
# Only living languages that are either individual or macro
if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
writer.writerow((iso639_3, iso639_2b, iso639_2t,
iso639_1, scope, macro))
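# For reference, iso-639-3.tab rows begin with fields like
#   aar  aar  aar  aa  I  L  Afar
# so the row written here for Afar would be ('aar', 'aar', 'aar', 'aa', 'I', ''),
# with an empty macro column since Afar is not part of a macrolanguage.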
def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
response = requests.get(ISO_639_3)
langs = response.content
response = requests.get(ISO_MACROLANGUAGES)
macro = response.content
write_languages_file(langs, macro, out_dir)
supplemental = open(CLDR_SUPPLEMENTAL_DATA)
xml = etree.parse(supplemental)
write_country_official_languages_file(xml, out_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out',
default=DEFAULT_LANGUAGES_DIR,
help='Out directory')
args = parser.parse_args()
fetch_cldr_languages(args.out)


@@ -0,0 +1,30 @@
import os
import shutil
import subprocess
import sys
import tempfile
from unicode_paths import CLDR_DIR
from geodata.file_utils import ensure_dir
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'
def download_cldr(temp_dir=None):
if os.path.exists(CLDR_DIR):
shutil.rmtree(CLDR_DIR)
ensure_dir(CLDR_DIR)
if not temp_dir:
temp_dir = tempfile.gettempdir()
cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])
subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
if __name__ == '__main__':
download_cldr(*sys.argv[1:])


@@ -0,0 +1,37 @@
import re
import requests
import six.moves.urllib_parse as urlparse
import ujson
requests.models.json = ujson
GOOGLE_I18N_API = 'http://i18napis.appspot.com'
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')
class GoogleI18N(object):
'''
Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
and caches the response per country in a dictionary, so at most ~250
requests (one per country) are made in a given run of a program. The
requests themselves are lightweight.
'''
def __init__(self):
self.responses = {}
def get(self, country_code):
ret = self.responses.get(country_code.lower())
if ret is None:
url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
response = requests.get(url)
if response.ok:
ret = response.json()
self.responses[country_code.lower()] = ret
else:
self.responses[country_code.lower()] = {}
return ret
google_i18n = GoogleI18N()
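# Usage sketch (requires network access; the response key shown is typical of
# the i18napis address data but is illustrative, not guaranteed):
#   data = google_i18n.get('gb')
#   if data:
#       print data.get('name')  # e.g. 'UNITED KINGDOM'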


@@ -0,0 +1,86 @@
import os
import csv
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import unicode_csv_reader
LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language')
country_languages = defaultdict(OrderedDict)
# Only official and de facto official, no official_regional
official_languages = defaultdict(OrderedDict)
regional_languages = defaultdict(OrderedDict)
road_language_overrides = defaultdict(OrderedDict)
languages = set()
all_languages = languages
osm_admin1_ids = set()
languages_initialized = False
def init_languages(languages_dir=LANGUAGES_DIR):
global languages_initialized
if languages_initialized:
return
path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
if not os.path.exists(path):
raise ValueError('File does not exist: {}'.format(path))
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
country_languages[country][lang] = int(is_official)
languages.add(lang)
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
if int(is_official) or len(country_languages[country]) == 1:
official_languages[country][lang] = 1
path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
for country, lang, default in csv.reader(open(path), delimiter='\t'):
road_language_overrides[country][lang] = int(default)
if lang not in languages:
languages.add(lang)
path = os.path.join(languages_dir, 'regional', 'adm1.tsv')
for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
if key == 'osm':
osm_admin1_ids.add(tuple(value.split(':')))
for lang in langs.split(','):
regional_languages[(country, key, value)][lang] = int(default)
if lang not in country_languages[country]:
country_languages[country][lang] = 0
if lang not in languages:
languages.add(lang)
languages_initialized = True
init_languages()
def get_country_languages(country, official=True, overrides=True):
if official:
languages = official_languages[country]
else:
languages = country_languages[country]
if overrides:
road_overrides = road_language_overrides.get(country)
if road_overrides and road_overrides.values()[0]:
languages = road_overrides
elif road_overrides:
languages.update(road_overrides)
return languages
def get_regional_languages(country, key, value):
return regional_languages.get((country, key, value), OrderedDict())
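# Usage sketch (return values are hypothetical; the real contents depend on
# the generated country_language.tsv and the override files):
#   get_country_languages('fr')                  # e.g. OrderedDict([('fr', 1)])
#   get_country_languages('ca', official=False)  # e.g. OrderedDict([('en', 1), ('fr', 1)])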


@@ -0,0 +1,5 @@
import unicodedata
def strip_accents(s):
return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
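# Example: strip_accents(u'Caf\xe9') == u'Cafe'. NFD decomposes the accented
# character into a base letter plus a combining mark (category 'Mn'), and the
# combining mark is then dropped.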


@@ -0,0 +1,37 @@
import re
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
class Scanner(object):
'''
Simple scanner implementation in Python using regular expression groups.
Used to create dynamic lexicons for parsing various CLDR files
without compiling a C scanner; only C scanners are used at runtime.
'''
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
self.lexicon = lexicon
regexes, responses = zip(*lexicon)
self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
self.responses = responses
def scan(self, s):
for match in self.regex.finditer(safe_decode(s)):
i = match.lastindex
response = self.responses[i - 1]
token = match.group(i)
if not callable(response):
yield (token, response)
else:
responses = response(match, token)
if responses is not None:
for response, token in responses:
yield (token, response)
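# Usage sketch (hypothetical lexicon, not one of the CLDR lexicons):
#   WORD, NUMBER = 'word', 'number'
#   scanner = Scanner([(r'[a-z]+', WORD), (r'[0-9]+', NUMBER)])
#   list(scanner.scan('abc 123'))  # [(u'abc', 'word'), (u'123', 'number')]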

File diff suppressed because one or more lines are too long


@@ -0,0 +1,273 @@
'''
unicode_data.py
---------------
Python's unicodedata module uses an outdated spec (Unicode 5.2) and since
e.g. unicode categories are used in tokenization, we'd like to keep this
as up-to-date as possible with the latest standard.
'''
import csv
import os
import re
import sys
from collections import defaultdict, namedtuple
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.file_utils import download_file
from geodata.string_utils import wide_unichr, wide_ord
from unicode_properties import *
from unicode_paths import UNICODE_DATA_DIR
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
unicode_categories = defaultdict(list)
unicode_blocks = defaultdict(list)
unicode_combining_classes = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)
unicode_properties = {}
unicode_script_ids = {}
unicode_blocks = {}
unicode_category_aliases = {}
unicode_property_aliases = {}
unicode_property_value_aliases = {}
unicode_word_breaks = {}
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
UNIDATA_FIELDS = [
'code',
'name',
'category',
'combining',
'bidi_category',
'decomp_mapping',
'decimal_value',
'digit_value',
'numeric_value',
'mirrored',
'unicode_1_name',
'comment',
'upper_mapping',
'lower_mapping',
'title_mapping',
]
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
def parse_unicode_data():
'''
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
'''
if not os.path.exists(LOCAL_UNIDATA_FILE):
download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
unidata_file = open(LOCAL_UNIDATA_FILE)
for line in csv.reader(unidata_file, delimiter=';'):
yield UnicodeDataRow(*line)
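# For reference, a UnicodeData.txt line looks like:
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# and maps positionally onto UNIDATA_FIELDS (code='0041', category='Lu', ...).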
def iter_unicode_combining_classes():
return unicode_combining_classes.iteritems()
def iter_unicode_categories():
return unicode_categories.iteritems()
def get_unicode_category(cat):
return unicode_categories[cat]
def get_unicode_combining_class(c):
return unicode_combining_classes[c]
def get_unicode_categories():
'''
Build dict of unicode categories e.g.
{
'Lu': ['A', 'B', 'C', ...]
'Ll': ['a', 'b', 'c', ...]
}
'''
categories = defaultdict(list)
for row in parse_unicode_data():
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
return dict(categories)
def get_unicode_combining_classes():
'''
Build dict of unicode combining classes e.g.
{
'0': ['\x00', '\x01', \x02', ...]
}
'''
combining_classes = defaultdict(list)
for row in parse_unicode_data():
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
return dict(combining_classes)
unicode_category_aliases = {
'letter': 'L',
'lower': 'Ll',
'lowercase': 'Ll',
'lowercaseletter': 'Ll',
'upper': 'Lu',
'uppercase': 'Lu',
'uppercaseletter': 'Lu',
'title': 'Lt',
'nonspacing mark': 'Mn',
'mark': 'M',
}
COMBINING_CLASS_PROP = 'canonical_combining_class'
BLOCK_PROP = 'block'
GENERAL_CATEGORY_PROP = 'general_category'
SCRIPT_PROP = 'script'
WORD_BREAK_PROP = 'word_break'
def init_unicode_categories():
'''
Initialize module-level dictionaries
'''
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
unicode_categories.update(get_unicode_categories())
unicode_combining_classes.update(get_unicode_combining_classes())
for key in unicode_categories.keys():
unicode_general_categories[key[0]].extend(unicode_categories[key])
script_chars = get_chars_by_script()
for i, script in enumerate(script_chars):
if script:
unicode_scripts[script.lower()].append(wide_unichr(i))
unicode_scripts = dict(unicode_scripts)
unicode_script_ids.update(build_master_scripts_list(script_chars))
unicode_blocks.update(get_unicode_blocks())
unicode_properties.update(get_unicode_properties())
unicode_property_aliases.update(get_property_aliases())
unicode_word_breaks.update(get_word_break_properties())
for key, value in get_property_value_aliases().iteritems():
key = unicode_property_aliases.get(key, key)
if key == GENERAL_CATEGORY_PROP:
for k, v in value.iteritems():
k = k.lower()
unicode_category_aliases[k] = v
if '_' in k:
unicode_category_aliases[k.replace('_', '')] = v
unicode_property_value_aliases[key] = value
regex_chars = re.compile('([\[\]\{\}\-\^])')
def replace_regex_chars(s):
return regex_chars.sub(r'\\\1', s)
def format_regex_char(i):
c = wide_unichr(i)
return replace_regex_chars(c.encode('unicode-escape'))
def make_char_set_regex(chars):
'''
Build a regex character set from a list of characters
'''
group_start = None
group_end = None
last_ord = -2
ords = map(wide_ord, chars)
ords.sort()
ords.append(None)
groups = []
for i, o in enumerate(ords):
if o is not None and o == last_ord + 1:
group_end = o
elif group_start is not None and group_end is not None:
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
group_end = None
group_start = o
elif group_start is not None and group_end is None:
groups.append(format_regex_char(group_start))
group_start = o
else:
group_start = o
last_ord = o
return u'[{}]'.format(u''.join(groups))
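# Worked example: make_char_set_regex([u'a', u'b', u'c', u'x']) returns
# u'[a-cx]': consecutive codepoints collapse into ranges, and regex
# metacharacters like ']' or '-' are backslash-escaped.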
name_category = [
('control_chars', 'Cc'),
('other_format_chars', 'Cf'),
('other_not_assigned_chars', 'Cn'),
('other_private_use_chars', 'Co'),
('other_surrogate_chars', 'Cs'),
('letter_lower_chars', 'Ll'),
('letter_modifier_chars', 'Lm'),
('letter_other_chars', 'Lo'),
('letter_title_chars', 'Lt'),
('letter_upper_chars', 'Lu'),
('mark_spacing_combining_chars', 'Mc'),
('mark_enclosing_chars', 'Me'),
('mark_nonspacing_chars', 'Mn'),
('number_or_digit_chars', 'Nd'),
('number_letter_chars', 'Nl'),
('number_other_chars', 'No'),
('punct_connector_chars', 'Pc'),
('punct_dash_chars', 'Pd'),
('punct_close_chars', 'Pe'),
('punct_final_quote_chars', 'Pf'),
('punct_initial_quote_chars', 'Pi'),
('punct_other_chars', 'Po'),
('punct_open_chars', 'Ps'),
('currency_symbol_chars', 'Sc'),
('symbol_modifier_chars', 'Sk'),
('symbol_math_chars', 'Sm'),
('symbol_other_chars', 'So'),
('separator_line_chars', 'Zl'),
('separator_paragraph_chars', 'Zp'),
('space', 'Zs'),
]
def main():
init_unicode_categories()
for name, cat in name_category:
if cat not in unicode_categories:
continue
chars = unicode_categories[cat]
print u'{} = {};'.format(name, make_char_set_regex(chars))
if __name__ == '__main__':
main()


@@ -0,0 +1,11 @@
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')
UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')
CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')


@@ -0,0 +1,463 @@
'''
scripts.py
This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''
import csv
import os
import re
import requests
import subprocess
import sys
import tempfile
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree
from operator import itemgetter
from zipfile import ZipFile
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import *
from download_cldr import download_cldr
from languages import get_country_languages
from unicode_paths import UNICODE_DATA_DIR
from word_breaks import script_regex, regex_char_range
SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')
BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')
PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')
WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H
#include <stdlib.h>
#define NUM_CODEPOINTS {num_codepoints}
#define MAX_LANGS {max_langs}
typedef enum {{
{script_enum}
NUM_SCRIPTS
}} script_t;
#endif
'''
scripts_c_data_template = u'''#include "{header_name}"

script_t char_scripts[] = {{
{char_scripts}
}};
script_code_t script_codes[] = {{
{script_codes}
}};
script_languages_t script_languages[] = {{
{script_languages}
}};
'''
script_code_template = '{{SCRIPT_{name}, "{code}"}}'
script_language_template = '{{{num_langs}, {languages}}}'
def unicode_to_integer(u):
return int('0x{}'.format(u), 16)
def script_name_constant(i, u):
return u'SCRIPT_{} = {}'.format(u.upper(), i)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
def parse_char_range(r):
return [unicode_to_integer(u) for u in r.split('..')]
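# e.g. parse_char_range('0041..005A') == [65, 90]; a single codepoint such as
# '00AD' yields a one-element list.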
def get_chars_by_script():
scripts_file = open(LOCAL_SCRIPTS_FILE)
scripts = [None] * NUM_CODEPOINTS
# Lines look like:
# 0041..005A ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
for char_range, script, char_class in script_regex.findall(scripts_file.read()):
script_range = parse_char_range(char_range)
if len(script_range) == 2:
for i in xrange(script_range[0], script_range[1] + 1):
scripts[i] = script
elif script_range:
scripts[script_range[0]] = script
return scripts
COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'
def parse_file(f):
for line in f:
line = line.split(COMMENT_CHAR)[0].strip()
if not line:
continue
tokens = line.split(DELIMITER_CHAR)
if tokens:
yield [t.strip() for t in tokens]
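# e.g. the Blocks.txt line '0000..007F; Basic Latin' yields
# ['0000..007F', 'Basic Latin']; comment-only and blank lines are skipped.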
def get_property_aliases():
prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)
aliases = {}
for line in parse_file(prop_aliases_file):
prop = line[1]
prop_aliases = [line[0]] + line[2:]
for alias in prop_aliases:
aliases[alias.lower()] = prop.lower()
return aliases
def get_property_value_aliases():
prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)
value_aliases = defaultdict(dict)
for line in parse_file(prop_value_aliases_file):
prop = line[0]
if prop not in ('ccc', 'gc'):
value = line[2]
aliases = [line[1]] + line[3:]
else:
value = line[1]
aliases = line[2:]
for alias in aliases:
value_aliases[prop.lower()][alias] = value
return dict(value_aliases)
def get_unicode_blocks():
blocks_file = open(LOCAL_BLOCKS_FILE)
blocks = defaultdict(list)
for line in parse_file(blocks_file):
char_range, block = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(wide_unichr(i))
elif char_range:
blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks)
def get_unicode_properties():
props_file = open(LOCAL_PROPS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props)
def get_word_break_properties():
props_file = open(LOCAL_WORD_BREAKS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(wide_unichr(i))
elif char_range:
props[prop].append(wide_unichr(char_range[0]))
return dict(props)
def build_master_scripts_list(chars):
all_scripts = OrderedDict.fromkeys(filter(bool, chars))
for i, script in enumerate(all_scripts.keys()):
all_scripts[script] = i + 1
# Unknown script for all characters not covered
all_scripts[UNKNOWN_SCRIPT] = 0
return all_scripts
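# e.g. if Common, Latin and Greek are the first scripts encountered (in that
# order), the result maps Common -> 1, Latin -> 2, Greek -> 3, ... with
# Unknown -> 0 reserved for uncovered codepoints.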
SCRIPT_ALIASES_SUPPLEMENTAL = {
'Hant': 'Han',
'Hans': 'Han'
}
def get_script_codes(all_scripts):
if not os.path.exists(LOCAL_ISO_15924_FILE):
temp_dir = tempfile.gettempdir()
script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])
# This comes as a .zip
script_codes_response = requests.get(ISO_15924_URL)
zf = ZipFile(StringIO(script_codes_response.content))
iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
# Strip out the comments, etc.
temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
if line.strip() and not line.strip().startswith('#')])
f = open(LOCAL_ISO_15924_FILE, 'w')
f.write(safe_encode(temp_iso15924_file))
f.close()
script_codes_file = open(LOCAL_ISO_15924_FILE)
script_codes = {}
seen_scripts = set()
# Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
if name in all_scripts:
script_codes[code] = name
seen_scripts.add(name)
else:
normalized_name = name.split('(')[0].strip()
if normalized_name in all_scripts and normalized_name not in seen_scripts:
script_codes[code] = normalized_name
seen_scripts.add(normalized_name)
value_aliases = get_property_value_aliases()
script_aliases = value_aliases['sc']
for code, script in script_aliases.iteritems():
if code not in script_codes and script in all_scripts:
script_codes[code] = script
script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
return script_codes
SCRIPT_CODE_ALIASES = {
'Jpan': ['Hani', 'Hira', 'Kana'],
'Kore': ['Hang', 'Han']
}
def extract_language_scripts(xml):
language_scripts = defaultdict(list)
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
for script in scripts.split():
script_aliases = SCRIPT_CODE_ALIASES.get(script)
if not script_aliases:
language_scripts[language_code].append(script)
else:
language_scripts[language_code].extend(script_aliases)
return language_scripts
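# languageData entries in supplementalData.xml look roughly like
# (abridged, attribute values illustrative):
#   <language type="sr" scripts="Cyrl Latn" territories="RS BA"/>
# which would map 'sr' to ['Cyrl', 'Latn'] here.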
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
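# e.g. list(batch_iter(range(5), 2)) == [[0, 1], [2, 3], [4]]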
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single-language scripts to inform
# the language classifier.
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml)
country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
if not os.path.exists(country_languages_path):
fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)
country_language_file = open(country_languages_path)
country_language_reader = csv.reader(country_language_file, delimiter='\t')
countries = set([country for country, lang, script, pct, is_official
in country_language_reader])
spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))
script_code_languages = defaultdict(list)
for language, scripts in language_scripts.iteritems():
if language not in spoken_languages:
continue
for script in scripts:
script_code_languages[script].append(language)
script_languages = defaultdict(list)
for script_code, script_name in script_codes.iteritems():
langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages
def main(out_dir=SRC_DIR):
# Output is a C header and data file, see templates
out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
download_file(PROPS_URL, LOCAL_PROPS_FILE)
download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages()
max_langs = 0
for script, langs in script_languages.iteritems():
num_langs = len(langs)
if num_langs > max_langs:
max_langs = num_langs
# Generate C header and constants
script_enum = u'''
'''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])
out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
max_langs=max_langs,
script_enum=script_enum))
out_header.close()
# Generate C data file
char_scripts_data = u''',
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])
sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]
script_language_data = u''',
'''.join([script_language_template.format(num_langs=len(langs),
languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
for langs in sorted_lang_scripts])
out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
char_scripts=char_scripts_data,
script_codes=script_codes_data,
script_languages=script_language_data))
out_file.close()
if __name__ == '__main__':
main(*sys.argv[1:])


@@ -0,0 +1,140 @@
'''
word_breaks.py
This script automatically builds ranges of Unicode characters from the
Unicode spec's word-break properties. These ranges help us build a
tokenizer that does the right thing in every language with regard to
word segmentation. The lines output by this script can be pasted into
scanner.re before compilation.
'''
import requests
from collections import defaultdict
import re
# Operate on WordBreakProperty.txt file
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
# Operate on Scripts.txt file
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
'han',
'hiragana',
'hangul',
'tibetan',
'thai',
'lao',
'javanese',
'balinese',
'yi',
])
def regex_char_range(match):
r = match.split('..')
# Wide version
return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])
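# e.g. regex_char_range('0041..005A') returns the literal text '\u0041-\u005a'
# (codepoints above 0xFFFF use the 8-digit \U form), ready to paste into a
# character class in scanner.re.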
def get_letter_range(text, *regexes):
char_ranges = []
for line in text.split('\n'):
for regex in regexes:
m = regex.match(line)
if m:
char_ranges.append(regex_char_range(m.group(1)))
return char_ranges
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if script.lower() in scripts and char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
def get_char_class(text, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')
def get_hangul_syllable_ranges(text):
char_ranges = defaultdict(list)
for line in text.split('\n'):
m = hangul_syllable_type_regex.match(line)
if m:
char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
return dict(char_ranges)
name_funcs = [
('hebrew_letter_chars', hebrew_letter_regex),
('format_chars', format_regex),
('extend_chars', extend_regex),
('katakana_chars', katakana_regex),
('letter_other_alpha_chars', other_alpha_letter_regex),
('mid_letter_chars', mid_letter_regex),
('mid_number_chars', mid_number_regex),
('mid_num_letter_chars', mid_num_letter_regex),
('numeric_chars', numeric_regex),
('extend_num_letter_chars', extend_num_letter_regex),
]
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L*', re.I)
def main():
''' Insert these lines into scanner.re '''
response = requests.get(WORD_BREAK_PROPERTIES_URL)
if response.ok:
for name, reg in name_funcs:
s = get_letter_range(response.content, reg)
print '{} = [{}];'.format(name, ''.join(s))
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
if response.ok:
syllable_ranges = get_hangul_syllable_ranges(response.content)
for name, ranges in syllable_ranges.iteritems():
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
response = requests.get(SCRIPTS_URL)
if response.ok:
s = ''.join(get_char_class(response.content, numbers_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))
s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))
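# The printed lines take the form 'name = [ranges];', e.g. (ranges shown are
# illustrative, not the actual values):
#   katakana_chars = [\u30a1-\u30fa\u31f0-\u31ff];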
if __name__ == '__main__':
main()