[transliteration] converting one of the more complicated and frequently used rules to its utf8proc equivalent, adding better support for escaped unicode characters and set differences, and generating a header file indicating which unicode script/language pairs warrant various transliterators.

@@ -88,6 +88,7 @@ NFD = 'NFD'
 NFKD = 'NFKD'
 NFC = 'NFC'
 NFKC = 'NFKC'
+STRIP_MARK = 'STRIP_MARK'

 LOWER = 'lower'
 UPPER = 'upper'
@@ -98,6 +99,7 @@ UNICODE_NORMALIZATION_TRANSFORMS = set([
     NFKD,
     NFC,
     NFKC,
+    STRIP_MARK,
 ])

 unicode_category_aliases = {
@@ -120,6 +122,8 @@ unicode_general_categories = defaultdict(list)
 unicode_scripts = defaultdict(list)
 unicode_properties = {}

+unicode_script_ids = {}
+
 unicode_blocks = {}
 unicode_category_aliases = {}
 unicode_property_aliases = {}
@@ -140,9 +144,9 @@ class TransliterationParseError(Exception):
 def init_unicode_categories():
     global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
     global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
-    global unicode_property_value_aliases, unicode_scripts, unicode_word_breaks
+    global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks

-    for i in xrange(65536):
+    for i in xrange(NUM_CHARS):
         unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))
         unicode_combining_classes[str(unicodedata.combining(unichr(i)))].append(unichr(i))
@@ -161,6 +165,8 @@ def init_unicode_categories():

     unicode_scripts = dict(unicode_scripts)

+    unicode_script_ids.update(build_master_scripts_list(script_chars))
+
     unicode_blocks.update(get_unicode_blocks())
     unicode_properties.update(get_unicode_properties())
     unicode_property_aliases.update(get_property_aliases())
@@ -226,10 +232,8 @@ unicode_property_regexes = [
     ('logical_order_exception', '[เ-ไ ເ-ໄ ꪵ ꪶ ꪹ ꪻ ꪼ]'),
 ]

-char_set_map = {
-    '[^[:ccc=Not_Reordered:][:ccc=Above:]]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
-    '[[:^ccc=0:] & [:^ccc=230:]]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
-    '[^\p{ccc=0}\p{ccc=above}]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
+rule_map = {
+    u'[:Latin:] { [:Mn:]+ → ;': ':: {}'.format(STRIP_MARK)
 }

 unicode_properties = {}
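Note: the replaced ICU rule "[:Latin:] { [:Mn:]+ → ;" deletes nonspacing marks, which is exactly what utf8proc's STRIP_MARK option does natively. A minimal Python 2 sketch of the equivalent behavior, using unicodedata as a stand-in for utf8proc:

    import unicodedata

    def strip_marks(s):
        # decompose, drop nonspacing marks (category Mn), recompose
        nfd = unicodedata.normalize('NFD', s)
        return unicodedata.normalize('NFC', u''.join(c for c in nfd if unicodedata.category(c) != 'Mn'))

    assert strip_marks(u'caf\xe9') == u'cafe'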
@@ -277,6 +281,8 @@ QUOTED_STRING = 'QUOTED_STRING'
 SINGLE_QUOTE = 'SINGLE_QUOTE'
 HTML_ENTITY = 'HTML_ENTITY'
 SINGLE_QUOTE = 'SINGLE_QUOTE'
+UNICODE_CHARACTER = 'UNICODE_CHARACTER'
+UNICODE_WIDE_CHARACTER = 'UNICODE_WIDE_CHARACTER'
 ESCAPED_CHARACTER = 'ESCAPED_CHARACTER'

@@ -302,6 +308,8 @@ rule_scanner = Scanner([
 # Scanner for the lvalue or rvalue of a transform rule

 transform_scanner = Scanner([
+    (r'\\u[0-9A-Fa-f]{4}', UNICODE_CHARACTER),
+    (r'\\U[0-9A-Fa-f]{8}', UNICODE_WIDE_CHARACTER),
     (r'[\\].', ESCAPED_CHARACTER),
     (r'\'\'', SINGLE_QUOTE),
     (r'\'.*?\'', QUOTED_STRING),
@@ -338,6 +346,8 @@ char_set_scanner = Scanner([
     ('^\^', NEGATION),
     (r'\\p\{[^\{\}]+\}', CHAR_CLASS_PCRE),
     (r'[\\]?[^\\\s]\-[\\]?[^\s]', CHAR_RANGE),
+    (r'\\u[0-9A-Fa-f]{4}', UNICODE_CHARACTER),
+    (r'\\U[0-9A-Fa-f]{8}', UNICODE_WIDE_CHARACTER),
     (r'[\\].', ESCAPED_CHARACTER),
     (r'\'\'', SINGLE_QUOTE),
     (r'\'.*?\'', QUOTED_STRING),
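Note: the two new scanner tokens split 4-digit BMP escapes from 8-digit astral-plane escapes. A quick illustrative check (Python 2, matching the script's idioms) of how a \uXXXX token is matched and then decoded to a real character:

    import re

    unicode_char = re.compile(r'\\u[0-9A-Fa-f]{4}')
    assert unicode_char.match('\\u00e9') is not None
    assert '\\u00e9'.decode('unicode-escape') == u'\xe9'

8-digit \UXXXXXXXX escapes get their own UNICODE_WIDE_CHARACTER token so the token handlers further down can skip them.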
@@ -348,7 +358,7 @@ char_set_scanner = Scanner([
     ('\[', OPEN_SET),
     ('\]', CLOSE_SET),
     ('&', INTERSECTION),
-    ('(?<=[\s])-(?=[\s])', DIFFERENCE),
+    ('-', DIFFERENCE),
     ('\$', WORD_BOUNDARY),
     (ur'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
     (r'\{[^\s]+\}', BRACKETED_CHARACTER),
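Note: the old DIFFERENCE pattern only fired on a hyphen surrounded by whitespace, so a set difference written without spaces was never tokenized as a difference; since the CHAR_RANGE pattern is listed earlier in the scanner, ranges like a-z should still win, letting a bare '-' mean set difference. Illustrative check of the two patterns:

    import re

    old = re.compile('(?<=[\s])-(?=[\s])')
    new = re.compile('-')
    assert old.search('[[:L:]-[aeiou]]') is None
    assert new.search('[[:L:]-[aeiou]]') is not None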
@@ -482,8 +492,6 @@ def parse_regex_char_set(s, current_filter=all_chars):
     Parse into a single, flat character set without the unicode properties,
     ranges, unions/intersections, etc.
     '''
-    if s in char_set_map:
-        s = char_set_map[s]

     s = s[1:-1]
     is_negation = False
@@ -532,7 +540,12 @@ def parse_regex_char_set(s, current_filter=all_chars):
         elif token_class == CHARACTER and token not in control_chars:
             this_group.add(token)
             real_chars.add(token)
-        elif token_class == WIDE_CHARACTER:
+        elif token_class == UNICODE_CHARACTER:
+            token = token.decode('unicode-escape')
+            if token not in control_chars:
+                this_group.add(token)
+                real_chars.add(token)
+        elif token_class in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
             continue
         elif token_class == BRACKETED_CHARACTER:
             if token.strip('{{}}') not in control_chars:
@@ -575,10 +588,11 @@ def get_raw_rules_and_variables(xml):
             continue

         rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
-        rule = literal_space_regex.sub(replace_literal_space, rule)
-        rule = escaped_wide_unicode_regex.sub('', rule)
-        rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
-        rule = rule.rstrip(END_CHAR).strip()
+        if rule not in rule_map:
+            rule = literal_space_regex.sub(replace_literal_space, rule)
+            rule = rule.rstrip(END_CHAR).strip()
+        else:
+            rule = rule_map[rule]

         if rule.strip().endswith('\\'):
             compound_rule.append(rule.rstrip('\\'))
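Note: with this short-circuit, a rule that appears verbatim in rule_map skips the regex preprocessing entirely and is swapped for its hand-written utf8proc equivalent. Roughly, using .get() to mirror the if/else above:

    rule = u'[:Latin:] { [:Mn:]+ \u2192 ;'
    rule = rule_map.get(rule, rule)    # yields ':: STRIP_MARK'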
@@ -692,7 +706,10 @@ def char_permutations(s, current_filter=all_chars):
             char_types.append([replace_html_entity(token)])
         elif token_type == CHARACTER:
             char_types.append([token])
-        elif token_type == WIDE_CHARACTER:
+        elif token_type == UNICODE_CHARACTER:
+            token = token.decode('unicode-escape')
+            char_types.append([token])
+        elif token_type in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
             continue
         if in_group and last_token_group_start:
             start_group = len(char_types)
@@ -760,10 +777,10 @@ def format_groups(char_types, groups):
     for start, end in groups:
         group_regex.append(char_types_string(char_types[last_end:start]))
         group_regex.append(u'(')
-        group_regex.append(char_types_string(char_types[start:end + 1]))
+        group_regex.append(char_types_string(char_types[start:end]))
         group_regex.append(u')')
         last_end = end
-    group_regex.append(char_types_string(char_types[last_end + 1:]))
+    group_regex.append(char_types_string(char_types[last_end:]))
     return u''.join(group_regex)

 charset_regex = re.compile(r'(?<!\\)\[')
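Note: group boundaries are now treated as half-open [start, end) intervals, so the character at index end belongs to the segment after the group rather than to the group itself, and the tail resumes at last_end instead of last_end + 1. A toy illustration (not from the source):

    char_types = ['a', 'b', 'c', 'd']
    start, end = 1, 3
    assert char_types[start:end] == ['b', 'c']   # end is now exclusive
    assert char_types[end:] == ['d']             # tail resumes at last_end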
@@ -901,6 +918,7 @@ def parse_transform_rules(xml):
             in_set = False
         elif token_type == BEFORE_CONTEXT and not in_set:
             left_pre_context = u''.join(current_token)
+
             current_token = []
         elif token_type == AFTER_CONTEXT and not in_set:
             have_post_context = True
@@ -968,7 +986,7 @@ def parse_transform_rules(xml):
         elif left_pre_context.strip() == START_OF_HAN_VAR:
             left_pre_context = None
             left_pre_context_type = CONTEXT_TYPE_NONE
-        else:
+        elif left_pre_context.strip():
             left_pre_context, _, _ = char_permutations(left_pre_context.strip(), current_filter=current_filter)
             if left_pre_context:
                 left_pre_context_max_len = len(left_pre_context or [])
@@ -981,8 +999,11 @@ def parse_transform_rules(xml):
             else:
                 left_pre_context = None
                 left_pre_context_type = CONTEXT_TYPE_NONE
+        else:
+            left_pre_context = None
+            left_pre_context_type = CONTEXT_TYPE_NONE

-        if left is not None:
+        if left:
             left, _, left_groups = char_permutations(left.strip(), current_filter=current_filter)
             if left_groups:
                 left_groups = format_groups(left, left_groups)
@@ -997,7 +1018,7 @@ def parse_transform_rules(xml):
         elif left_post_context.strip() == START_OF_HAN_VAR:
             left_pre_context_type = None
             left_pre_context_type = CONTEXT_TYPE_NONE
-        else:
+        elif left_post_context.strip():
             left_post_context, _, _ = char_permutations(left_post_context.strip(), current_filter=current_filter)
             if left_post_context:
                 left_post_context_max_len = len(left_post_context or [])
@@ -1009,6 +1030,10 @@ def parse_transform_rules(xml):
             else:
                 left_post_context = None
                 left_post_context_type = CONTEXT_TYPE_NONE
+        else:
+            left_post_context = None
+            left_post_context_type = CONTEXT_TYPE_NONE
+

         if right:
             right, move, right_groups = char_permutations(right.strip(), current_filter=current_filter)
@@ -1039,10 +1064,15 @@ EXISTING_STEP = 'EXISTING_STEP'
 supplemental_transliterations = {
     'latin-ascii': (EXISTING_STEP, [
         # German transliterations not handled by standard NFD normalization
-        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),  # ä => ae
-        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),  # ö => oe
-        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),  # ü => ue
-        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),  # ß => ss
+        # ä => ae
+        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),
+        # ö => oe
+        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),
+        # ü => ue
+        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),
+        # ß => ss
+        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),
+
     ]),
 }
@@ -1090,6 +1120,9 @@ def get_all_transform_rules():
         if rule.lower() in all_transforms and rule.lower() not in EXCLUDE_TRANSLITERATORS:
             dependencies[name].append(rule.lower())
             steps.append((STEP_TRANSFORM, rule.lower()))
+        elif rule.split('-')[0].lower() in all_transforms and rule.split('-')[0].lower() not in EXCLUDE_TRANSLITERATORS:
+            dependencies[name].append(rule.split('-')[0].lower())
+            steps.append((STEP_TRANSFORM, rule.split('-')[0].lower()))

         rule = UTF8PROC_TRANSFORMS.get(rule, rule)
         if rule in UNICODE_NORMALIZATION_TRANSFORMS:
@@ -1166,16 +1199,152 @@ transliterator_source_t transliterators_source[] = {{

 '''

-def create_transliterator(name, internal, steps):
-    return transliterator_template.format(name=name, internal=int(internal), num_steps=len(steps))
-
-
-TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'
-
-
-def main(out_dir):
-    f = open(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME), 'w')
-
+transliterator_script_data_template = u'''
+#ifndef TRANSLITERATION_SCRIPTS_H
+#define TRANSLITERATION_SCRIPTS_H
+
+#include <stdlib.h>
+#include "unicode_scripts.h"
+#include "transliterate.h"
+
+typedef struct script_transliteration_rule {{
+    script_type_t script;
+    char *language;
+    uint32_t index;
+    uint32_t len;
+}} script_transliteration_rule_t;
+
+script_transliteration_rule_t script_transliteration_rules[] = {{
+    {rules}
+}};
+
+char *script_transliterators[] = {{
+    {transliterators}
+}}
+
+#endif
+'''
+
+
+script_transliterators = {
+    'arabic': {None: ['arabic-latin', 'arabic-latin-bgn'],
+               'fa': ['persian-latin-bgn'],
+               'ps': ['pashto-latin-bgn'],
+               },
+    'armenian': {None: ['armenian-latin-bgn']},
+    'balinese': None,
+    'bamum': None,
+    'batak': None,
+    'bengali': {None: ['bengali-latin']},
+    'bopomofo': None,
+    'braille': None,
+    'buginese': None,
+    'buhid': None,
+    'canadian_aboriginal': {None: ['canadianaboriginal-latin']},
+    'cham': None,
+    'cherokee': None,
+    'common': {None: ['latin-ascii']},
+    'coptic': None,
+    'cyrillic': {None: ['cyrillic-latin'],
+                 'be': ['belarusian-latin-bgn'],
+                 'ru': ['russian-latin-bgn'],
+                 'bg': ['bulgarian-latin-bgn'],
+                 'kk': ['kazakh-latin-bgn'],
+                 'ky': ['kirghiz-latin-bgn'],
+                 'mk': ['macedonian-latin-bgn'],
+                 'mn': ['mongolian-latin-bgn'],
+                 'sr': ['serbian-latin-bgn'],
+                 'uk': ['ukrainian-latin-bgn'],
+                 'uz': ['uzbek-latin-bgn'],
+                 },
+    'devanagari': {None: ['devanagari-latin']},
+    'ethiopic': None,
+    'georgian': {None: ['georgian-latin', 'georgian-latin-bgn']},
+    'glagolitic': None,
+    'greek': {None: ['greek-latin', 'greek-latin-bgn', 'greek_latin_ungegn']},
+    'gujarati': {None: ['gujarati-latin']},
+    'gurmukhi': {None: ['gurmukhi-latin']},
+    'han': {None: ['han-latin']},
+    'hangul': {None: ['korean-latin-bgn']},
+    'hanunoo': None,
+    'hebrew': {None: ['hebrew-latin', 'hebrew-latin-bgn']},
+    'hiragana': {None: ['hiragana-latin']},
+    'inherited': None,
+    'javanese': None,
+    'kannada': {None: ['kannada-latin']},
+    'katakana': {None: ['katakana-latin-bgn']},
+    'kayah_li': None,
+    'khmer': None,
+    'lao': None,
+    'latin': {None: ['latin-ascii']},
+    'lepcha': None,
+    'limbu': None,
+    'lisu': None,
+    'malayalam': {None: ['malayam-latin']},
+    'mandaic': None,
+    'meetei_mayek': None,
+    'mongolian': None,
+    'myanmar': None,
+    'new_tai_lue': None,
+    'nko': None,
+    'ogham': None,
+    'ol_chiki': None,
+    'oriya': {None: ['oriya-latin']},
+    'phags_pa': None,
+    'rejang': None,
+    'runic': None,
+    'samaritan': None,
+    'saurashtra': None,
+    'sinhala': None,
+    'sundanese': None,
+    'syloti_nagri': None,
+    'syriac': None,
+    'tagalog': None,
+    'tagbanwa': None,
+    'tai_le': None,
+    'tai_tham': None,
+    'tai_viet': None,
+    'tamil': {None: ['tamil-latin']},
+    'telugu': {None: ['telugu-latin']},
+    'thaana': None,
+    'thai': {None: ['thai-latin']},
+    'tibetan': None,
+    'tifinagh': None,
+    'unknown': None,
+    'vai': None,
+    'yi': None
+}
+
+
+def write_transliterator_scripts_file(filename):
+    transliterator_rule_template = '''{{{script_type}, {lang}, {start}, {length}}}'''
+    rules = []
+    all_transliterators = []
+    index = 0
+    for script, i in unicode_script_ids.iteritems():
+        spec = script_transliterators.get(script.lower())
+        if not spec:
+            continue
+        script_type = 'SCRIPT_{}'.format(script.upper())
+        for lang, transliterators in spec.iteritems():
+            lang = 'NULL' if not lang else quote_string(lang)
+            num_transliterators = len(transliterators)
+            rules.append(transliterator_rule_template.format(script_type=script_type,
+                                                             lang=lang, start=index, length=num_transliterators))
+            for trans in transliterators:
+                all_transliterators.append(quote_string(trans))
+
+            index += num_transliterators
+
+    template = transliterator_script_data_template.format(rules=''',
+    '''.join(rules), transliterators=''',
+    '''.join(all_transliterators))
+
+    f = open(filename, 'w')
+    f.write(safe_encode(template))
+
+
+def write_transliteration_data_file(filename):
     transforms, steps, rules = get_all_transform_rules()

     all_transforms = u''',
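Note: the generated transliteration_scripts.h encodes, for each (script, language) pair, a half-open slice [index, index + len) into the parallel script_transliterators array. A hedged Python mirror of how a consumer might resolve that table (the C lookup itself is not part of this commit, and the preference for exact language matches is an assumption):

    def transliterators_for(rules, names, script, language=None):
        # prefer an exact language match, then fall back to the script default (None)
        for want in (language, None):
            for script_type, lang, index, length in rules:
                if script_type == script and lang == want:
                    return names[index:index + length]
        return []

    rules = [('SCRIPT_ARABIC', None, 0, 2), ('SCRIPT_ARABIC', 'fa', 2, 1)]
    names = ['arabic-latin', 'arabic-latin-bgn', 'persian-latin-bgn']
    assert transliterators_for(rules, names, 'SCRIPT_ARABIC', 'fa') == ['persian-latin-bgn']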
@@ -1193,9 +1362,19 @@ def main(out_dir):
         all_rules=all_rules
     )

+    f = open(filename, 'w')
     f.write(safe_encode(template))


+TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'
+TRANSLITERATION_SCRIPTS_FILENAME = 'transliteration_scripts.h'
+
+
+def main(out_dir):
+    write_transliteration_data_file(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME))
+
+    write_transliterator_scripts_file(os.path.join(out_dir, TRANSLITERATION_SCRIPTS_FILENAME))
+
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print 'Usage: python transliteration_rules.py out_dir'