[transliteration] char set mapping for some of the more complicated sets found in CLDR

This commit is contained in:
Al
2015-05-10 18:34:53 -04:00
parent 2a69488f9b
commit fe044cebef

View File

@@ -157,6 +157,12 @@ unicode_property_regexes = [
('logical_order_exception', '[เ-ไ ເ-ໄ ꪵ ꪶ ꪹ ꪻ ꪼ]'),
]
# Pre-expanded character class shared by every key of char_set_map below.
# The three keys are alternative spellings of one and the same set (in the
# Unicode property value aliases, ccc=0 is named Not_Reordered and ccc=230 is
# named Above), so the literal is written once here instead of being pasted
# three times, which would let the copies drift apart unnoticed.
_CCC_NOT_0_NOT_230_CHARS = u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]'

# Maps a set expression, matched verbatim as a whole string, to a bracketed
# character class that has already been expanded by hand.
# parse_regex_char_set swaps its input for the mapped value when the input is
# one of these keys, so the keys must stay byte-for-byte as they appear in
# the rule files.
char_set_map = {
    '[^[:ccc=Not_Reordered:][:ccc=Above:]]': _CCC_NOT_0_NOT_230_CHARS,
    '[[:^ccc=0:] & [:^ccc=230:]]': _CCC_NOT_0_NOT_230_CHARS,
    # Raw string: keeps "\p" as a literal backslash followed by "p" without
    # depending on how Python treats unrecognized escape sequences.
    r'[^\p{ccc=0}\p{ccc=above}]': _CCC_NOT_0_NOT_230_CHARS,
}
unicode_properties = {}
@@ -360,6 +366,9 @@ def parse_regex_char_set(s):
Parse into a single, flat character set without the unicode properties,
ranges, unions/intersections, etc.
'''
if s in char_set_map:
s = char_set_map[s]
s = s[1:-1]
is_negation = False
this_group = set()