diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 39e1481e..236d50fd 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -50,7 +50,10 @@ POST_CONTEXT_INDICATOR = '}' REVISIT_INDICATOR = '|' -WORD_BOUNDARY_VAR = 'wordBoundary' +WORD_BOUNDARY_VAR_NAME = 'wordBoundary' +WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME) + +word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$')) EMPTY_TRANSITION = u'\u007f' @@ -503,8 +506,28 @@ def char_permutations(s): return list(itertools.product(char_types)), move +string_replacements = { + u'[': u'\[', + u']': u'\]', + u'': EMPTY_TRANSITION, + u'*': u'\*', + u'+': u'\+', + PLUS: u'+', + STAR: u'*', +} + +escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I) + + +def replace_long_escape_sequence(s): + def replace_match(m): + return u'{}""{}'.format(m.group(1), m.group(2)) + + return escape_sequence_long_regex.sub(replace_match, s) + + def quote_string(s): - return u'"{}"'.format(s.replace('"', '\\"')) + return u'"{}"'.format(replace_long_escape_sequence(safe_decode(s).replace('"', '\\"'))) def char_types_string(char_types): @@ -519,19 +542,7 @@ def char_types_string(char_types): template = u'{}' if len(chars) == 1 else u'[{}]' norm = [] for c in chars: - if c == '[': - c = '\[' - elif c == '': - c = EMPTY_TRANSITION - elif c == '*': - c = '\*' - elif c == '+': - c = '\+' - elif c == PLUS: - c = '+' - elif c == STAR: - c = '*' - + c = string_replacements.get(c, c) norm.append(c) ret.append(template.format(u''.join(norm))) @@ -555,48 +566,68 @@ def format_groups(char_types, groups): charset_regex = re.compile(r'(? ae + (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'), # ö => oe + (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'), # ü => ue + (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'), # ß => ss ]), } @@ -766,7 +818,7 @@ def get_all_transform_rules(): elif step_type == STEP_TRANSFORM: step = (STEP_TRANSFORM, '-1', '-1', quote_string(data)) elif step_type == STEP_UNICODE_NORMALIZATION: - step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', data) + step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data)) all_steps.append(step) internal = int(name not in to_latin) @@ -778,9 +830,6 @@ def get_all_transform_rules(): transliteration_data_template = u'''#include -#include - -#include "transliteration_rule.h" transliteration_rule_source_t rules_source[] = {{ {all_rules} diff --git a/src/transliteration_data.c b/src/transliteration_data.c index 4d247596..aa0903b6 100644 Binary files a/src/transliteration_data.c and b/src/transliteration_data.c differ diff --git a/src/transliteration_rule.h b/src/transliteration_rule.h index cda55256..6ba6eefd 100644 --- a/src/transliteration_rule.h +++ b/src/transliteration_rule.h @@ -13,21 +13,31 @@ typedef enum { typedef struct transliteration_rule_source { char *key; + size_t key_len; context_type_t pre_context_type; size_t pre_context_max_len; char *pre_context; + size_t pre_context_len; context_type_t post_context_type; size_t post_context_max_len; char *post_context; + size_t post_context_len; char *replacement; + size_t replacement_len; int move; char *group_regex_str; + size_t group_regex_len; } transliteration_rule_source_t; +typedef enum { + STEP_RULESET, + STEP_TRANSFORM, + STEP_UNICODE_NORMALIZATION +} step_type_t; typedef struct transliteration_step_source { step_type_t type;