[numex] Moving numex script to a different subpackage, adding function for creating ordinals

2016-03-18 20:36:22 -04:00
parent e6b59980e7
commit 5e2d9f371e
4 changed files with 73 additions and 3 deletions
--- a/scripts/geodata/i18n/numex.py
+++ b/scripts/geodata/i18n/numex.py
@@ -1,212 +0,0 @@
-import os
-import sys
-
-import ujson as json
-
-this_dir = os.path.realpath(os.path.dirname(__file__))
-sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
-
-from geodata.encoding import safe_encode
-from unicode_paths import DATA_DIR
-
-
-class InvalidNumexRuleException(Exception):
-    pass
-
-NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex')
-
-NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
-
-GENDER_MASCULINE = 'GENDER_MASCULINE'
-GENDER_FEMININE = 'GENDER_FEMININE'
-GENDER_NEUTER = 'GENDER_NEUTER'
-GENDER_NONE = 'GENDER_NONE'
-
-gender_map = {
-    'm': GENDER_MASCULINE,
-    'f': GENDER_FEMININE,
-    'n': GENDER_NEUTER,
-    None: GENDER_NONE,
-}
-
-
-CATEGORY_PLURAL = 'CATEGORY_PLURAL'
-CATEGORY_DEFAULT = 'CATEGORY_DEFAULT'
-
-valid_numex_keys = set(['name', 'value', 'type', 'left', 'right', 'gender', 'category', 'radix'])
-
-valid_ordinal_keys = set(['suffixes', 'gender', 'category'])
-
-
-category_map = {
-    'plural': CATEGORY_PLURAL,
-    None: CATEGORY_DEFAULT
-}
-
-LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
-LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
-LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
-
-left_context_map = {
-    'add': LEFT_CONTEXT_ADD,
-    'multiply': LEFT_CONTEXT_MULTIPLY,
-    None: LEFT_CONTEXT_NONE,
-}
-
-RIGHT_CONTEXT_MULTIPLY = 'NUMEX_RIGHT_CONTEXT_MULTIPLY'
-RIGHT_CONTEXT_ADD = 'NUMEX_RIGHT_CONTEXT_ADD'
-RIGHT_CONTEXT_NONE = 'NUMEX_RIGHT_CONTEXT_NONE'
-
-right_context_map = {
-    'add': RIGHT_CONTEXT_ADD,
-    'multiply': RIGHT_CONTEXT_MULTIPLY,
-    None: RIGHT_CONTEXT_NONE,
-}
-
-CARDINAL = 'NUMEX_CARDINAL_RULE'
-ORDINAL = 'NUMEX_ORDINAL_RULE'
-ORDINAL_INDICATOR = 'NUMEX_ORDINAL_INDICATOR_RULE'
-
-rule_type_map = {
-    'cardinal': CARDINAL,
-    'ordinal': ORDINAL,
-    'ordinal_indicator': ORDINAL_INDICATOR,
-}
-
-numex_key_template = u'"{key}"'
-numex_rule_template = u'{{{left_context_type}, {right_context_type}, {rule_type}, {gender}, {category}, {radix}, {value}LL}}'
-
-stopword_rule = u'NUMEX_STOPWORD_RULE'
-
-ordinal_indicator_template = u'{{"{key}", {gender}, {category}, "{value}"}}'
-
-stopwords_template = u'"{word}"'
-
-language_template = u'{{"{language}", {whole_words_only}, {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
-
-numex_rules_data_template = u'''
-char *numex_keys[] = {{
-    {numex_keys}
-}};
-
-numex_rule_t numex_rules[] = {{
-    {numex_rules}
-}};
-
-ordinal_indicator_t ordinal_indicator_rules[] = {{
-    {ordinal_indicator_rules}
-}};
-
-numex_language_source_t numex_languages[] = {{
-    {languages}
-}};
-'''
-
-
-def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
-    all_keys = []
-    all_rules = []
-
-    all_ordinal_indicators = []
-    all_stopwords = []
-
-    all_languages = []
-
-    out = open(outfile, 'w')
-
-    for filename in os.listdir(dirname):
-        path = os.path.join(dirname, filename)
-        if not os.path.isfile(path) or not filename.endswith('.json'):
-            continue
-
-        language = filename.split('.json', 1)[0]
-
-        data = json.load(open(path))
-
-        whole_words_only = data.get('whole_words_only', False)
-
-        rules = data.get('rules', [])
-        rule_index = len(all_rules)
-
-        for rule in rules:
-            invalid_keys = set(rule.keys()) - valid_numex_keys
-            if invalid_keys:
-                raise InvalidNumexRuleException(u'Invalid keys: ({}) for language {}, rule: {}'.format(u','.join(invalid_keys), language, rule))
-            gender = gender_map[rule.get('gender')]
-            rule_type = rule_type_map[rule['type']]
-            key = rule['name']
-            value = rule['value']
-            radix = rule.get('radix', 10)
-            category = category_map[rule.get('category')]
-            left_context_type = left_context_map[rule.get('left')]
-            right_context_type = right_context_map[rule.get('right')]
-            all_keys.append(unicode(numex_key_template.format(key=key)))
-            all_rules.append(unicode(numex_rule_template.format(
-                language=language,
-                rule_type=rule_type,
-                gender=gender,
-                category=category,
-                left_context_type=left_context_type,
-                right_context_type=right_context_type,
-                value=value,
-                radix=radix
-            )))
-
-        ordinal_indicator_index = len(all_ordinal_indicators)
-        ordinal_indicators = data.get('ordinal_indicators', [])
-        num_ordinal_indicators = 0
-
-        for rule in ordinal_indicators:
-            gender = gender_map[rule.get('gender')]
-            category = category_map[rule.get('category')]
-            invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
-            if invalid_ordinal_keys:
-                raise InvalidNumexRuleException(u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'.format(u','.join(invalid_ordinal_keys), language, rule))
-
-            for key, suffixes in rule['suffixes'].iteritems():
-                for suffix in suffixes:
-                    all_ordinal_indicators.append(unicode(ordinal_indicator_template.format(
-                        key=key,
-                        value=suffix,
-                        gender=gender,
-                        category=category
-                    )))
-                num_ordinal_indicators += len(suffixes)
-
-        stopwords = data.get('stopwords', [])
-        stopword_index = len(all_stopwords)
-        num_stopwords = len(stopwords)
-
-        for stopword in stopwords:
-            all_keys.append(numex_key_template.format(key=unicode(stopword)))
-            all_rules.append(stopword_rule)
-
-        num_rules = len(rules) + len(stopwords)
-
-        all_languages.append(unicode(language_template.format(
-            language=language,
-            whole_words_only=int(whole_words_only),
-            rule_index=rule_index,
-            num_rules=num_rules,
-            ordinal_indicator_index=ordinal_indicator_index,
-            num_ordinal_indicators=num_ordinal_indicators
-        )))
-
-    out.write(safe_encode(numex_rules_data_template.format(
-        numex_keys=u''',
-    '''.join(all_keys),
-        numex_rules=u''',
-    '''.join(all_rules),
-        ordinal_indicator_rules=u''',
-    '''.join(all_ordinal_indicators),
-        stopwords=u''',
-    '''.join(all_stopwords),
-        languages=u''',
-    '''.join(all_languages),
-    )))
-
-    out.close()
-
-
-if __name__ == '__main__':
-    parse_numex_rules(*sys.argv[1:])