[numex] Python script to generate numex data

This commit is contained in:
Al
2015-06-02 10:15:02 -04:00
parent 6b3d434c31
commit 2dc870b3da

View File

@@ -0,0 +1,172 @@
import os
import sys
import ujson as json
# Absolute directory containing this script; used to anchor the import path.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Make the directory two levels above this file importable so the project
# imports that follow can resolve. Anchored at this_dir (not the process's
# current working directory) so the script works from any cwd.
# NOTE(review): assumes the package root sits two levels above this file.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode
from unicode_paths import DATA_DIR
# Directory holding the per-language numex rule files (read as *.json).
NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex', 'rules')
# Default destination of the generated C source. Anchored at this script's
# directory instead of the current working directory, so the default points at
# the same file no matter where the script is launched from.
# NOTE(review): assumes the repository root is three levels above this file.
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
# Gender identifiers; they are written unquoted into the generated C source.
GENDER_MASCULINE, GENDER_FEMININE, GENDER_NEUTER, GENDER_NONE = (
    'GENDER_MASCULINE',
    'GENDER_FEMININE',
    'GENDER_NEUTER',
    'GENDER_NONE',
)

# Maps a rule's "gender" value to its identifier; the None key handles rules
# whose "gender" entry is absent.
gender_map = dict(zip(
    ('m', 'f', 'n', None),
    (GENDER_MASCULINE, GENDER_FEMININE, GENDER_NEUTER, GENDER_NONE),
))
# Left-context identifiers; they are written unquoted into the generated C
# source.
LEFT_CONTEXT_MULTIPLY, LEFT_CONTEXT_ADD, LEFT_CONTEXT_NONE = (
    'LEFT_CONTEXT_MULTIPLY',
    'LEFT_CONTEXT_ADD',
    'LEFT_CONTEXT_NONE',
)

# Maps a rule's "left" value to its identifier; the None key handles rules
# whose "left" entry is absent.
left_context_map = dict(zip(
    ('add', 'multiply', None),
    (LEFT_CONTEXT_ADD, LEFT_CONTEXT_MULTIPLY, LEFT_CONTEXT_NONE),
))
# Right-context identifiers; they are written unquoted into the generated C
# source.
RIGHT_CONTEXT_MULTIPLY, RIGHT_CONTEXT_ADD, RIGHT_CONTEXT_NONE = (
    'RIGHT_CONTEXT_MULTIPLY',
    'RIGHT_CONTEXT_ADD',
    'RIGHT_CONTEXT_NONE',
)

# Maps a rule's "right" value to its identifier; the None key handles rules
# whose "right" entry is absent.
right_context_map = dict(zip(
    ('add', 'multiply', None),
    (RIGHT_CONTEXT_ADD, RIGHT_CONTEXT_MULTIPLY, RIGHT_CONTEXT_NONE),
))
# Rule-type identifiers; they are written unquoted into the generated C source.
CARDINAL, ORDINAL = 'NUMEX_RULE_TYPE_CARDINAL', 'NUMEX_RULE_TYPE_ORDINAL'

# Maps a rule's mandatory "type" value to its identifier. There is no None
# entry, so a rule without a recognised type fails the lookup.
rule_type_map = dict(zip(
    ('cardinal', 'ordinal'),
    (CARDINAL, ORDINAL),
))
# str.format templates for the generated C source. Doubled braces are literal
# braces in the output; single-braced names are substituted.

# One brace-enclosed initializer per numex rule.
numex_rule_template = (
    u'{{'
    u'"{key}", {rule_type}, {gender}, '
    u'{left_context_type}, {right_context_type}, '
    u'{value}LL, {radix}'
    u'}}'
)

# One brace-enclosed initializer per ordinal-indicator suffix.
ordinal_indicator_template = (
    u'{{'
    u'{number}, {gender}, "{value}"'
    u'}}'
)

# One quoted string per stopword.
stopwords_template = u'"{word}"'

# One brace-enclosed initializer per language: its name followed by the
# (start index, count) pair for each of the three flat arrays.
language_template = (
    u'{{'
    u'"{language}", '
    u'{rule_index}, {num_rules}, '
    u'{ordinal_indicator_index}, {num_ordinal_indicators}, '
    u'{stopword_index}, {num_stopwords}'
    u'}}'
)

# Whole-file layout: the four array definitions, each wrapping one
# pre-joined list of initializers.
numex_rules_data_template = u'\n'.join([
    u'',
    u'numex_rule_source_t numex_rules[] = {{',
    u'{numex_rules}',
    u'}};',
    u'ordinal_indicator_rule_t ordinal_indicator_rules[] = {{',
    u'{ordinal_indicator_rules}',
    u'}};',
    u'char *numex_stopwords[] = {{',
    u'{stopwords}',
    u'}};',
    u'numex_language_source_t numex_languages[] = {{',
    u'{languages}',
    u'}};',
    u'',
])
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
    """Compile the per-language numex JSON files into one C source file.

    Every regular file in *dirname* whose name ends in ``.json`` is loaded.
    The language name is the part of the filename before the first ``.json``.
    Each file's ``rules``, ``ordinal_indicators`` and ``stopwords`` entries are
    appended to three flat lists, and one language record is produced holding
    the start index and entry count of that language's slice of each list.
    The four lists are then rendered through ``numex_rules_data_template`` and
    written to *outfile*.

    Args:
        dirname: directory containing the ``*.json`` rule files.
        outfile: path of the C file to write; it is overwritten.

    Raises:
        KeyError: if a rule lacks ``type``, ``name`` or ``value``, if an
            ordinal indicator lacks ``suffixes``, or if a gender, context or
            type value has no entry in its lookup map.
    """
    all_rules = []
    all_ordinal_indicators = []
    all_stopwords = []
    all_languages = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # generated file identical from one run to the next.
    for filename in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, filename)
        if not os.path.isfile(path) or not filename.endswith('.json'):
            continue

        language = filename.split('.json', 1)[0]

        with open(path) as rules_file:
            data = json.load(rules_file)

        rule_index = len(all_rules)
        for rule in data.get('rules', []):
            all_rules.append(numex_rule_template.format(
                key=rule['name'],
                rule_type=rule_type_map[rule['type']],
                gender=gender_map[rule.get('gender')],
                left_context_type=left_context_map[rule.get('left')],
                right_context_type=right_context_map[rule.get('right')],
                value=rule['value'],
                radix=rule.get('radix', 10),
            ))
        num_rules = len(all_rules) - rule_index

        ordinal_indicator_index = len(all_ordinal_indicators)
        for indicator in data.get('ordinal_indicators', []):
            gender = gender_map[indicator.get('gender')]
            if 'suffixes' not in indicator:
                raise KeyError(
                    'ordinal indicator in {0} has no "suffixes" entry '
                    '(keys present: {1})'.format(path, sorted(indicator))
                )
            # The position of a suffix in the list is the number it is
            # emitted with.
            for number, value in enumerate(indicator['suffixes']):
                all_ordinal_indicators.append(ordinal_indicator_template.format(
                    number=number,
                    value=value,
                    gender=gender,
                ))
        # Count what was actually emitted instead of assuming every
        # indicator carries exactly ten suffixes; otherwise the language
        # record's slice length can disagree with the array contents.
        num_ordinal_indicators = len(all_ordinal_indicators) - ordinal_indicator_index

        stopword_index = len(all_stopwords)
        for stopword in data.get('stopwords', []):
            all_stopwords.append(stopwords_template.format(word=stopword))
        num_stopwords = len(all_stopwords) - stopword_index

        all_languages.append(language_template.format(
            language=language,
            rule_index=rule_index,
            num_rules=num_rules,
            ordinal_indicator_index=ordinal_indicator_index,
            num_ordinal_indicators=num_ordinal_indicators,
            stopword_index=stopword_index,
            num_stopwords=num_stopwords,
        ))

    separator = u',\n'
    generated = numex_rules_data_template.format(
        numex_rules=separator.join(all_rules),
        ordinal_indicator_rules=separator.join(all_ordinal_indicators),
        stopwords=separator.join(all_stopwords),
        languages=separator.join(all_languages),
    )

    # Open the output only once everything has been parsed, so a bad input
    # file does not leave a truncated C file behind; `with` closes the handle
    # even if the write fails.
    with open(outfile, 'w') as out:
        out.write(safe_encode(generated))
if __name__ == '__main__':
    # Positional command-line arguments, if any, are forwarded in order as
    # (dirname, outfile); omitted ones fall back to the function's defaults.
    cli_args = sys.argv[1:]
    parse_numex_rules(*cli_args)