[numex] Adding a whole-words-only option for numex languages, e.g. for Latin, so that we don't match an initial "D" as 500
This commit is contained in:
@@ -73,7 +73,7 @@ ordinal_indicator_template = u'{{"{key}", {gender}, {category}, "{value}"}}'
|
|||||||
|
|
||||||
stopwords_template = u'"{word}"'
|
stopwords_template = u'"{word}"'
|
||||||
|
|
||||||
language_template = u'{{"{language}", {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
|
language_template = u'{{"{language}", {whole_words_only}, {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
|
||||||
|
|
||||||
numex_rules_data_template = u'''
|
numex_rules_data_template = u'''
|
||||||
numex_rule_source_t numex_rules[] = {{
|
numex_rule_source_t numex_rules[] = {{
|
||||||
@@ -108,6 +108,8 @@ def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
|
|||||||
|
|
||||||
data = json.load(open(path))
|
data = json.load(open(path))
|
||||||
|
|
||||||
|
whole_words_only = data.get('whole_words_only', False)
|
||||||
|
|
||||||
rules = data.get('rules', [])
|
rules = data.get('rules', [])
|
||||||
rule_index = len(all_rules)
|
rule_index = len(all_rules)
|
||||||
|
|
||||||
@@ -163,6 +165,7 @@ def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
|
|||||||
|
|
||||||
all_languages.append(unicode(language_template.format(
|
all_languages.append(unicode(language_template.format(
|
||||||
language=language,
|
language=language,
|
||||||
|
whole_words_only=int(whole_words_only),
|
||||||
rule_index=rule_index,
|
rule_index=rule_index,
|
||||||
num_rules=num_rules,
|
num_rules=num_rules,
|
||||||
ordinal_indicator_index=ordinal_indicator_index,
|
ordinal_indicator_index=ordinal_indicator_index,
|
||||||
|
|||||||
Reference in New Issue
Block a user