[numex] Nicer API for ordinal suffixes

This commit is contained in:
Al
2016-04-08 17:10:10 -04:00
parent 737b5d06ed
commit 02e82e5342

View File

@@ -1,9 +1,13 @@
import bisect
import math
import os import os
import operator
import random
import six import six
import sys import sys
import yaml
import ujson as json from collections import defaultdict
from marisa_trie import BytesTrie from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter from geodata.text.phrases import PhraseFilter
@@ -13,9 +17,9 @@ from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalTrie(PhraseFilter): class OrdinalSuffixTrie(PhraseFilter):
def __init__(self, ordinal_rules): def __init__(self, ordinal_rules):
self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)]) self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
self.configured = True self.configured = True
def search_substring(self, s): def search_substring(self, s):
@@ -38,32 +42,60 @@ class OrdinalTrie(PhraseFilter):
else: else:
return None return None
ordinal_rules = {}
ordinal_rules_configured = False
class OrdinalExpressions(object):
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_ones = {}
def init_ordinal_rules(d=NUMEX_DATA_DIR): self.ordinal_rules = {}
global ordinal_rules, ordinal_rules_configured self.ordinal_suffix_rules = {}
for filename in os.listdir(d):
if filename.endswith('.json'): for filename in os.listdir(base_dir):
lang = filename.split('.json')[0] if filename.endswith('.yaml'):
f = open(os.path.join(d, filename)) lang = filename.split('.yaml')[0]
data = json.load(f) f = open(os.path.join(base_dir, filename))
rules = data.get('ordinal_indicators') data = yaml.load(f)
rules = data.get('rules')
if rules is not None and hasattr(rules, '__getslice__'): if rules is not None and hasattr(rules, '__getslice__'):
lang_rules = [] cardinals = []
for rule_set in rules: ordinals = defaultdict(list)
for rule in rules:
name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(name)
else:
cardinals.append(rule)
if value == 1:
self.cardinal_rules_ones[(lang, gender, category)] = name
self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
ordinal_indicators = data.get('ordinal_indicators')
if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
for rule_set in ordinal_indicators:
gender = rule_set.get('gender', None) gender = rule_set.get('gender', None)
category = rule_set.get('category', None) category = rule_set.get('category', None)
ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes']) self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])
ordinal_rules_configured = True
def get_suffixes(self, num, lang, gender=None, category=None):
def ordinal_suffixes(num, lang, gender=None, category=None): trie = self.ordinal_suffix_rules.get((lang, gender, category))
if not ordinal_rules_configured:
raise RuntimeError('ordinal rules not configured')
trie = ordinal_rules.get((lang, gender, category))
if not trie: if not trie:
return None return None
return trie.search_suffix(str(num)) return trie.search_suffix(str(num))
def suffix(self, num, lang, gender=None, category=None):
    """Return ``num`` with a randomly chosen ordinal suffix appended.

    Candidate suffixes are looked up with ``self.get_suffixes`` using the
    same ``lang``/``gender``/``category`` key, and one of them is picked
    with ``random.choice``.

    :param num: the number to decorate (passed through ``safe_decode``).
    :param lang: language code used to select the suffix rules.
    :param gender: optional gender component of the rule key.
    :param category: optional category component of the rule key.
    :return: unicode string of the number followed by the suffix, or
        ``None`` when no suffix candidates are available.
    """
    suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
    # get_suffixes returns None when no trie is registered for this key;
    # random.choice would raise on None or on an empty sequence, so bail
    # out explicitly instead of crashing.
    if not suffixes:
        return None
    chosen = random.choice(suffixes)
    return six.u('{}{}').format(safe_decode(num), safe_decode(chosen))
# Module-level shared instance. Constructing it with the default base_dir
# lists NUMEX_DATA_DIR and loads every ``.yaml`` file found there, so this
# work happens when the module is imported.
ordinal_expressions = OrdinalExpressions()