[numex] Nicer API for ordinal suffixes

This commit is contained in:
Al
2016-04-08 17:10:10 -04:00
parent 737b5d06ed
commit 02e82e5342

View File

@@ -1,9 +1,13 @@
import bisect
import math
import os import os
import operator
import random
import six import six
import sys import sys
import yaml
import ujson as json from collections import defaultdict
from marisa_trie import BytesTrie from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter from geodata.text.phrases import PhraseFilter
@@ -13,9 +17,9 @@ from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalTrie(PhraseFilter): class OrdinalSuffixTrie(PhraseFilter):
def __init__(self, ordinal_rules): def __init__(self, ordinal_rules):
self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)]) self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
self.configured = True self.configured = True
def search_substring(self, s): def search_substring(self, s):
@@ -38,32 +42,60 @@ class OrdinalTrie(PhraseFilter):
else: else:
return None return None
ordinal_rules = {}
ordinal_rules_configured = False
class OrdinalExpressions(object):
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_ones = {}
def init_ordinal_rules(d=NUMEX_DATA_DIR): self.ordinal_rules = {}
global ordinal_rules, ordinal_rules_configured self.ordinal_suffix_rules = {}
for filename in os.listdir(d):
if filename.endswith('.json'):
lang = filename.split('.json')[0]
f = open(os.path.join(d, filename))
data = json.load(f)
rules = data.get('ordinal_indicators')
if rules is not None and hasattr(rules, '__getslice__'):
lang_rules = []
for rule_set in rules:
gender = rule_set.get('gender', None)
category = rule_set.get('category', None)
ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])
ordinal_rules_configured = True
for filename in os.listdir(base_dir):
if filename.endswith('.yaml'):
lang = filename.split('.yaml')[0]
f = open(os.path.join(base_dir, filename))
data = yaml.load(f)
def ordinal_suffixes(num, lang, gender=None, category=None): rules = data.get('rules')
if not ordinal_rules_configured: if rules is not None and hasattr(rules, '__getslice__'):
raise RuntimeError('ordinal rules not configured') cardinals = []
trie = ordinal_rules.get((lang, gender, category)) ordinals = defaultdict(list)
if not trie: for rule in rules:
return None name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(name)
else:
cardinals.append(rule)
if value == 1:
self.cardinal_rules_ones[(lang, gender, category)] = name
return trie.search_suffix(str(num)) self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
ordinal_indicators = data.get('ordinal_indicators')
if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
for rule_set in ordinal_indicators:
gender = rule_set.get('gender', None)
category = rule_set.get('category', None)
self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])
def get_suffixes(self, num, lang, gender=None, category=None):
    """Look up the ordinal suffix candidates for ``num``.

    The suffix trie is selected by the ``(lang, gender, category)`` key
    in ``self.ordinal_suffix_rules``; the number is converted with
    ``str`` before being handed to the trie's ``search_suffix``.

    Returns whatever ``search_suffix`` returns, or None when no
    (truthy) trie is registered for the key.
    """
    rule_key = (lang, gender, category)
    suffix_trie = self.ordinal_suffix_rules.get(rule_key)
    if suffix_trie:
        digits = str(num)
        return suffix_trie.search_suffix(digits)
    return None
def suffix(self, num, lang, gender=None, category=None):
    """Return ``num`` with a randomly chosen ordinal suffix appended.

    Candidate suffixes come from ``get_suffixes`` for the same
    ``(lang, gender, category)`` key; one is picked with
    ``random.choice`` and appended to the decoded number.

    Returns None when there are no candidates. ``get_suffixes`` yields
    None when no trie is registered for the key, and ``random.choice``
    would otherwise raise TypeError on None or IndexError on an empty
    sequence.
    """
    suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
    if not suffixes:
        # Nothing to append: mirror get_suffixes' "no match" convention.
        return None
    chosen = random.choice(suffixes)
    return six.u('{}{}').format(safe_decode(num), safe_decode(chosen))
# Shared instance built with the default base_dir (NUMEX_DATA_DIR); constructing
# it lists that directory and loads every *.yaml rule file when this statement runs.
ordinal_expressions = OrdinalExpressions()