[numex] Nicer API for ordinal suffixes
This commit is contained in:
@@ -1,9 +1,13 @@
|
|||||||
|
import bisect
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
|
import operator
|
||||||
|
import random
|
||||||
import six
|
import six
|
||||||
import sys
|
import sys
|
||||||
|
import yaml
|
||||||
|
|
||||||
import ujson as json
|
from collections import defaultdict
|
||||||
|
|
||||||
from marisa_trie import BytesTrie
|
from marisa_trie import BytesTrie
|
||||||
|
|
||||||
from geodata.text.phrases import PhraseFilter
|
from geodata.text.phrases import PhraseFilter
|
||||||
@@ -13,9 +17,9 @@ from geodata.i18n.unicode_paths import DATA_DIR
|
|||||||
from geodata.numbers.numex import NUMEX_DATA_DIR
|
from geodata.numbers.numex import NUMEX_DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
class OrdinalTrie(PhraseFilter):
|
class OrdinalSuffixTrie(PhraseFilter):
|
||||||
def __init__(self, ordinal_rules):
|
def __init__(self, ordinal_rules):
|
||||||
self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
|
self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
|
||||||
self.configured = True
|
self.configured = True
|
||||||
|
|
||||||
def search_substring(self, s):
|
def search_substring(self, s):
|
||||||
@@ -38,32 +42,60 @@ class OrdinalTrie(PhraseFilter):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
ordinal_rules = {}
|
|
||||||
ordinal_rules_configured = False
|
|
||||||
|
|
||||||
|
class OrdinalExpressions(object):
|
||||||
|
def __init__(self, base_dir=NUMEX_DATA_DIR):
|
||||||
|
self.cardinal_rules = {}
|
||||||
|
self.cardinal_rules_ones = {}
|
||||||
|
|
||||||
def init_ordinal_rules(d=NUMEX_DATA_DIR):
|
self.ordinal_rules = {}
|
||||||
global ordinal_rules, ordinal_rules_configured
|
self.ordinal_suffix_rules = {}
|
||||||
for filename in os.listdir(d):
|
|
||||||
if filename.endswith('.json'):
|
for filename in os.listdir(base_dir):
|
||||||
lang = filename.split('.json')[0]
|
if filename.endswith('.yaml'):
|
||||||
f = open(os.path.join(d, filename))
|
lang = filename.split('.yaml')[0]
|
||||||
data = json.load(f)
|
f = open(os.path.join(base_dir, filename))
|
||||||
rules = data.get('ordinal_indicators')
|
data = yaml.load(f)
|
||||||
|
|
||||||
|
rules = data.get('rules')
|
||||||
if rules is not None and hasattr(rules, '__getslice__'):
|
if rules is not None and hasattr(rules, '__getslice__'):
|
||||||
lang_rules = []
|
cardinals = []
|
||||||
for rule_set in rules:
|
ordinals = defaultdict(list)
|
||||||
|
for rule in rules:
|
||||||
|
name = rule.get('name')
|
||||||
|
value = rule.get('value')
|
||||||
|
rule_type = rule.get('type')
|
||||||
|
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
|
||||||
|
continue
|
||||||
|
gender = rule.get('gender', None)
|
||||||
|
category = rule.get('category', None)
|
||||||
|
if rule_type == 'ordinal':
|
||||||
|
ordinals[(value, gender, category)].append(name)
|
||||||
|
else:
|
||||||
|
cardinals.append(rule)
|
||||||
|
if value == 1:
|
||||||
|
self.cardinal_rules_ones[(lang, gender, category)] = name
|
||||||
|
|
||||||
|
self.cardinal_rules[lang] = cardinals
|
||||||
|
self.ordinal_rules[lang] = ordinals
|
||||||
|
|
||||||
|
ordinal_indicators = data.get('ordinal_indicators')
|
||||||
|
if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
|
||||||
|
for rule_set in ordinal_indicators:
|
||||||
gender = rule_set.get('gender', None)
|
gender = rule_set.get('gender', None)
|
||||||
category = rule_set.get('category', None)
|
category = rule_set.get('category', None)
|
||||||
ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])
|
self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])
|
||||||
ordinal_rules_configured = True
|
|
||||||
|
|
||||||
|
def get_suffixes(self, num, lang, gender=None, category=None):
|
||||||
def ordinal_suffixes(num, lang, gender=None, category=None):
|
trie = self.ordinal_suffix_rules.get((lang, gender, category))
|
||||||
if not ordinal_rules_configured:
|
|
||||||
raise RuntimeError('ordinal rules not configured')
|
|
||||||
trie = ordinal_rules.get((lang, gender, category))
|
|
||||||
if not trie:
|
if not trie:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return trie.search_suffix(str(num))
|
return trie.search_suffix(str(num))
|
||||||
|
|
||||||
|
def suffix(self, num, lang, gender=None, category=None):
    """Return ``num`` with a randomly chosen ordinal suffix appended.

    Candidate suffixes come from ``self.get_suffixes`` called with the same
    arguments; one of them is picked with ``random.choice`` and joined to
    the decoded number as a unicode string.

    :param num: the number to decorate (passed through ``safe_decode``).
    :param lang: language key used for the suffix lookup.
    :param gender: optional gender component of the lookup key.
    :param category: optional category component of the lookup key.
    :return: unicode string ``<num><suffix>``, or None when no suffix is
        available for this combination.
    """
    suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
    # get_suffixes returns None when no trie is registered for
    # (lang, gender, category); random.choice would raise TypeError on None
    # and IndexError on an empty sequence, so bail out explicitly instead.
    # NOTE(review): presumably search_suffix can also come back empty/None
    # for numbers with no matching rule -- confirm against the trie class.
    if not suffixes:
        return None
    chosen = random.choice(suffixes)
    return six.u('{}{}').format(safe_decode(num), safe_decode(chosen))
|
||||||
|
|
||||||
|
ordinal_expressions = OrdinalExpressions()
|
||||||
|
|||||||
Reference in New Issue
Block a user