70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
import os
|
|
import six
|
|
import sys
|
|
|
|
import ujson as json
|
|
|
|
from marisa_trie import BytesTrie
|
|
|
|
from geodata.text.phrases import PhraseFilter
|
|
from geodata.encoding import safe_encode, safe_decode
|
|
from geodata.i18n.unicode_paths import DATA_DIR
|
|
|
|
from geodata.numbers.numex import NUMEX_DATA_DIR
|
|
|
|
|
|
class OrdinalTrie(PhraseFilter):
    """Look up ordinal suffixes by the ending of a token.

    Rule keys are stored reversed in a ``BytesTrie`` so that matching the
    *end* of a token becomes a prefix walk over the reversed token. Each
    value is the rule's suffix list joined with ``|`` and UTF-8 encoded.
    """

    def __init__(self, ordinal_rules):
        """Build the trie from a mapping of token ending -> list of suffixes."""
        # Decode before reversing: reversing an undecoded byte string would
        # scramble any multi-byte UTF-8 sequence in the key.
        self.trie = BytesTrie([
            (safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8'))
            for k, v in six.iteritems(ordinal_rules)
        ])
        self.configured = True

    def search_substring(self, s):
        """Find the longest leading part of ``s`` that is a prefix of a key.

        Returns a ``(values, length)`` pair, where ``values`` is whatever
        ``self.trie.get`` yields for ``s[:length]``. Returns ``(None, 0)``
        when ``s`` is empty or nothing beyond the empty prefix matches.

        NOTE(review): ``get`` looks like an exact-key lookup, so ``values``
        is presumably ``None`` when ``s[:length]`` only *begins* a longer
        key; no back-off to a shorter key is attempted -- confirm this is
        the intended behaviour.
        """
        if len(s) == 0:
            return None, 0

        # If the loop never breaks, the whole string is a prefix of some key.
        match_len = len(s)
        # six.moves.range: ``xrange`` does not exist on Python 3.
        for i in six.moves.range(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                # s[:i] went one character too far; step back.
                match_len = i - 1
                break

        if match_len > 0:
            return (self.trie.get(s[:match_len]), match_len)
        return None, 0

    def search_suffix(self, token):
        """Return the list of suffixes for the ending of ``token``, or None."""
        # Decode first, then reverse (see __init__ for why).
        reversed_token = safe_decode(token)[::-1]
        suffix_search, _ = self.search_substring(reversed_token)
        if not suffix_search:
            return None

        joined = suffix_search[0]
        # The trie stores encoded bytes. On Python 2 that is already the
        # native ``str`` and is left untouched; on Python 3 it must be
        # decoded, because bytes cannot be split on a text separator.
        if not isinstance(joined, str):
            joined = joined.decode('utf-8')
        return joined.split('|')
|
|
|
|
# Set to True by init_ordinal_rules(); ordinal_suffixes() raises
# RuntimeError while it is still False.
ordinal_rules_configured = False

# Registry filled by init_ordinal_rules(): maps a
# (lang, gender, category) tuple to the OrdinalTrie built for that rule set.
ordinal_rules = {}
|
|
|
|
|
|
def init_ordinal_rules(d=NUMEX_DATA_DIR):
    """Load ordinal-indicator rules from every ``*.json`` file in ``d``.

    The language code is the part of the file name before ``.json``. For
    each entry of the file's ``ordinal_indicators`` list, an ``OrdinalTrie``
    built from the entry's ``suffixes`` is stored in the module-level
    ``ordinal_rules`` under the key ``(lang, gender, category)``. ``gender``
    and ``category`` are ``None`` when the entry omits them.

    Files without an ``ordinal_indicators`` list are skipped. When the
    directory scan finishes, ``ordinal_rules_configured`` is set to True.

    :param d: directory holding the per-language JSON files
    :raises KeyError: if a rule set has no ``suffixes`` entry
    """
    global ordinal_rules, ordinal_rules_configured

    for filename in os.listdir(d):
        if not filename.endswith('.json'):
            continue

        lang = filename.split('.json')[0]

        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(d, filename)) as f:
            data = json.load(f)

        rules = data.get('ordinal_indicators')
        # Explicit sequence check: the former ``__getslice__`` probe is
        # always False on Python 3, which silently skipped every rule set.
        if not isinstance(rules, (list, tuple)):
            continue

        for rule_set in rules:
            gender = rule_set.get('gender', None)
            category = rule_set.get('category', None)
            ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])

    ordinal_rules_configured = True
|
|
|
|
|
|
def ordinal_suffixes(num, lang, gender=None, category=None):
    """Return the ordinal suffixes registered for ``num`` in ``lang``.

    The rule set is selected by the exact ``(lang, gender, category)`` key
    in ``ordinal_rules``; ``num`` is converted with ``str()`` and handed to
    that rule set's ``search_suffix``.

    :returns: the result of ``search_suffix``, or ``None`` when no rule set
        is registered for the key
    :raises RuntimeError: if the ordinal rules have not been configured yet
    """
    if ordinal_rules_configured:
        lookup_key = (lang, gender, category)
        matcher = ordinal_rules.get(lookup_key)
        return matcher.search_suffix(str(num)) if matcher else None

    raise RuntimeError('ordinal rules not configured')
|