[numex] Moving numex script to a different subpackage, adding function for creating ordinals

This commit is contained in:
Al
2016-03-18 20:36:22 -04:00
parent e6b59980e7
commit 5e2d9f371e
4 changed files with 73 additions and 3 deletions

View File

View File

@@ -7,13 +7,14 @@ this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode
from unicode_paths import DATA_DIR
from geodata.i18n.unicode_paths import DATA_DIR
class InvalidNumexRuleException(Exception):
pass
NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex')
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'numex')
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')

View File

@@ -0,0 +1,69 @@
import os
import six
import sys
import ujson as json
from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter
from geodata.encoding import safe_encode, safe_decode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalTrie(PhraseFilter):
def __init__(self, ordinal_rules):
self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
self.configured = True
def search_substring(self, s):
if len(s) == 0:
return None, 0
for i in xrange(len(s) + 1):
if not self.trie.has_keys_with_prefix(s[:i]):
i -= 1
break
if i > 0:
return (self.trie.get(s[:i]), i)
else:
return None, 0
def search_suffix(self, token):
suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
if suffix_search:
return suffix_search[0].split('|')
else:
return None
ordinal_rules = {}
ordinal_rules_configured = False
def init_ordinal_rules(d=NUMEX_DATA_DIR):
global ordinal_rules, ordinal_rules_configured
for filename in os.listdir(d):
if filename.endswith('.json'):
lang = filename.split('.json')[0]
f = open(os.path.join(d, filename))
data = json.load(f)
rules = data.get('ordinal_indicators')
if rules is not None and hasattr(rules, '__getslice__'):
lang_rules = []
for rule_set in rules:
gender = rule_set.get('gender', None)
category = rule_set.get('category', None)
ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])
ordinal_rules_configured = True
def ordinal_suffixes(num, lang, gender=None, category=None):
if not ordinal_rules_configured:
raise RuntimeError('ordinal rules not configured')
trie = ordinal_rules.get((lang, gender, category))
if not trie:
return None
return trie.search_suffix(str(num))