Files
2025-09-06 22:03:29 -04:00

109 lines
3.9 KiB
Python

import bisect
import math
import os
import operator
import random
import six
import sys
import yaml
from collections import defaultdict
from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter
from geodata.encoding import safe_encode, safe_decode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalSuffixTrie(PhraseFilter):
    """Trie mapping number endings to their candidate ordinal suffixes.

    Keys are stored reversed so that matching the *ending* of a number
    becomes a prefix walk over the reversed number. Each value is the list
    of suffixes for that ending, joined with ``|`` and UTF-8 encoded.
    """

    def __init__(self, ordinal_rules):
        """Build the trie from a mapping of ending -> iterable of suffixes."""
        self.trie = BytesTrie([
            (safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8'))
            for k, v in six.iteritems(ordinal_rules)
        ])
        self.configured = True

    def search_substring(self, s):
        """Find the longest prefix of ``s`` that is a key in the trie.

        :param s: text to match (already reversed by the caller when a
            suffix match is wanted)
        :return: ``(values, length)`` where ``values`` is the list of encoded
            values stored under the matched key and ``length`` is the number
            of characters matched, or ``(None, 0)`` when nothing matches.
        """
        if len(s) == 0:
            return None, 0

        # Extend the prefix for as long as some key in the trie starts with it.
        # ``range`` rather than ``xrange`` so this also runs on Python 3.
        end = 0
        for i in range(1, len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                break
            end = i

        # A prefix that is reachable in the trie is not necessarily a stored
        # key itself (it may only be the start of a longer key), so back off
        # to the longest prefix that actually has a value.
        while end > 0:
            values = self.trie.get(s[:end])
            if values:
                return values, end
            end -= 1

        return None, 0

    def search_suffix(self, token):
        """Return the candidate suffixes for the longest matching ending of ``token``.

        :param token: the number as a string
        :return: list of UTF-8 encoded suffixes, or ``None`` if no rule matches
        """
        # Decode before reversing, mirroring how the keys are built in __init__
        suffix_search, suffix_len = self.search_substring(safe_decode(token)[::-1])
        if suffix_search:
            # Stored values are bytes, so the separator must be bytes as well
            return suffix_search[0].split(b'|')
        return None
class OrdinalExpressions(object):
    """Per-language cardinal/ordinal rules and ordinal suffixes read from YAML.

    Every ``*.yaml`` file in ``base_dir`` is treated as one language; the
    language code is the part of the filename before ``.yaml``.

    Attributes populated by ``__init__``:
        cardinal_rules: lang -> list of cardinal rule dicts
        cardinal_rules_ones: (lang, gender, category) -> name of the
            cardinal rule whose value is 1
        ordinal_rules: lang -> {(value, gender, category): [names]}
        ordinal_suffix_rules: (lang, gender, category) -> OrdinalSuffixTrie
    """

    def __init__(self, base_dir=NUMEX_DATA_DIR):
        """Load all language files found directly inside ``base_dir``."""
        self.cardinal_rules = {}
        self.cardinal_rules_ones = {}
        self.ordinal_rules = {}
        self.ordinal_suffix_rules = {}

        for filename in os.listdir(base_dir):
            if not filename.endswith('.yaml'):
                continue

            lang = filename.split('.yaml')[0]

            # Binary mode lets the YAML parser handle decoding instead of
            # relying on the locale's default encoding; the context manager
            # guarantees the handle is closed.
            with open(os.path.join(base_dir, filename), 'rb') as f:
                # safe_load: these files are plain data, and yaml.load()
                # without an explicit Loader is rejected by newer PyYAML.
                # An empty file parses to None, hence the fallback.
                data = yaml.safe_load(f) or {}

            self._load_rules(lang, data.get('rules'))
            self._load_ordinal_indicators(lang, data.get('ordinal_indicators'))

    def _load_rules(self, lang, rules):
        """Index the cardinal and ordinal entries of one language's ``rules`` list."""
        # Explicit sequence check: the old ``__getslice__`` probe only exists
        # on Python 2 and made every file look empty on Python 3.
        if not isinstance(rules, (list, tuple)):
            return

        cardinals = []
        ordinals = defaultdict(list)

        for rule in rules:
            name = rule.get('name')
            value = rule.get('value')
            rule_type = rule.get('type')

            # Exact type comparison (not isinstance) so that booleans are
            # not accepted as numeric values.
            if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
                continue

            gender = rule.get('gender', None)
            category = rule.get('category', None)

            if rule_type == 'ordinal':
                ordinals[(value, gender, category)].append(name)
            else:
                cardinals.append(rule)
                if value == 1:
                    self.cardinal_rules_ones[(lang, gender, category)] = name

        self.cardinal_rules[lang] = cardinals
        self.ordinal_rules[lang] = ordinals

    def _load_ordinal_indicators(self, lang, ordinal_indicators):
        """Build one suffix trie per (gender, category) rule set of a language."""
        if not isinstance(ordinal_indicators, (list, tuple)):
            return

        for rule_set in ordinal_indicators:
            gender = rule_set.get('gender', None)
            category = rule_set.get('category', None)
            self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])

    def get_suffixes(self, num, lang, gender=None, category=None):
        """Return all candidate ordinal suffixes for ``num``.

        :return: list of suffixes, or ``None`` when there is no suffix trie
            for ``(lang, gender, category)`` or no ending of ``num`` matches.
        """
        trie = self.ordinal_suffix_rules.get((lang, gender, category))
        if not trie:
            return None
        return trie.search_suffix(str(num))

    def get_suffix(self, num, lang, gender=None, category=None):
        """Return one randomly chosen ordinal suffix for ``num``, or ``None``."""
        suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
        if not suffixes:
            return None
        return random.choice(suffixes)

    def suffixed_number(self, num, lang, gender=None, category=None):
        """Return ``num`` followed by a randomly chosen ordinal suffix.

        :return: text string of the number plus suffix, or ``None`` when no
            suffix is available.
        """
        suffix = self.get_suffix(num, lang, gender=gender, category=category)
        if not suffix:
            return None
        return six.u('{}{}').format(safe_decode(num), safe_decode(suffix))
# Shared module-level instance; constructing it reads every YAML file in
# NUMEX_DATA_DIR, so this work happens when the module is imported.
ordinal_expressions = OrdinalExpressions(base_dir=NUMEX_DATA_DIR)