From 23525df39d93254e979070c6984e6c24fe7294f5 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 8 Apr 2016 17:10:10 -0400 Subject: [PATCH] [numex] Nicer API for ordinal suffixes --- scripts/geodata/numbers/ordinals.py | 88 ++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/scripts/geodata/numbers/ordinals.py b/scripts/geodata/numbers/ordinals.py index d2f439ea..36555238 100644 --- a/scripts/geodata/numbers/ordinals.py +++ b/scripts/geodata/numbers/ordinals.py @@ -1,9 +1,13 @@ +import bisect +import math import os +import operator +import random import six import sys +import yaml -import ujson as json - +from collections import defaultdict from marisa_trie import BytesTrie from geodata.text.phrases import PhraseFilter @@ -13,9 +17,9 @@ from geodata.i18n.unicode_paths import DATA_DIR from geodata.numbers.numex import NUMEX_DATA_DIR -class OrdinalTrie(PhraseFilter): +class OrdinalSuffixTrie(PhraseFilter): def __init__(self, ordinal_rules): - self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)]) + self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)]) self.configured = True def search_substring(self, s): @@ -38,32 +42,60 @@ class OrdinalTrie(PhraseFilter): else: return None -ordinal_rules = {} -ordinal_rules_configured = False +class OrdinalExpressions(object): + def __init__(self, base_dir=NUMEX_DATA_DIR): + self.cardinal_rules = {} + self.cardinal_rules_ones = {} -def init_ordinal_rules(d=NUMEX_DATA_DIR): - global ordinal_rules, ordinal_rules_configured - for filename in os.listdir(d): - if filename.endswith('.json'): - lang = filename.split('.json')[0] - f = open(os.path.join(d, filename)) - data = json.load(f) - rules = data.get('ordinal_indicators') - if rules is not None and hasattr(rules, '__getslice__'): - lang_rules = [] - for rule_set in rules: - gender = rule_set.get('gender', None) - category = rule_set.get('category', None) - ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes']) - ordinal_rules_configured = True + self.ordinal_rules = {} + self.ordinal_suffix_rules = {} + for filename in os.listdir(base_dir): + if filename.endswith('.yaml'): + lang = filename.split('.yaml')[0] + f = open(os.path.join(base_dir, filename)) + data = yaml.load(f) -def ordinal_suffixes(num, lang, gender=None, category=None): - if not ordinal_rules_configured: - raise RuntimeError('ordinal rules not configured') - trie = ordinal_rules.get((lang, gender, category)) - if not trie: - return None + rules = data.get('rules') + if rules is not None and hasattr(rules, '__getslice__'): + cardinals = [] + ordinals = defaultdict(list) + for rule in rules: + name = rule.get('name') + value = rule.get('value') + rule_type = rule.get('type') + if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'): + continue + gender = rule.get('gender', None) + category = rule.get('category', None) + if rule_type == 'ordinal': + ordinals[(value, gender, category)].append(name) + else: + cardinals.append(rule) + if value == 1: + self.cardinal_rules_ones[(lang, gender, category)] = name - return trie.search_suffix(str(num)) + self.cardinal_rules[lang] = cardinals + self.ordinal_rules[lang] = ordinals + + ordinal_indicators = data.get('ordinal_indicators') + if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'): + for rule_set in ordinal_indicators: + gender = rule_set.get('gender', None) + category = rule_set.get('category', None) + self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes']) + + def get_suffixes(self, num, lang, gender=None, category=None): + trie = self.ordinal_suffix_rules.get((lang, gender, category)) + if not trie: + return None + + return trie.search_suffix(str(num)) + + def suffix(self, num, lang, gender=None, category=None): + suffixes = self.get_suffixes(num, lang, gender=gender, category=category) + suffix = random.choice(suffixes) + return six.u('{}{}').format(safe_decode(num), safe_decode(suffix)) + +ordinal_expressions = OrdinalExpressions()