From 95934ab2138a662580878622d2e03e63c4adc77f Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 18 Mar 2016 20:36:22 -0400
Subject: [PATCH] [numex] Moving numex script to a different subpackage, adding
 function for creating ordinals

---
 .travis.yml                                |  2 +-
 scripts/geodata/numbers/__init__.py        |  0
 scripts/geodata/{i18n => numbers}/numex.py |  5 +-
 scripts/geodata/numbers/ordinals.py        | 85 ++++++++++++++++++++++
 4 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 scripts/geodata/numbers/__init__.py
 rename scripts/geodata/{i18n => numbers}/numex.py (97%)
 create mode 100644 scripts/geodata/numbers/ordinals.py

diff --git a/.travis.yml b/.travis.yml
index 67e41b2c..bb883cfc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,7 +23,7 @@ addons:
 before_script:
   - ./bootstrap.sh
   - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
-  - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/i18n/numex.py; fi;
+  - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
   - if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
 install:
   - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
diff --git a/scripts/geodata/numbers/__init__.py b/scripts/geodata/numbers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/i18n/numex.py b/scripts/geodata/numbers/numex.py
similarity index 97%
rename from scripts/geodata/i18n/numex.py
rename to scripts/geodata/numbers/numex.py
index 5cdc6f4f..dba7bb72 100644
--- a/scripts/geodata/i18n/numex.py
+++ b/scripts/geodata/numbers/numex.py
@@ -7,13 +7,14 @@ this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.encoding import safe_encode
-from unicode_paths import DATA_DIR
+from geodata.i18n.unicode_paths import DATA_DIR
 
 
 class InvalidNumexRuleException(Exception):
     pass
 
-NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex')
+NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                              'resources', 'numex')
 
 NUMEX_RULES_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
 
diff --git a/scripts/geodata/numbers/ordinals.py b/scripts/geodata/numbers/ordinals.py
new file mode 100644
index 00000000..d2f439ea
--- /dev/null
+++ b/scripts/geodata/numbers/ordinals.py
@@ -0,0 +1,85 @@
+import os
+import six
+import sys
+
+import ujson as json
+
+from marisa_trie import BytesTrie
+
+from geodata.text.phrases import PhraseFilter
+from geodata.encoding import safe_encode, safe_decode
+from geodata.i18n.unicode_paths import DATA_DIR
+
+from geodata.numbers.numex import NUMEX_DATA_DIR
+
+
+class OrdinalTrie(PhraseFilter):
+    """Suffix lookup table for the ordinal indicators of one rule set.
+
+    Keys are stored reversed, so the ending of a token becomes a prefix
+    that can be looked up in the trie.
+    """
+
+    def __init__(self, ordinal_rules):
+        """Build the trie from a mapping of key -> list of strings."""
+        self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8'))
+                               for k, v in six.iteritems(ordinal_rules)])
+        self.configured = True
+
+    def search_substring(self, s):
+        """Return (values, length) for the longest key that starts s.
+
+        Returns (None, 0) when no key is a prefix of s.
+        """
+        # Try the longest candidate first. Only real keys count: a string
+        # that merely begins a longer key has no value of its own.
+        for end in range(len(s), 0, -1):
+            values = self.trie.get(s[:end])
+            if values:
+                return values, end
+        return None, 0
+
+    def search_suffix(self, token):
+        """Return the suffix list matching the end of token, or None."""
+        suffix_search, _ = self.search_substring(safe_decode(token[::-1]))
+        if suffix_search:
+            # Values are UTF-8 bytes, so split on a bytes separator
+            return suffix_search[0].split(b'|')
+        else:
+            return None
+
+ordinal_rules = {}
+ordinal_rules_configured = False
+
+
+def init_ordinal_rules(d=NUMEX_DATA_DIR):
+    """Load the 'ordinal_indicators' rules from every .json file in d."""
+    global ordinal_rules, ordinal_rules_configured
+    for filename in os.listdir(d):
+        if filename.endswith('.json'):
+            lang = filename.split('.json')[0]
+            # Close each file promptly rather than leaking the handle
+            with open(os.path.join(d, filename)) as f:
+                data = json.load(f)
+            rules = data.get('ordinal_indicators')
+            # '__getslice__' only exists on Python 2 sequences
+            if isinstance(rules, (list, tuple)):
+                for rule_set in rules:
+                    gender = rule_set.get('gender', None)
+                    category = rule_set.get('category', None)
+                    ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])
+    ordinal_rules_configured = True
+
+
+def ordinal_suffixes(num, lang, gender=None, category=None):
+    """Return the ordinal suffixes for num, or None if nothing matches.
+
+    Raises RuntimeError when the ordinal rules have not been configured.
+    """
+    if not ordinal_rules_configured:
+        raise RuntimeError('ordinal rules not configured')
+    trie = ordinal_rules.get((lang, gender, category))
+    if not trie:
+        return None
+
+    return trie.search_suffix(str(num))