[numex] Moving numex script to a different subpackage, adding function for creating ordinals

2016-03-18 20:36:22 -04:00
parent e6b59980e7
commit 5e2d9f371e
4 changed files with 73 additions and 3 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,7 +26,7 @@ addons:
 before_script:
    - ./bootstrap.sh
    - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
-    - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/i18n/numex.py; fi;
+    - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
    - if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
 install:
    - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
--- a/scripts/geodata/numbers/init.py
+++ b/scripts/geodata/numbers/init.py
--- a/scripts/geodata/numbers/numex.py
+++ b/scripts/geodata/numbers/numex.py
@@ -7,13 +7,14 @@ this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

 from geodata.encoding import safe_encode
-from unicode_paths import DATA_DIR
+from geodata.i18n.unicode_paths import DATA_DIR


 class InvalidNumexRuleException(Exception):
    pass

-NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex')
+NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                              'resources', 'numex')

 NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')

--- a/scripts/geodata/numbers/ordinals.py
+++ b/scripts/geodata/numbers/ordinals.py
@@ -0,0 +1,69 @@
+import os
+import six
+import sys
+
+import ujson as json
+
+from marisa_trie import BytesTrie
+
+from geodata.text.phrases import PhraseFilter
+from geodata.encoding import safe_encode, safe_decode
+from geodata.i18n.unicode_paths import DATA_DIR
+
+from geodata.numbers.numex import NUMEX_DATA_DIR
+
+
+class OrdinalTrie(PhraseFilter):
+    def __init__(self, ordinal_rules):
+        self.trie = BytesTrie([(k[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
+        self.configured = True
+
+    def search_substring(self, s):
+        if len(s) == 0:
+            return None, 0
+
+        for i in xrange(len(s) + 1):
+            if not self.trie.has_keys_with_prefix(s[:i]):
+                i -= 1
+                break
+        if i > 0:
+            return (self.trie.get(s[:i]), i)
+        else:
+            return None, 0
+
+    def search_suffix(self, token):
+        suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
+        if suffix_search:
+            return suffix_search[0].split('|')
+        else:
+            return None
+
+ordinal_rules = {}
+ordinal_rules_configured = False
+
+
+def init_ordinal_rules(d=NUMEX_DATA_DIR):
+    global ordinal_rules, ordinal_rules_configured
+    for filename in os.listdir(d):
+        if filename.endswith('.json'):
+            lang = filename.split('.json')[0]
+            f = open(os.path.join(d, filename))
+            data = json.load(f)
+            rules = data.get('ordinal_indicators')
+            if rules is not None and hasattr(rules, '__getslice__'):
+                lang_rules = []
+                for rule_set in rules:
+                    gender = rule_set.get('gender', None)
+                    category = rule_set.get('category', None)
+                    ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])
+    ordinal_rules_configured = True
+
+
+def ordinal_suffixes(num, lang, gender=None, category=None):
+    if not ordinal_rules_configured:
+        raise RuntimeError('ordinal rules not configured')
+    trie = ordinal_rules.get((lang, gender, category))
+    if not trie:
+        return None
+
+    return trie.search_suffix(str(num))