[numex] Moving numex script to a different subpackage, adding function for creating ordinals
This commit is contained in:
@@ -26,7 +26,7 @@ addons:
|
||||
before_script:
|
||||
- ./bootstrap.sh
|
||||
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
|
||||
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/i18n/numex.py; fi;
|
||||
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
|
||||
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
|
||||
install:
|
||||
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
|
||||
|
||||
0
scripts/geodata/numbers/__init__.py
Normal file
0
scripts/geodata/numbers/__init__.py
Normal file
@@ -7,13 +7,14 @@ this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_encode
|
||||
from unicode_paths import DATA_DIR
|
||||
from geodata.i18n.unicode_paths import DATA_DIR
|
||||
|
||||
|
||||
class InvalidNumexRuleException(Exception):
    """Exception type for invalid numex rules.

    NOTE(review): the raise sites are outside this excerpt -- confirm the
    exact conditions against the rest of the module.
    """
|
||||
|
||||
NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex')
|
||||
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'numex')
|
||||
|
||||
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
|
||||
|
||||
69
scripts/geodata/numbers/ordinals.py
Normal file
69
scripts/geodata/numbers/ordinals.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
|
||||
import ujson as json
|
||||
|
||||
from marisa_trie import BytesTrie
|
||||
|
||||
from geodata.text.phrases import PhraseFilter
|
||||
from geodata.encoding import safe_encode, safe_decode
|
||||
from geodata.i18n.unicode_paths import DATA_DIR
|
||||
|
||||
from geodata.numbers.numex import NUMEX_DATA_DIR
|
||||
|
||||
|
||||
class OrdinalTrie(PhraseFilter):
    """Trie that maps the trailing characters of a token to ordinal suffixes.

    Keys are stored reversed, so matching the *end* of a token becomes a
    prefix search over the reversed token.
    """

    def __init__(self, ordinal_rules):
        """Build the trie from ``ordinal_rules``.

        :param ordinal_rules: mapping of key string to an iterable of suffix
            strings. Each key is reversed before insertion and its suffixes
            are stored as a single '|'-joined, UTF-8 encoded value.
            (Presumably keys are trailing digit sequences -- confirm against
            the numex data files.)

        NOTE: PhraseFilter.__init__ is not called; the trie and the
        ``configured`` flag are set directly.
        """
        self.trie = BytesTrie([
            (k[::-1], safe_decode('|').join(v).encode('utf-8'))
            for k, v in six.iteritems(ordinal_rules)
        ])
        self.configured = True

    def search_substring(self, s):
        """Find the longest prefix of ``s`` that is a key in the trie.

        :param s: string to match from its start (callers pass a reversed
            token).
        :returns: ``(values, length)`` where ``values`` is the list returned
            by the trie for the matched key and ``length`` is the number of
            characters matched, or ``(None, 0)`` when nothing matches.
        """
        if not s:
            return None, 0

        # Extend the candidate while at least one key still starts with it.
        longest = 0
        while longest < len(s) and self.trie.has_keys_with_prefix(s[:longest + 1]):
            longest += 1

        # A prefix shared by longer keys is not necessarily a key itself, so
        # back off to the longest prefix that actually has a stored value.
        while longest > 0:
            values = self.trie.get(s[:longest])
            if values:
                return values, longest
            longest -= 1

        return None, 0

    def search_suffix(self, token):
        """Return the suffixes stored for the longest matching end of ``token``.

        :param token: string whose trailing characters are matched.
        :returns: list of suffixes (split from the first stored value on
            '|'), or ``None`` when no key matches.
        """
        values, _ = self.search_substring(safe_decode(token[::-1]))
        if not values:
            return None
        # Stored values are UTF-8 bytes; b'|' is the same as '|' on Python 2
        # and avoids a bytes/str TypeError on Python 3.
        return values[0].split(b'|')
|
||||
|
||||
# Maps (lang, gender, category) tuples to OrdinalTrie instances; filled in by
# init_ordinal_rules().
ordinal_rules = {}
|
||||
# Set to True by init_ordinal_rules(); ordinal_suffixes() raises RuntimeError
# while this is still False.
ordinal_rules_configured = False
|
||||
|
||||
|
||||
def init_ordinal_rules(d=NUMEX_DATA_DIR):
    """Load ordinal indicator rules from every ``*.json`` file in ``d``.

    For each file, the language is the filename up to ``.json`` and the
    ``ordinal_indicators`` entry (when present and a list) is read. Every rule
    set in it is compiled into an ``OrdinalTrie`` built from its ``suffixes``
    and registered in the module-level ``ordinal_rules`` dict under
    ``(lang, gender, category)``; ``gender`` and ``category`` default to
    ``None`` when the rule set omits them.

    :param d: directory containing the per-language JSON files.
    :raises KeyError: if a rule set has no ``suffixes`` entry.
    """
    global ordinal_rules, ordinal_rules_configured

    for filename in os.listdir(d):
        if not filename.endswith('.json'):
            continue

        lang = filename.split('.json')[0]

        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(d, filename)) as f:
            data = json.load(f)

        rules = data.get('ordinal_indicators')
        # Only a JSON array of rule sets is meaningful here; a missing entry
        # or any other type is skipped.
        if not isinstance(rules, (list, tuple)):
            continue

        for rule_set in rules:
            gender = rule_set.get('gender', None)
            category = rule_set.get('category', None)
            ordinal_rules[(lang, gender, category)] = OrdinalTrie(rule_set['suffixes'])

    # Mark the registry usable once the whole directory has been scanned.
    ordinal_rules_configured = True
|
||||
|
||||
|
||||
def ordinal_suffixes(num, lang, gender=None, category=None):
    """Look up the ordinal suffixes for ``num`` in the given language.

    :param num: value whose ``str()`` form is matched against the rules.
    :param lang: language part of the rule key.
    :param gender: gender part of the rule key (``None`` by default).
    :param category: category part of the rule key (``None`` by default).
    :returns: the result of ``search_suffix`` on the trie registered for
        ``(lang, gender, category)``, or ``None`` when no trie is registered
        for that key.
    :raises RuntimeError: if the ordinal rules have not been configured yet.
    """
    if not ordinal_rules_configured:
        raise RuntimeError('ordinal rules not configured')

    key = (lang, gender, category)
    matcher = ordinal_rules.get(key)
    return matcher.search_suffix(str(num)) if matcher else None
|
||||
Reference in New Issue
Block a user