[languages] Refactoring street_types_gazetteer a bit so dictionaries are configurable
This commit is contained in:
@@ -29,7 +29,10 @@ POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii',
|
|||||||
'm', 'mm', 'mmm', 'mmmm'])
|
'm', 'mm', 'mmm', 'mmmm'])
|
||||||
|
|
||||||
|
|
||||||
class StreetTypesGazetteer(PhraseFilter):
|
class DictionaryPhraseFilter(PhraseFilter):
|
||||||
|
def __init__(self, *dictionaries):
|
||||||
|
self.dictionaries = dictionaries
|
||||||
|
|
||||||
def serialize(self, s):
|
def serialize(self, s):
|
||||||
return s
|
return s
|
||||||
|
|
||||||
@@ -39,7 +42,10 @@ class StreetTypesGazetteer(PhraseFilter):
|
|||||||
def configure(self, base_dir=DICTIONARIES_DIR):
|
def configure(self, base_dir=DICTIONARIES_DIR):
|
||||||
kvs = defaultdict(OrderedDict)
|
kvs = defaultdict(OrderedDict)
|
||||||
for lang in os.listdir(DICTIONARIES_DIR):
|
for lang in os.listdir(DICTIONARIES_DIR):
|
||||||
for filename in ('street_types.txt', 'directionals.txt'):
|
for filename in self.dictionaries:
|
||||||
|
is_suffix_dictionary = 'suffixes' in filename
|
||||||
|
is_prefix_dictionary = 'prefixes' in filename
|
||||||
|
|
||||||
path = os.path.join(DICTIONARIES_DIR, lang, filename)
|
path = os.path.join(DICTIONARIES_DIR, lang, filename)
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
continue
|
continue
|
||||||
@@ -51,21 +57,9 @@ class StreetTypesGazetteer(PhraseFilter):
|
|||||||
for phrase in safe_decode(line).split(u'|'):
|
for phrase in safe_decode(line).split(u'|'):
|
||||||
if phrase in POSSIBLE_ROMAN_NUMERALS:
|
if phrase in POSSIBLE_ROMAN_NUMERALS:
|
||||||
continue
|
continue
|
||||||
kvs[phrase][lang] = None
|
if is_suffix_dictionary:
|
||||||
for filename in ('concatenated_suffixes_separable.txt', 'concatenated_suffixes_inseparable.txt', 'concatenated_prefixes_separable.txt'):
|
|
||||||
path = os.path.join(DICTIONARIES_DIR, lang, filename)
|
|
||||||
if not os.path.exists(path):
|
|
||||||
continue
|
|
||||||
|
|
||||||
for line in open(path):
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for phrase in safe_decode(line).split(u'|'):
|
|
||||||
if 'suffixes' in filename:
|
|
||||||
phrase = SUFFIX_KEY + phrase[::-1]
|
phrase = SUFFIX_KEY + phrase[::-1]
|
||||||
else:
|
elif is_prefix_dictionary:
|
||||||
phrase = PREFIX_KEY + phrase
|
phrase = PREFIX_KEY + phrase
|
||||||
|
|
||||||
kvs[phrase][lang] = None
|
kvs[phrase][lang] = None
|
||||||
@@ -89,7 +83,7 @@ class StreetTypesGazetteer(PhraseFilter):
|
|||||||
return None, 0
|
return None, 0
|
||||||
|
|
||||||
def basic_filter(self, tokens):
|
def basic_filter(self, tokens):
|
||||||
return super(StreetTypesGazetteer, self).filter(tokens)
|
return super(DictionaryPhraseFilter, self).filter(tokens)
|
||||||
|
|
||||||
def filter(self, tokens):
|
def filter(self, tokens):
|
||||||
for c, t, data in self.basic_filter(tokens):
|
for c, t, data in self.basic_filter(tokens):
|
||||||
@@ -102,13 +96,16 @@ class StreetTypesGazetteer(PhraseFilter):
|
|||||||
yield (token_types.PHRASE, [(c,) + t], suffix_search)
|
yield (token_types.PHRASE, [(c,) + t], suffix_search)
|
||||||
continue
|
continue
|
||||||
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
|
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
|
||||||
print 'prefix = ', prefix_search, token[:(prefix_len - len(PREFIX_KEY))]
|
|
||||||
if prefix_search and self.trie.get(token[:(prefix_len - len(PREFIX_KEY))]):
|
if prefix_search and self.trie.get(token[:(prefix_len - len(PREFIX_KEY))]):
|
||||||
yield (token_types.PHRASE, [(c,) + t], prefix_search)
|
yield (token_types.PHRASE, [(c,) + t], prefix_search)
|
||||||
continue
|
continue
|
||||||
yield c, t, data
|
yield c, t, data
|
||||||
|
|
||||||
street_types_gazetteer = StreetTypesGazetteer()
|
street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
|
||||||
|
'directionals.txt',
|
||||||
|
'concatenated_suffixes_separable.txt',
|
||||||
|
'concatenated_suffixes_inseparable.txt',
|
||||||
|
'concatenated_prefixes_separable.txt')
|
||||||
|
|
||||||
|
|
||||||
UNKNOWN_LANGUAGE = 'unk'
|
UNKNOWN_LANGUAGE = 'unk'
|
||||||
|
|||||||
Reference in New Issue
Block a user