# libpostal/scripts/geodata/phrases/extraction.py

import csv
import six

from collections import defaultdict, Counter
from itertools import islice
from six.moves import range, zip

from geodata.text.tokenize import tokenize, token_types
from geodata.encoding import safe_encode

class FrequentPhraseExtractor(object):
    '''
    Extract common multi-word phrases from a file/iterator using the
    frequent itemsets method to keep memory usage low.
    '''

    WORD_TOKEN_TYPES = (token_types.WORD,
                        token_types.IDEOGRAPHIC_CHAR,
                        token_types.ABBREVIATION,
                        token_types.HANGUL_SYLLABLE,
                        token_types.ACRONYM)
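
    # Apriori-style pruning: an n-gram is only counted when its (n - 1)-token
    # prefix already survived the previous pass, so memory stays bounded by
    # the number of frequent phrases rather than all distinct n-grams.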

    def __init__(self, min_count=5):
        self.min_count = min_count
        self.vocab = defaultdict(int)
        self.frequencies = defaultdict(int)
        self.train_words = 0

    def ngrams(self, words, n=2):
        # Sliding window of n consecutive tokens, e.g. for n=2,
        # ['a', 'b', 'c'] yields ('a', 'b'), ('b', 'c')
        for t in zip(*(islice(words, i, None) for i in range(n))):
            yield t

    def add_tokens(self, s):
        # Unigram pass: count each word-like token as a 1-tuple of
        # (lowercased text, token class)
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                self.vocab[((t.lower(), c), )] += 1
                self.train_words += 1

    def create_vocab(self, f):
        # First pass over the input: count unigrams, then discard rare ones
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            self.add_tokens(line)

        self.prune_vocab()

    def prune_vocab(self):
        # Copy the keys so we can delete from the dict while iterating
        # (dict views raise a RuntimeError on Python 3 otherwise)
        for k in list(self.vocab.keys()):
            if self.vocab[k] < self.min_count:
                del self.vocab[k]

    def add_ngrams(self, s, n=2):
        # Split the input into maximal runs of word-like tokens; n-grams
        # never cross punctuation or other non-word boundaries
        sequences = []
        seq = []
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                seq.append((t, c))
            elif seq:
                sequences.append(seq)
                seq = []

        if seq:
            sequences.append(seq)

        for seq in sequences:
            for gram in self.ngrams(seq, n=n):
                # Only count an n-gram if its (n - 1)-token prefix is
                # already a frequent phrase from the previous pass
                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
                if prev_tokens in self.vocab:
                    t, c = gram[-1]
                    current_token = (t.lower(), c)
                    self.frequencies[(prev_tokens, current_token)] += 1

    def add_frequent_ngrams_to_vocab(self):
        # Promote (prefix, next token) pairs that met the count threshold
        # to full phrases in the vocab
        for k, v in six.iteritems(self.frequencies):
            if v < self.min_count:
                continue
            prev, current = k
            self.vocab[prev + (current,)] = v

    def find_ngram_phrases(self, f, n=2):
        # One full pass over the input per value of n
        self.frequencies = defaultdict(int)
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            self.add_ngrams(line, n=n)

        self.add_frequent_ngrams_to_vocab()
        self.frequencies = defaultdict(int)

    @classmethod
    def from_file(cls, f, max_phrase_len=5, min_count=5):
        phrases = cls(min_count=min_count)
        filename = getattr(f, 'name', '<input>')
        print('Doing frequent words for {}'.format(filename))
        f.seek(0)
        phrases.create_vocab(f)
        for n in range(2, max_phrase_len + 1):
            print('Doing frequent ngrams, n={} for {}'.format(n, filename))
            f.seek(0)
            phrases.find_ngram_phrases(f, n=n)
        print('Done with {}'.format(filename))
        return phrases

    def to_tsv(self, filename, mode='w', max_rows=None):
        # Write (phrase, token count, frequency) rows, most frequent first
        with open(filename, mode) as f:
            writer = csv.writer(f, delimiter='\t')
            for i, (k, v) in enumerate(Counter(self.vocab).most_common()):
                if max_rows is not None and i == max_rows:
                    break
                gram = []
                for t, c in k:
                    gram.append(t)
                    # Ideographic scripts are written without spaces
                    if c != token_types.IDEOGRAPHIC_CHAR:
                        gram.append(six.text_type(' '))
                # strip() drops the space appended after the final token
                phrase = six.text_type('').join(gram).strip()
                writer.writerow((safe_encode(phrase), safe_encode(len(k)), safe_encode(v)))
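

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the input path,
    # output path, and parameter values below are hypothetical. Assumes a
    # seekable text file with one document per line.
    with open('lines.txt') as f:
        extractor = FrequentPhraseExtractor.from_file(f, max_phrase_len=3, min_count=10)
    extractor.to_tsv('phrases.tsv')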