From 0afbc84a2bfb57130e0bf40342d78d061891158c Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 14 Jul 2016 17:07:39 -0400
Subject: [PATCH] [phrases] pulling phrase extraction out of command-line
 script so it's more reusable for other tasks

---
 scripts/geodata/phrases/extract_phrases.py | 126 +--------------------
 scripts/geodata/phrases/extraction.py      | 124 ++++++++++++++++++++
 2 files changed, 127 insertions(+), 123 deletions(-)
 create mode 100644 scripts/geodata/phrases/extraction.py

diff --git a/scripts/geodata/phrases/extract_phrases.py b/scripts/geodata/phrases/extract_phrases.py
index ef490c4b..d7b212dd 100644
--- a/scripts/geodata/phrases/extract_phrases.py
+++ b/scripts/geodata/phrases/extract_phrases.py
@@ -1,131 +1,11 @@
 import argparse
-import csv
 import os
-import six
 import sys
 
-from collections import defaultdict, Counter
-from itertools import izip, islice
-
 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
-from geodata.text.tokenize import tokenize, token_types
-from geodata.encoding import safe_encode
-
-
-class FrequentPhraseExtractor(object):
-    '''
-    Extract common multi-word phrases from a file/iterator using the
-    frequent itemsets method to keep memory usage low.
-    '''
-    WORD_TOKEN_TYPES = (token_types.WORD,
-                        token_types.IDEOGRAPHIC_CHAR,
-                        token_types.ABBREVIATION,
-                        token_types.HANGUL_SYLLABLE,
-                        token_types.ACRONYM)
-
-    def __init__(self, min_count=5):
-        self.min_count = min_count
-
-        self.vocab = defaultdict(int)
-        self.frequencies = defaultdict(int)
-        self.train_words = 0
-
-    def ngrams(self, words, n=2):
-        for t in izip(*(islice(words, i, None) for i in xrange(n))):
-            yield t
-
-    def add_tokens(self, s):
-        for t, c in tokenize(s):
-            if c in self.WORD_TOKEN_TYPES:
-                self.vocab[((t.lower(), c), )] += 1
-                self.train_words += 1
-
-    def create_vocab(self, f):
-        for line in f:
-            line = line.rstrip()
-            if not line:
-                continue
-            self.add_tokens(line)
-        self.prune_vocab()
-
-    def prune_vocab(self):
-        for k in self.vocab.keys():
-            if self.vocab[k] < self.min_count:
-                del self.vocab[k]
-
-    def add_ngrams(self, s, n=2):
-        sequences = []
-        seq = []
-        for t, c in tokenize(s):
-            if c in self.WORD_TOKEN_TYPES:
-                seq.append((t, c))
-            elif seq:
-                sequences.append(seq)
-                seq = []
-        if seq:
-            sequences.append(seq)
-
-        for seq in sequences:
-            for gram in self.ngrams(seq, n=n):
-                last_c = None
-
-                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
-                if prev_tokens in self.vocab:
-                    t, c = gram[-1]
-                    current_token = (t.lower(), c)
-
-                    self.frequencies[(prev_tokens, current_token)] += 1
-
-    def add_frequent_ngrams_to_vocab(self):
-        for k, v in six.iteritems(self.frequencies):
-            if v < self.min_count:
-                continue
-            prev, current = k
-            self.vocab[prev + (current,)] = v
-
-    def find_ngram_phrases(self, f, n=2):
-        self.frequencies = defaultdict(int)
-        for line in f:
-            line = line.rstrip()
-            if not line:
-                continue
-            self.add_ngrams(line, n=n)
-        self.add_frequent_ngrams_to_vocab()
-        self.frequencies = defaultdict(int)
-
-    @classmethod
-    def from_file(cls, filename, max_phrase_len=5, min_count=5):
-        phrases = cls()
-
-        print('Doing frequent words for {}'.format(filename))
-        phrases.create_vocab(open(filename))
-
-        for n in xrange(2, max_phrase_len + 1):
-            print('Doing frequent ngrams, n={} for {}'.format(n, filename))
-            phrases.find_ngram_phrases(open(filename), n=n)
-
-        print('Done with {}'.format(filename))
-
-        return phrases
-
-    def to_tsv(self, filename, mode='w', max_rows=None):
-        f = open(filename, mode)
-        writer = csv.writer(f, delimiter='\t')
-        for i, (k, v) in enumerate(Counter(self.vocab).most_common()):
-            if max_rows is not None and i == max_rows:
-                break
-
-            gram = []
-            for t, c in k:
-                gram.append(t)
-                if c != token_types.IDEOGRAPHIC_CHAR:
-                    gram.append(six.text_type(' '))
-
-            phrase = six.text_type('').join(gram)
-
-            writer.writerow((safe_encode(phrase), safe_encode(len(k)), safe_encode(v)))
+from geodata.phrases.extraction import FrequentPhraseExtractor
 
 if __name__ == '__main__':
     '''
@@ -163,7 +43,7 @@ if __name__ == '__main__':
     if args.phrase_len < 1:
         parser.error('--phrase-len must be >= 1')
 
-    phrases = FrequentPhraseExtractor.from_file(args.filename,
+    phrases = FrequentPhraseExtractor.from_file(open(args.filename),
                                                 min_count=args.min_count,
                                                 max_phrase_len=args.phrase_len)
-    phrases.to_tsv(args.output_file, max_rows=args.max_rows)
+    phrases.to_tsv(args.output_file, max_rows=args.max_rows)
\ No newline at end of file
diff --git a/scripts/geodata/phrases/extraction.py b/scripts/geodata/phrases/extraction.py
new file mode 100644
index 00000000..69aae4b9
--- /dev/null
+++ b/scripts/geodata/phrases/extraction.py
@@ -0,0 +1,124 @@
+import csv
+import six
+
+from collections import defaultdict, Counter
+from itertools import izip, islice
+
+from geodata.text.tokenize import tokenize, token_types
+from geodata.encoding import safe_encode
+
+
+class FrequentPhraseExtractor(object):
+    '''
+    Extract common multi-word phrases from a file/iterator using the
+    frequent itemsets method to keep memory usage low.
+    '''
+    WORD_TOKEN_TYPES = (token_types.WORD,
+                        token_types.IDEOGRAPHIC_CHAR,
+                        token_types.ABBREVIATION,
+                        token_types.HANGUL_SYLLABLE,
+                        token_types.ACRONYM)
+
+    def __init__(self, min_count=5):
+        self.min_count = min_count
+
+        self.vocab = defaultdict(int)
+        self.frequencies = defaultdict(int)
+        self.train_words = 0
+
+    def ngrams(self, words, n=2):
+        for t in izip(*(islice(words, i, None) for i in xrange(n))):
+            yield t
+
+    def add_tokens(self, s):
+        for t, c in tokenize(s):
+            if c in self.WORD_TOKEN_TYPES:
+                self.vocab[((t.lower(), c), )] += 1
+                self.train_words += 1
+
+    def create_vocab(self, f):
+        for line in f:
+            line = line.rstrip()
+            if not line:
+                continue
+            self.add_tokens(line)
+        self.prune_vocab()
+
+    def prune_vocab(self):
+        for k in self.vocab.keys():
+            if self.vocab[k] < self.min_count:
+                del self.vocab[k]
+
+    def add_ngrams(self, s, n=2):
+        sequences = []
+        seq = []
+        for t, c in tokenize(s):
+            if c in self.WORD_TOKEN_TYPES:
+                seq.append((t, c))
+            elif seq:
+                sequences.append(seq)
+                seq = []
+        if seq:
+            sequences.append(seq)
+
+        for seq in sequences:
+            for gram in self.ngrams(seq, n=n):
+                last_c = None
+
+                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
+                if prev_tokens in self.vocab:
+                    t, c = gram[-1]
+                    current_token = (t.lower(), c)
+
+                    self.frequencies[(prev_tokens, current_token)] += 1
+
+    def add_frequent_ngrams_to_vocab(self):
+        for k, v in six.iteritems(self.frequencies):
+            if v < self.min_count:
+                continue
+            prev, current = k
+            self.vocab[prev + (current,)] = v
+
+    def find_ngram_phrases(self, f, n=2):
+        self.frequencies = defaultdict(int)
+        for line in f:
+            line = line.rstrip()
+            if not line:
+                continue
+            self.add_ngrams(line, n=n)
+        self.add_frequent_ngrams_to_vocab()
+        self.frequencies = defaultdict(int)
+
+    @classmethod
+    def from_file(cls, f, max_phrase_len=5, min_count=5):
+        phrases = cls(min_count=min_count)
+        filename = getattr(f, 'name', f)
+        print('Doing frequent words for {}'.format(filename))
+        f.seek(0)
+        phrases.create_vocab(f)
+
+        for n in xrange(2, max_phrase_len + 1):
+            print('Doing frequent ngrams, n={} for {}'.format(n, filename))
+            f.seek(0)
+            phrases.find_ngram_phrases(f, n=n)
+
+        print('Done with {}'.format(filename))
+
+        return phrases
+
+    def to_tsv(self, filename, mode='w', max_rows=None):
+        f = open(filename, mode)
+        writer = csv.writer(f, delimiter='\t')
+        for i, (k, v) in enumerate(Counter(self.vocab).most_common()):
+            if max_rows is not None and i == max_rows:
+                break
+
+            gram = []
+            for t, c in k:
+                gram.append(t)
+                if c != token_types.IDEOGRAPHIC_CHAR:
+                    gram.append(six.text_type(' '))
+
+            phrase = six.text_type('').join(gram)
+
+            writer.writerow((safe_encode(phrase), safe_encode(len(k)), safe_encode(v)))
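
As a quick illustration of the reuse this change enables, here is a minimal sketch of driving the extracted module from another script rather than from the command line. It is illustrative only: the in-memory corpus, the lowered min_count, and the phrases.tsv output path are hypothetical, and it assumes the geodata package is importable (the command-line script arranges this by appending the scripts directory to sys.path).

    import io

    from geodata.phrases.extraction import FrequentPhraseExtractor

    # A tiny in-memory corpus standing in for a large text file. from_file()
    # calls f.seek(0) before each pass, so any seekable file-like object works.
    corpus = io.BytesIO(b'main street\n'
                        b'main street\n'
                        b'main street station\n'
                        b'main street station\n'
                        b'park avenue\n'
                        b'park avenue\n')

    # min_count is lowered so the toy corpus actually yields phrases.
    phrases = FrequentPhraseExtractor.from_file(corpus, max_phrase_len=3, min_count=2)

    # vocab maps tuples of (token, token_type) pairs to counts; keys with more
    # than one token are the extracted phrases.
    for gram, count in phrases.vocab.items():
        if len(gram) > 1:
            print('{}\t{}'.format(' '.join(t for t, _ in gram), count))

    # Or write (phrase, length in tokens, count) rows to a TSV file.
    phrases.to_tsv('phrases.tsv', max_rows=10)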