[languages][ci skip] Checking in script to extract address phrases in various languages using frequent itemsets

This commit is contained in:
Al
2016-03-08 14:35:20 -05:00
parent 991c3d2a40
commit 08085ee08b
2 changed files with 169 additions and 0 deletions


@@ -0,0 +1,169 @@
import argparse
import csv
import os
import six
import sys

from collections import defaultdict, Counter
from itertools import izip, islice

this_dir = os.path.realpath(os.path.dirname(__file__))
# Add the repo root (two directories up from this script) to the import path
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.text.tokenize import tokenize, token_types
from geodata.encoding import safe_encode

class FrequentPhraseExtractor(object):
    '''
    Extract common multi-word phrases from a file/iterator using the
    frequent itemsets method to keep memory usage low.
    '''
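    # This mirrors Apriori-style frequent itemset mining: an n-gram is only
    # counted when its (n-1)-token prefix already passed the min_count
    # threshold on the previous pass, so candidates are pruned level by
    # level and the counts stay small even on large corpora.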

    WORD_TOKEN_TYPES = (token_types.WORD,
                        token_types.IDEOGRAPHIC_CHAR,
                        token_types.ABBREVIATION,
                        token_types.HANGUL_SYLLABLE,
                        token_types.ACRONYM)

    def __init__(self, min_count=5):
        self.min_count = min_count
        self.vocab = defaultdict(int)
        self.frequencies = defaultdict(int)
        self.train_words = 0

    def ngrams(self, words, n=2):
        # Slide a window of n consecutive tokens over the sequence, e.g.
        # ngrams(['county', 'road', '12'], n=2) yields
        # ('county', 'road') and ('road', '12')
        for t in izip(*(islice(words, i, None) for i in xrange(n))):
            yield t

    def add_tokens(self, s):
        # Count each word-like token as a unigram "itemset"
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                self.vocab[((t.lower(), c), )] += 1
                self.train_words += 1

    def create_vocab(self, f):
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            self.add_tokens(line)
        self.prune_vocab()

    def prune_vocab(self):
        # Copy the keys so we can delete from the dict while iterating
        for k in list(self.vocab.keys()):
            if self.vocab[k] < self.min_count:
                del self.vocab[k]

    def add_ngrams(self, s, n=2):
        # Split the input into runs of contiguous word tokens; n-grams
        # never span punctuation or other non-word tokens
        sequences = []
        seq = []
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                seq.append((t, c))
            elif seq:
                sequences.append(seq)
                seq = []
        if seq:
            sequences.append(seq)

        for seq in sequences:
            for gram in self.ngrams(seq, n=n):
                # Only count an n-gram if its (n-1)-gram prefix is already
                # a frequent phrase in the vocab
                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
                if prev_tokens in self.vocab:
                    t, c = gram[-1]
                    current_token = (t.lower(), c)
                    self.frequencies[(prev_tokens, current_token)] += 1

    def add_frequent_ngrams_to_vocab(self):
        for k, v in six.iteritems(self.frequencies):
            if v < self.min_count:
                continue
            prev, current = k
            self.vocab[prev + (current,)] = v

    def find_ngram_phrases(self, f, n=2):
        # One pass over the corpus per value of n; reset the per-pass
        # candidate counts before and after so memory stays bounded
        self.frequencies = defaultdict(int)
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            self.add_ngrams(line, n=n)
        self.add_frequent_ngrams_to_vocab()
        self.frequencies = defaultdict(int)

    @classmethod
    def from_file(cls, filename, max_phrase_len=5, min_count=5):
        phrases = cls(min_count=min_count)
        print('Doing frequent words for {}'.format(filename))
        phrases.create_vocab(open(filename))
        for n in xrange(2, max_phrase_len + 1):
            print('Doing frequent ngrams, n={} for {}'.format(n, filename))
            phrases.find_ngram_phrases(open(filename), n=n)
        print('Done with {}'.format(filename))
        return phrases
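
    # The vocab is written as TSV with three columns:
    # phrase, number of tokens in the phrase, and count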
    def to_tsv(self, filename, mode='w', max_rows=None):
        f = open(filename, mode)
        writer = csv.writer(f, delimiter='\t')
        for i, (k, v) in enumerate(Counter(self.vocab).most_common()):
            if max_rows is not None and i == max_rows:
                break
            gram = []
            for t, c in k:
                gram.append(t)
                # Ideograms are not separated by spaces; other tokens are
                if c != token_types.IDEOGRAPHIC_CHAR:
                    gram.append(six.text_type(' '))
            phrase = six.text_type('').join(gram).strip()
            writer.writerow((safe_encode(phrase), safe_encode(len(k)), safe_encode(v)))
        f.close()

if __name__ == '__main__':
    '''
    Extract frequent words and multi-word phrases from an input file. The
    input file is expected to be a simple text file with one "sentence" per
    line. For OSM we typically use only street names and venue names.

    Phrases are considered to be sequences of n contiguous tokens, provided
    that all of the tokens are of a "word" type according to the libpostal
    tokenizer, which implements the full Unicode TR-29 spec and will e.g.
    treat ideograms as individual tokens even though they are usually not
    separated by whitespace or punctuation.

    Using phrases is not only helpful for finding frequent patterns like
    "county road" or "roman catholic church" in English, but also helps in
    CJK languages for finding words that are longer than a single ideogram.

    Example usage:
        python extract_phrases.py en -o en.tsv --min-count=100

        find . -type f -size -10M | xargs -n1 basename | xargs -n1 --max-procs=4 -I{} python extract_phrases.py {} -o {}.tsv --min-count=5
    '''

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='Input file')
    parser.add_argument('-o', '--output-file', required=True,
                        help='Output file')
    parser.add_argument('-p', '--phrase-len', default=5, type=int,
                        help='Maximum phrase length')
    parser.add_argument('-n', '--min-count', default=5, type=int,
                        help='Minimum count threshold')
    parser.add_argument('-m', '--max-rows', default=None, type=int,
                        help='Maximum number of rows to write')
    args = parser.parse_args()

    if args.phrase_len < 1:
        parser.error('--phrase-len must be >= 1')

    phrases = FrequentPhraseExtractor.from_file(args.filename,
                                                min_count=args.min_count,
                                                max_phrase_len=args.phrase_len)
    phrases.to_tsv(args.output_file, max_rows=args.max_rows)
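
As an illustration of the pruning idea this commit uses, here is a minimal,
self-contained sketch of level-wise frequent n-gram mining. It substitutes
plain whitespace tokenization for libpostal's tokenizer, and the
frequent_phrases function, toy corpus, and thresholds below are illustrative
assumptions, not part of the commit:

from collections import defaultdict

def frequent_phrases(lines, max_n=3, min_count=2):
    # Pass 1: count single words and keep only the frequent ones
    counts = defaultdict(int)
    for line in lines:
        for w in line.split():
            counts[(w,)] += 1
    vocab = dict((k, v) for k, v in counts.items() if v >= min_count)

    # Passes 2..max_n: count an n-gram only when its (n-1)-gram prefix
    # is already frequent (the Apriori pruning step)
    for n in range(2, max_n + 1):
        counts = defaultdict(int)
        for line in lines:
            words = line.split()
            for i in range(len(words) - n + 1):
                gram = tuple(words[i:i + n])
                if gram[:-1] in vocab:
                    counts[gram] += 1
        for k, v in counts.items():
            if v >= min_count:
                vocab[k] = v
    return vocab

corpus = ['county road 12', 'county road 9', 'old county road', 'main st']
for phrase, count in sorted(frequent_phrases(corpus).items()):
    print('{}\t{}'.format(' '.join(phrase), count))

On this toy corpus only "county" (3), "road" (3) and the bigram
"county road" (3) survive min_count=2: "road 12" and "road 9" are counted
but pruned, and the trigram "old county road" is never counted at all
because its prefix "old county" was already infrequent.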