Initial fork commit
This commit is contained in:
0
scripts/geodata/chains/__init__.py
Normal file
0
scripts/geodata/chains/__init__.py
Normal file
23
scripts/geodata/chains/chains.sh
Executable file
23
scripts/geodata/chains/chains.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env bash
# Build chains.tsv from the OSM venues extract, then cut it into pieces of at
# most 524200 bytes of whole lines, each piece prefixed with a header row.
#
# Usage: chains.sh [DATA_DIR]
#   DATA_DIR  directory containing planet-venues.osm; defaults to the current
#             directory. chains.tsv and the chains.N.tsv pieces are written
#             there as well.
#
# Requires bash (BASH_SOURCE, base-10 arithmetic expansion).

if [ "$#" -ge 1 ]; then
    DATA_DIR=$1
else
    DATA_DIR=$(pwd)
fi

# Deliberately NOT called PWD: bash rewrites PWD on every `cd`, so saving the
# starting directory under that name made the final `cd` a no-op.
ORIG_DIR=$(pwd)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# `return` covers the case where this file is sourced, `exit` the case where
# it is executed; either way we stop instead of splitting stale/missing data.
python "$SCRIPT_DIR/chains_tsv.py" "$DATA_DIR/planet-venues.osm" "$DATA_DIR/chains.tsv" \
    || { return 1 2>/dev/null || exit 1; }

cd "$DATA_DIR" || { return 1 2>/dev/null || exit 1; }
split -d -C524200 chains.tsv chains.split.

for filename in chains.split.*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$filename" ] || continue

    suffix="${filename##*.}"
    # Only handle the numeric suffixes produced by `split -d`.
    case "$suffix" in
        ''|*[!0-9]*) continue ;;
    esac
    # Force base 10 so "08"/"09" are not parsed as invalid octal, and so
    # "00" -> 0, "07" -> 7, "12" -> 12 (the old pattern broke from 10 on).
    extension=$((10#$suffix))
    name="${filename%%.*}"

    # Write header + piece straight to the target; no shared /tmp scratch file.
    { printf 'name_lower\tname\tcanonical\tknown_chain\tcount\n'; cat "$filename"; } > "$name.$extension.tsv"
    rm "$filename"
done

cd "$ORIG_DIR"
|
||||
78
scripts/geodata/chains/chains_tsv.py
Normal file
78
scripts/geodata/chains/chains_tsv.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
import six
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
from collections import Counter
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import ADDRESS_EXPANSIONS_DIR
|
||||
from geodata.osm.extract import *
|
||||
from geodata.encoding import safe_encode
|
||||
|
||||
|
||||
class VenueNames(object):
    '''
    Counts venue names from an OSM venues file and writes the frequent ones
    to a TSV, flagging names that appear in the chains.txt dictionaries.
    '''

    def __init__(self, venues_filename):
        '''
        Load known chain phrases from every chains.txt matched one level
        below ADDRESS_EXPANSIONS_DIR.

        Each non-blank line is split on "|"; every phrase is added to
        all_chains and the first phrase on the line is recorded as the
        canonical form of the remaining ones.

        :param venues_filename: path later passed to parse_osm by count()
        '''
        self.venues_filename = venues_filename
        # Every phrase seen in any chains.txt
        self.all_chains = set()
        # Non-canonical phrase -> first phrase on its line
        self.chain_canonical = {}

        for filename in glob.glob(os.path.join(ADDRESS_EXPANSIONS_DIR, '**', 'chains.txt')):
            # Context manager so each dictionary file is closed promptly
            with open(filename) as f:
                for line in f:
                    line = line.rstrip()
                    if not line:
                        # A blank line would otherwise register '' as a chain
                        continue
                    phrases = safe_decode(line).split(six.u('|'))
                    self.all_chains.update(phrases)
                    canonical = phrases[0]
                    for p in phrases[1:]:
                        self.chain_canonical[p] = canonical

        # Exact name -> count
        self.names = Counter()
        # Lowercased name -> count
        self.names_lower = Counter()
        # Lowercased name -> Counter of the exact spellings seen for it
        self.names_cap = defaultdict(Counter)

    def count(self):
        '''
        Tally the "name" value of every record yielded by parse_osm,
        skipping records without a name. Prints progress every 1000 named
        records.
        '''
        i = 0
        for node_id, value, deps in parse_osm(self.venues_filename):
            name = value.get('name')
            if not name:
                continue
            name_lower = name.lower()
            self.names[name] += 1
            self.names_lower[name_lower] += 1
            self.names_cap[name_lower][name] += 1

            if i % 1000 == 0 and i > 0:
                # Single-argument call form prints the same text on Python 2
                # and 3 (the bare print statement is Python 2 only).
                print('did {}'.format(i))
            i += 1

    def write_to_tsv(self, out_filename, min_threshold=5):
        '''
        Write one tab-separated row per lowercased name, most frequent first,
        stopping at the first name seen fewer than min_threshold times.

        Columns: lowercased name, its most common spelling, canonical chain
        name (empty if none), 1 if the lowercased name is a known chain
        phrase (else empty), count.

        :param out_filename: path of the TSV to create/overwrite
        :param min_threshold: minimum count for a name to be written
        '''
        # Context manager guarantees the output is flushed and closed
        with open(out_filename, 'w') as out:
            writer = csv.writer(out, delimiter='\t')
            for k, v in self.names_lower.most_common():
                if v < min_threshold:
                    # most_common() is sorted descending, nothing left to write
                    break

                canonical = self.chain_canonical.get(k)
                if canonical:
                    # .get() avoids inserting an empty Counter into the
                    # defaultdict; if the canonical phrase was never seen as
                    # a venue name, keep the dictionary spelling instead of
                    # failing with IndexError on an empty most_common().
                    canonical_spellings = self.names_cap.get(canonical)
                    if canonical_spellings:
                        canonical = canonical_spellings.most_common(1)[0][0]
                else:
                    canonical = ''

                most_common_cap = self.names_cap[k].most_common(1)[0][0]
                writer.writerow((safe_encode(k),
                                 safe_encode(most_common_cap),
                                 safe_encode(canonical),
                                 safe_encode(1) if k in self.all_chains else '',
                                 safe_encode(v)))
|
||||
|
||||
if __name__ == '__main__':
    # Both the input OSM file and the output TSV path are required.
    if not len(sys.argv) >= 3:
        print('Usage: python chains_tsv.py infile outfile')
        sys.exit(1)
    input_file, output_file = sys.argv[1:3]

    # Load chain dictionaries, tally the venue names, then dump the TSV.
    names = VenueNames(input_file)
    names.count()
    names.write_to_tsv(output_file)
|
||||
100
scripts/geodata/chains/query.py
Normal file
100
scripts/geodata/chains/query.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.address_expansions.gazetteers import chains_gazetteer
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.categories.preposition import CategoryPreposition
|
||||
from geodata.math.sampling import weighted_choice, cdf
|
||||
from geodata.text.normalize import normalized_tokens
|
||||
from geodata.text.tokenize import tokenize, token_types
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
# Result of Chain.phrase: the chain name, an optional preposition phrase, and
# flags saying whether a place name / an address should be appended.
ChainQuery = namedtuple('ChainQuery', ['name', 'prep', 'add_place_name', 'add_address'])

# Returned by Chain.phrase when it is given an empty chain.
NULL_CHAIN_QUERY = ChainQuery(name=None, prep=None, add_place_name=False, add_address=False)
|
||||
|
||||
|
||||
class Chain(object):
    '''
    Class-level helpers for spotting known chain names in venue names and for
    building ChainQuery tuples from a chain name.
    '''

    @classmethod
    def tokenize_name(cls, name):
        '''
        Return normalized_tokens(name), or an empty list for a falsy name.
        '''
        return normalized_tokens(name) if name else []

    @classmethod
    def possible_chain(cls, name):
        '''
        Check whether a venue name contains a known chain phrase.

        Returns a 3-tuple:

        (True/False, chain phrase matches, remaining tokens)

        The remaining tokens are reported only when at least one phrase
        matched, so a name such as "Hard Rock Cafe Times Square" can still be
        judged downstream (e.g. treated as a chain if the extra tokens have a
        low IDF in the local area).
        '''
        name_tokens = cls.tokenize_name(name)
        if not name_tokens:
            return False, [], []

        phrases, other_tokens = [], []
        for match in chains_gazetteer.filter(name_tokens):
            token, token_class, length, data = match
            if token_class == token_types.PHRASE:
                phrases.append((token, token_class, length, data))
            else:
                other_tokens.append((token, token_class))

        if len(phrases) > 0:
            return True, phrases, other_tokens
        return False, phrases, []

    @classmethod
    def extract(cls, name):
        '''
        Check whether the whole venue name is made of known chain phrases.

        To limit false positives, the answer is True only when none of the
        tokens outside the matched phrases is a word-type token. That misses
        names like "Hard Rock Cafe Times Square" but still accepts compound
        chains such as Subway/Taco Bell.

        Returns (is_chain, phrases); phrases is empty when is_chain is False.
        '''
        found, chain_phrases, leftovers = cls.possible_chain(name)
        if not found:
            return found, []

        # Any leftover word-type token means the name is more than a chain.
        for _, token_class in leftovers:
            if token_class in token_types.WORD_TOKEN_TYPES:
                return False, []
        return True, chain_phrases

    @classmethod
    def alternate_form(cls, language, dictionary, canonical):
        '''
        Pick a random sample phrase configured for canonical under the
        (language, dictionary) key, or return canonical unchanged when none
        are configured.
        '''
        phrase_map = address_config.sample_phrases.get((language, dictionary), {})
        alternatives = phrase_map.get(canonical)
        return random.choice(alternatives) if alternatives else canonical

    @classmethod
    def phrase(cls, chain, language, country=None):
        '''
        Build a ChainQuery for a chain name.

        Returns NULL_CHAIN_QUERY for an empty chain. Otherwise a preposition
        type is drawn from CategoryPreposition.random; when no type or no
        configured phrase is available, the query has no preposition and both
        add_* flags are True.
        '''
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        # Fallback shared by both "no preposition" exits below.
        bare_query = ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

        prep_type = CategoryPreposition.random(language, country=country)
        if prep_type in (None, CategoryPreposition.NULL):
            return bare_query

        config_key = 'categories.{}'.format(prep_type)
        values, probs = address_config.alternative_probabilities(config_key, language, country=country)
        if not values:
            return bare_query

        chosen_phrase, _props = weighted_choice(values, probs)
        prep_phrase = safe_decode(chosen_phrase)

        # NEARBY / NEAR_ME suppress both the place name and the address;
        # IN suppresses only the address.
        nearby_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        no_address_types = nearby_types + (CategoryPreposition.IN,)
        add_place_name = prep_type not in nearby_types
        add_address = prep_type not in no_address_types

        return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
|
||||
Reference in New Issue
Block a user