Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions
--- a/scripts/geodata/chains/init.py
+++ b/scripts/geodata/chains/init.py
--- a/scripts/geodata/chains/chains.sh
+++ b/scripts/geodata/chains/chains.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+#
+# Build chains.tsv from the OSM venues extract, then split it into
+# header-prefixed chunks named chains.<N>.tsv.
+#
+# Usage: chains.sh [DATA_DIR]
+#   DATA_DIR  directory holding planet-venues.osm; outputs are written
+#             there too. Defaults to the current directory.
+
+if [ "$#" -ge 1 ]; then
+    DATA_DIR=$1
+else
+    DATA_DIR=$(pwd)
+fi
+
+# Deliberately not called PWD: bash rewrites $PWD on every cd, so saving the
+# starting directory under that name would make the final cd a no-op.
+ORIG_DIR=$(pwd)
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+# Stop here if the export fails instead of splitting a stale or missing
+# chains.tsv. "return" covers the sourced case, "exit" the executed one.
+if ! python "$SCRIPT_DIR/chains_tsv.py" "$DATA_DIR/planet-venues.osm" "$DATA_DIR/chains.tsv"; then
+    echo "chains_tsv.py failed" >&2
+    return 1 2>/dev/null || exit 1
+fi
+
+cd "$DATA_DIR"
+split -d -C524200 chains.tsv chains.split.
+
+for filename in chains.split.*; do
+    # With no matches the glob stays literal; skip it.
+    [ -e "$filename" ] || continue
+    suffix="${filename##*.}"
+    name="${filename%%.*}"
+    # Force base 10 so "08"/"09" are not parsed as octal; this also drops the
+    # leading zero (00 -> 0) and keeps two-digit suffixes such as 10 intact.
+    index=$((10#$suffix))
+    # Stage next to the destination rather than in a shared /tmp/out.
+    tmpfile="$name.$index.tsv.tmp"
+    printf 'name_lower\tname\tcanonical\tknown_chain\tcount\n' | cat - "$filename" > "$tmpfile"
+    mv "$tmpfile" "$name.$index.tsv"
+    rm "$filename"
+done
+
+cd "$ORIG_DIR"
--- a/scripts/geodata/chains/chains_tsv.py
+++ b/scripts/geodata/chains/chains_tsv.py
@@ -0,0 +1,129 @@
+"""Count venue names in an OSM venues file and export the frequent ones as TSV.
+
+Usage: python chains_tsv.py infile outfile
+"""
+import csv
+import os
+import glob
+import six
+import sys
+
+from collections import defaultdict
+from collections import Counter
+
+# Put the directory two levels above this file on sys.path so that the
+# geodata package imports below resolve.
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.address_expansions.address_dictionaries import ADDRESS_EXPANSIONS_DIR
+from geodata.osm.extract import *
+from geodata.encoding import safe_encode
+# Named explicitly so safe_decode does not hinge on the star import above.
+from geodata.encoding import safe_decode
+
+
+class VenueNames(object):
+    """Tallies venue names and relates them to the chains.txt dictionaries.
+
+    Attributes:
+        all_chains: set of every phrase found in the chains.txt files.
+        chain_canonical: maps each non-first phrase on a dictionary line to
+            the first phrase on that line.
+        names: Counter of venue names exactly as written.
+        names_lower: Counter of lowercased venue names.
+        names_cap: lowercased name -> Counter of its original spellings.
+    """
+
+    def __init__(self, venues_filename):
+        """Load the chain dictionaries; count() reads venues_filename later.
+
+        Each chains.txt line is split on '|' into phrases.
+        """
+        self.venues_filename = venues_filename
+        self.all_chains = set()
+        self.chain_canonical = {}
+
+        pattern = os.path.join(ADDRESS_EXPANSIONS_DIR, '**', 'chains.txt')
+        for filename in glob.glob(pattern):
+            # Context manager so each dictionary file is closed promptly.
+            with open(filename) as f:
+                for line in f:
+                    line = line.rstrip()
+                    if not line:
+                        # A blank line would register '' as a chain phrase.
+                        continue
+                    phrases = safe_decode(line).split(six.u('|'))
+                    self.all_chains.update(phrases)
+                    canonical = phrases[0]
+                    for p in phrases[1:]:
+                        self.chain_canonical[p] = canonical
+
+        self.names = Counter()
+        self.names_lower = Counter()
+        self.names_cap = defaultdict(Counter)
+
+    def count(self):
+        """Tally the 'name' value of every record that parse_osm yields.
+
+        Records without a name are skipped. Progress is printed every 1000
+        named records.
+        """
+        i = 0
+        for node_id, value, deps in parse_osm(self.venues_filename):
+            name = value.get('name')
+            if not name:
+                continue
+            name_lower = name.lower()
+            self.names[name] += 1
+            self.names_lower[name_lower] += 1
+            self.names_cap[name_lower][name] += 1
+
+            if i % 1000 == 0 and i > 0:
+                # One pre-formatted argument prints identically under the
+                # print statement and the print function.
+                print('did {}'.format(i))
+            i += 1
+
+    def write_to_tsv(self, out_filename, min_threshold=5):
+        """Write one row per lowercased name seen at least min_threshold times.
+
+        Rows are in descending count order with the columns: lowercased name,
+        most common original spelling, canonical chain name (or ''), a
+        known-chain flag (safe_encode(1) or ''), and the count.
+        """
+        with open(out_filename, 'w') as out:
+            writer = csv.writer(out, delimiter='\t')
+            for k, v in self.names_lower.most_common():
+                if v < min_threshold:
+                    # most_common() is sorted by descending count, so no
+                    # later entry can reach the threshold.
+                    break
+                canonical = self.chain_canonical.get(k)
+                if canonical:
+                    # The canonical phrase may never occur as a venue name;
+                    # keep the dictionary form then instead of indexing into
+                    # an empty most_common() list.
+                    canonical_caps = self.names_cap.get(canonical)
+                    if canonical_caps:
+                        canonical = canonical_caps.most_common(1)[0][0]
+                else:
+                    canonical = ''
+                most_common_cap = self.names_cap[k].most_common(1)[0][0]
+                writer.writerow((safe_encode(k),
+                                 safe_encode(most_common_cap),
+                                 safe_encode(canonical),
+                                 safe_encode(1) if k in self.all_chains else '',
+                                 safe_encode(v)))
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        print('Usage: python chains_tsv.py infile outfile')
+        sys.exit(1)
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    names = VenueNames(input_file)
+    names.count()
+    names.write_to_tsv(output_file)
--- a/scripts/geodata/chains/query.py
+++ b/scripts/geodata/chains/query.py
@@ -0,0 +1,131 @@
+import random
+import six
+
+from collections import namedtuple
+
+from geodata.addresses.config import address_config
+from geodata.address_expansions.gazetteers import chains_gazetteer
+from geodata.categories.config import category_config
+from geodata.categories.preposition import CategoryPreposition
+from geodata.math.sampling import weighted_choice, cdf
+from geodata.text.normalize import normalized_tokens
+from geodata.text.tokenize import tokenize, token_types
+from geodata.encoding import safe_decode
+
+# Value type returned by Chain.phrase.
+ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')
+
+# What Chain.phrase returns when it is given no chain.
+NULL_CHAIN_QUERY = ChainQuery(name=None, prep=None, add_place_name=False, add_address=False)
+
+
+class Chain(object):
+    '''
+    Class-level helpers for spotting known chain names in venue names and for
+    turning a chain name into a ChainQuery.
+    '''
+
+    @classmethod
+    def tokenize_name(cls, name):
+        '''
+        Returns normalized_tokens(name), or an empty list for a falsy name.
+        '''
+        return normalized_tokens(name) if name else []
+
+    @classmethod
+    def possible_chain(cls, name):
+        '''
+        Checks whether a venue name contains a known chain phrase.
+
+        Returns a 3-tuple:
+
+        (found, chain phrase matches, remaining tokens)
+
+        The remaining tokens are reported only when a phrase was found, so a
+        name such as "Hard Rock Cafe Times Square" still yields its chain
+        phrase and the caller can decide what to do with the leftover tokens.
+        '''
+        tokens = cls.tokenize_name(name)
+        if not tokens:
+            return False, [], []
+
+        chain_phrases = []
+        leftovers = []
+        for token, token_class, length, data in chains_gazetteer.filter(tokens):
+            if token_class == token_types.PHRASE:
+                chain_phrases.append((token, token_class, length, data))
+            else:
+                leftovers.append((token, token_class))
+
+        if not chain_phrases:
+            return False, chain_phrases, []
+        return True, chain_phrases, leftovers
+
+    @classmethod
+    def extract(cls, name):
+        '''
+        Checks whether a whole venue name is made up of known chain phrases.
+
+        Returns (is_chain, phrases). A name counts as a chain only when
+        possible_chain finds a phrase and none of the leftover tokens is a
+        word token, which keeps false positives down at the cost of missing
+        names like "Hard Rock Cafe Times Square". Compound names such as
+        Subway/Taco Bell are still accepted.
+        '''
+        possible, phrases, other_tokens = cls.possible_chain(name)
+        if not possible:
+            return possible, []
+
+        for _, token_class in other_tokens:
+            if token_class in token_types.WORD_TOKEN_TYPES:
+                return False, []
+        return True, phrases
+
+    @classmethod
+    def alternate_form(cls, language, dictionary, canonical):
+        '''
+        Picks a random alternative for a canonical phrase from
+        address_config.sample_phrases, keyed by (language, dictionary).
+        Returns canonical unchanged when no alternatives are listed.
+        '''
+        by_canonical = address_config.sample_phrases.get((language, dictionary), {})
+        alternatives = by_canonical.get(canonical)
+        return random.choice(alternatives) if alternatives else canonical
+
+    @classmethod
+    def phrase(cls, chain, language, country=None):
+        '''
+        Builds a ChainQuery for a chain name.
+
+        Returns NULL_CHAIN_QUERY for a falsy chain. Otherwise a preposition
+        type is drawn with CategoryPreposition.random; if there is none, or
+        no phrases are configured for it, the query has no preposition and
+        both add_* flags are True. With a preposition, add_place_name is
+        False for NEARBY/NEAR_ME and add_address is False for
+        NEARBY/NEAR_ME/IN.
+        '''
+        if not chain:
+            return NULL_CHAIN_QUERY
+
+        chain_phrase = safe_decode(chain)
+        prep_type = CategoryPreposition.random(language, country=country)
+
+        if prep_type not in (None, CategoryPreposition.NULL):
+            config_key = 'categories.{}'.format(prep_type)
+            values, probs = address_config.alternative_probabilities(config_key, language, country=country)
+        else:
+            values = None
+
+        if not values:
+            return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
+
+        chosen, _ = weighted_choice(values, probs)
+        prep_phrase = safe_decode(chosen)
+
+        nearby_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
+        no_address_types = nearby_types + (CategoryPreposition.IN,)
+
+        return ChainQuery(chain_phrase,
+                          prep=prep_phrase,
+                          add_place_name=prep_type not in nearby_types,
+                          add_address=prep_type not in no_address_types)