Files
libpostal/scripts/geodata/points/index.py

160 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import geohash
import os
import math
import operator
import six
import ujson as json
from collections import defaultdict, OrderedDict
from leveldb import LevelDB
EARTH_RADIUS_KM = 6373 # in km
class PointIndex(object):
include_only_properties = None
persistent_index = False
cache_size = 0
INDEX_FILENAME = None
POINTS_DB_DIR = 'points'
DEFAULT_GEOHASH_PRECISION = 7
DEFAULT_PROPS_FILENAME = 'properties.json'
INDEX_FILENAME = 'index.json'
def __init__(self, index=None,
points=None,
points_db=None, save_dir=None,
points_db_path=None,
index_path=None,
include_only_properties=None,
precision=DEFAULT_GEOHASH_PRECISION):
if save_dir:
self.save_dir = save_dir
else:
self.save_dir = None
if include_only_properties and hasattr(include_only_properties, '__contains__'):
self.include_only_properties = include_only_properties
if not index_path:
index_path = os.path.join(save_dir or '.', self.INDEX_FILENAME)
if not index:
self.index = defaultdict(list)
else:
self.index = index
if not points_db_path:
points_db_path = os.path.join(save_dir or '.', self.POINTS_DB_DIR)
if not points_db:
self.points_db = LevelDB(points_db_path)
else:
self.points_db = points_db
self.precision = precision
self.i = 0
def create_index(self, overwrite=False):
self.index = defaultdict(list)
def index_point(self, lat, lon):
code = geohash.encode(lat, lon)[:self.precision]
for key in [code] + geohash.neighbors(code):
self.index[key].append((self.i, lat, lon))
def add_point(self, lat, lon, properties, cache=False, include_only_properties=None):
if include_only_properties is not None:
properties = {k: v for k, v in properties.iteritems() if k in include_only_properties}
self.index_point(lat, lon)
self.points_db.Put(self.properties_key(self.i), json.dumps(properties))
self.i += 1
def load_properties(self, filename):
properties = json.load(open(filename))
self.i = int(properties.get('num_polygons', self.i))
self.precision = int(properties.get('precision', self.precision))
def save_properties(self, out_filename):
out = open(out_filename, 'w')
json.dump({'num_polygons': str(self.i),
'precision': self.precision}, out)
def save_index(self):
if not self.index_path:
self.index_path = os.path.join(self.save_dir or '.', self.INDEX_FILENAME)
json.dump(self.index, open(self.index_path, 'w'))
@classmethod
def load_index(cls, d, index_name=None):
return json.load(open(os.path.join(d, index_name or cls.INDEX_FILENAME)))
def properties_key(self, i):
return 'props:{}'.format(i)
def save(self):
self.save_index()
self.save_properties(os.path.join(self.save_dir, self.DEFAULT_PROPS_FILENAME))
def haversine_distance(self, lat1, lon1, lat2, lon2, radius=EARTH_RADIUS_KM):
"""Calculate the Haversine distance between two lat/lon pairs, given by:
a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
c = 2 ⋅ atan2( √a, √(1a) )
d = R ⋅ c
where R is the radius of the Earth (in kilometers). By default we use 6373 km,
a radius optimized for calculating distances at approximately 39 degrees from
the equator i.e. Washington, DC
:param lat1: first latitude
:param lon1: first longitude (use negative range for longitudes West of the Prime Meridian)
:param lat2: second latitude
:param lon2: second longitude (use negative range for longitudes West of the Prime Meridian)
:param radius: radius of the Earth in (miles|kilometers) depending on the desired units
"""
lat1 = math.radians(lat1)
lat2 = math.radians(lat2)
lon1 = math.radians(lon1)
lon2 = math.radians(lon2)
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (math.sin(dlat / 2.0)) ** 2 + math.cos(lat1) * math.cos(lat2) * (math.sin(dlon/2.0)) ** 2
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
d = radius * c
return d
def get_candidate_points(self, latitude, longitude):
code = geohash.encode(latitude, longitude)[:self.precision]
candidates = OrderedDict()
candidates.update([(k, None) for k in self.index.get(code, [])])
for neighbor in geohash.neighbors(code):
candidates.update([(k, None) for k in self.index.get(neighbor, [])])
return candidates.keys()
def point_distances(self, latitude, longitude):
candidates = self.get_candidate_points(latitude, longitude)
return [(i, lat, lon, self.haversine_distance(latitude, longitude, lat, lon)) for i, lat, lon in candidates]
def nearest_n_points(self, latitude, longitude, n=2):
distances = self.point_distances(latitude, longitude)
if not distances:
return None
return sorted(distances, key=operator.itemgetter(-1))[:n]
def nearest_point(self, latitude, longitude):
distances = self.nearest_n_points(latitude, longitude, n=1)
if not distances:
return None
return distances[0]