Initial fork commit

This commit is contained in:
2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions

View File

View File

@@ -0,0 +1,128 @@
import re
import six
import unittest
from geodata.addresses.entrances import *
from geodata.addresses.floors import *
from geodata.intersections.query import *
from geodata.addresses.po_boxes import *
from geodata.addresses.postcodes import *
from geodata.addresses.staircases import *
from geodata.addresses.units import *
from geodata.categories.query import *
from geodata.math.floats import isclose
# Matches the standalone words "None", "False" or "True" anywhere in a string.
# TestAddressConfigs.valid_phrase rejects any phrase in which this matches.
invalid_phrase_re = re.compile(
    r'\b(None|False|True)\b'
)
class TestAddressConfigs(unittest.TestCase):
    """Sanity checks run against every language/country address config.

    Each ``check_*`` method repeatedly asks one component class for a phrase
    and asserts that the result passes ``valid_phrase``, i.e. that it does not
    contain a standalone ``None``, ``False`` or ``True`` word.
    """

    def valid_phrase(self, phrase):
        """Return True if ``phrase`` is None or contains no None/False/True word."""
        return phrase is None or not invalid_phrase_re.search(phrase)

    def _check_phrases(self, make_phrase, iterations, stop_on_none=False):
        """Call ``make_phrase`` ``iterations`` times, asserting each result is valid.

        :param make_phrase: zero-argument callable returning a phrase (or None)
        :param iterations: number of phrases to generate
        :param stop_on_none: when true, stop at the first None phrase instead
            of continuing to generate
        """
        # six.moves.xrange is xrange on Python 2 and range on Python 3, so the
        # loop does not depend on a builtin that only exists on Python 2.
        for _ in six.moves.xrange(iterations):
            phrase = make_phrase()
            if stop_on_none and phrase is None:
                break
            self.assertTrue(self.valid_phrase(phrase), six.u('phrase was: {}').format(phrase))

    def check_components(self, language, country):
        """Assert that each component's ``*probability`` values sum to 1.0.

        The ``combinations`` entry of the components config is skipped.
        """
        conf = address_config.get_property('components', language, country=country)
        for component, value in six.iteritems(conf):
            if component == 'combinations':
                continue

            total_prob = sum((v for k, v in six.iteritems(value) if k.endswith('probability')), 0.0)

            message = six.u('language: {}, country: {}, component: {}').format(language, country, component)
            self.assertTrue(isclose(total_prob, 1.0), message)

    def check_entrance_phrases(self, language, country=None):
        """Generate 1000 entrance phrases and assert each one is valid."""
        self._check_phrases(
            lambda: Entrance.phrase(Entrance.random(language, country=country), language, country=country),
            1000,
        )

    def check_staircase_phrases(self, language, country=None):
        """Generate 1000 staircase phrases and assert each one is valid."""
        # Uses Staircase (from geodata.addresses.staircases); this check
        # previously re-ran the Entrance check and never touched staircases.
        self._check_phrases(
            lambda: Staircase.phrase(Staircase.random(language, country=country), language, country=country),
            1000,
        )

    def check_floor_phrases(self, language, country=None):
        """Assert floor phrases are valid for generated, missing and bounded floors."""
        self._check_phrases(
            lambda: Floor.phrase(Floor.random(language, country=country), language, country=country),
            10000,
        )
        # No floor value at all.
        self._check_phrases(
            lambda: Floor.phrase(None, language, country=country),
            1000,
        )
        # No floor value, but with num_floors given.
        self._check_phrases(
            lambda: Floor.phrase(None, language, country=country, num_floors=3),
            1000,
        )

    def check_unit_phrases(self, language, country=None):
        """Assert unit phrases are valid across the argument variants used below."""
        self._check_phrases(
            lambda: Unit.phrase(Unit.random(language, country=country), language, country=country),
            10000,
        )
        # No unit value at all.
        self._check_phrases(
            lambda: Unit.phrase(None, language, country=country),
            1000,
        )
        # Unit generated with explicit floor/basement counts.
        self._check_phrases(
            lambda: Unit.phrase(Unit.random(language, country=country, num_floors=3, num_basements=1),
                                language, country=country),
            1000,
        )
        for zone in ('commercial', 'industrial', 'university'):
            # zone is bound as a default so the lambda does not rely on the
            # loop variable's later value.
            self._check_phrases(
                lambda zone=zone: Unit.phrase(Unit.random(language, country=country),
                                              language, country=country, zone=zone),
                1000,
            )

    def check_po_boxes(self, language, country=None):
        """Assert PO box phrases are valid, stopping at the first None phrase."""
        self._check_phrases(
            lambda: POBox.phrase(POBox.random(language, country=country), language, country=country),
            1000,
            stop_on_none=True,
        )

    def check_postcodes(self, language, country=None):
        """Assert phrases for postcode '12345' are valid, stopping at the first None."""
        self._check_phrases(
            lambda: PostCode.phrase('12345', language, country=country),
            1000,
            stop_on_none=True,
        )

    def check_intersection_phrases(self, language, country=None):
        """Assert intersection phrases are valid, stopping at the first None phrase."""
        self._check_phrases(
            lambda: Intersection.phrase(language, country=country),
            1000,
            stop_on_none=True,
        )

    def check_category_phrases(self, language, country=None):
        """Call Category.phrase for amenity=restaurant up to 1000 times.

        NOTE(review): nothing is asserted here; the loop only verifies that
        the call does not raise, and stops once ``phrase.category`` is None.
        """
        for _ in six.moves.xrange(1000):
            phrase = Category.phrase(language, 'amenity', 'restaurant', country=country)
            if phrase.category is None:
                break

    def check_config(self, language, country=None):
        """Run every check for one language (and optional country), logging progress."""
        print('Doing lang={}, country={}'.format(language, country))

        print('Checking components')
        self.check_components(language, country=country)

        print('Checking entrances')
        self.check_entrance_phrases(language, country=country)

        print('Checking staircases')
        self.check_staircase_phrases(language, country=country)

        print('Checking floors')
        self.check_floor_phrases(language, country=country)

        print('Checking units')
        self.check_unit_phrases(language, country=country)

        print('Checking intersections')
        self.check_intersection_phrases(language, country=country)

        print('Checking categories')
        self.check_category_phrases(language, country=country)

        print('Checking PO boxes')
        self.check_po_boxes(language, country=country)

        print('Checking postcodes')
        self.check_postcodes(language, country=country)

    def test_configs(self):
        """Check each language config, then each country listed under it."""
        for lang, value in six.iteritems(address_config.address_configs):
            self.check_config(lang)
            for country in value.get('countries', []):
                self.check_config(lang, country)
if __name__ == '__main__':
    # Executed directly as a script: hand control to the unittest runner.
    unittest.main()

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import sys
import unittest
# Absolute, symlink-resolved directory containing this test file.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Add the directory two levels above this file to sys.path. The path is
# anchored on this_dir rather than the current working directory so the result
# does not depend on where the tests are launched from.
# NOTE(review): presumably this is what lets the geodata imports resolve;
# confirm that two levels up is the intended root.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.i18n.languages import init_languages, get_country_languages, get_regional_languages
from geodata.language_id.disambiguation import disambiguate_language, street_types_gazetteer, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE
# Fixtures for TestNormalization.test_countries.
# Each entry is (input string, country code, expected language result).
country_test_cases = [
    ('Division Street', 'us', 'en'),
    ('Kalfarveien', 'no', 'nb'),
    ('Upper Glenburn Road', 'gb', 'en'),
    ('Zafer Caddesi', 'cy', 'tr'),

    # United States: Spanish and French street names occur alongside English
    ('Avenue P', 'us', 'en'),
    ('Avenue du Champs', 'us', 'fr'),
    ('Avenida de la Plata', 'us', 'es'),
    ('Pl', 'us', UNKNOWN_LANGUAGE),
    ('No 2 School House', 'us', UNKNOWN_LANGUAGE),
    ('E Thetford Rd', 'us', 'en'),
    ('El Camino', 'us', 'es'),
    ('The El Camino', 'us', 'en'),
    ('Via Antiqua Street', 'us', 'en'),
    ('Salt Evaporator Plan Road', 'us', 'en'),
    ('Calle Las Brisas North', 'us', 'en'),
    ('Chateau Estates', 'us', 'en'),
    ('Grand Boulevard', 'us', 'en'),
    ('Rue Louis Phillippe', 'us', 'fr'),
    ('Calle Street', 'us', 'en'),
    ('Del Rio Avenue', 'us', 'en'),
    ('South Signal Butte Road', 'us', 'en'),
    ('Chief All Over', 'us', UNKNOWN_LANGUAGE),
    ('South Alameda Street', 'us', 'en'),
    ('The Alameda', 'us', 'en'),
    ('Rincon Road', 'us', 'en'),

    # "Avenue" followed by a stopword
    ('Avenue du Bourget-du-Lac', 'je', 'fr'),

    # United Arab Emirates: English is not the default language; abbreviated street type
    ('128 A St', 'ae', 'en'),
    ('128 A St.', 'ae', 'en'),

    # Street name given in both English and Arabic
    ('Omar Street شارع عمر', 'iq', AMBIGUOUS_LANGUAGE),

    # Name that also includes an unrelated script
    ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),

    # Brussels-style address
    ('Avenue Paul Héger - Paul Hégerlaan', 'be', AMBIGUOUS_LANGUAGE),
    ('Smaragdstraat', 'be', 'nl'),

    # India
    ('Kidwai nagar', 'in', 'hi'),
    ('Mavoor Rd.', 'in', 'en'),

    # Sri Lanka
    ('Sri Sadathissa Mawatha', 'lk', 'si'),

    # Russian
    ('Фрунзе улица', 'kg', 'ru'),
]
# Fixtures for TestNormalization.test_regional.
# Each entry is (input string, country code, region key, region value,
# expected language result); the key/value pair is passed to
# get_regional_languages.
regional_test_cases = [
    # Spain
    ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
    ('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
    ('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
    ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
    ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
    ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
    ('Txurruka', 'es', 'qs_a1r', 'País Vasco/Euskadi', UNKNOWN_LANGUAGE),

    # Belgium
    ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
    ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),

    # France (Occitan)
    ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
]
class TestNormalization(unittest.TestCase):
    """Runs disambiguate_language over the country and regional fixture tables."""

    def test_countries(self):
        """Every country-level fixture must resolve to its expected language."""
        for street, country_code, expected in country_test_cases:
            languages = get_country_languages(country_code)
            # The country must have at least one known language.
            self.assertTrue(bool(languages))

            lang = disambiguate_language(street, languages.items())
            failure = '{} != {} for {}, langs={}'.format(lang, expected, street, languages.items())
            self.assertEqual(lang, expected, failure)

    def test_regional(self):
        """Every regional fixture must resolve to its expected language."""
        for street, country_code, region_key, region_value, expected in regional_test_cases:
            languages = get_country_languages(country_code)
            self.assertTrue(bool(languages))

            regional = get_regional_languages(country_code, region_key, region_value)
            self.assertTrue(bool(regional))
            # Merge the country-wide languages into the regional mapping
            # (country entries win on duplicate keys).
            regional.update(languages)

            lang = disambiguate_language(street, regional.items())
            failure = '{} != {} for {}, langs={}'.format(lang, expected, street, regional.items())
            self.assertEqual(lang, expected, failure)
if __name__ == '__main__':
    # Executed directly as a script: hand control to the unittest runner.
    unittest.main()