[intersections] intersections training data, using a Cartesian product of all names in the same language, including something like tiger:name_base

This commit is contained in:
Al
2016-08-18 01:19:02 -04:00
parent 701bcb1d79
commit b41ba7374b
2 changed files with 58 additions and 43 deletions

View File

@@ -1054,63 +1054,81 @@ class OSMAddressFormatter(object):
replace_with_base_name_prob = float(nested_get(self.config, ('intersections', 'replace_with_base_name_probability'), default=0.0))
for node_id, latitude, longitude, ways in OSMIntersectionReader.read_intersections(infile):
distinct_ways = set()
valid_ways = []
for way in ways:
if way['id'] not in distinct_ways:
valid_ways.append(way)
distinct_ways.add(way['id'])
ways = valid_ways
if not ways or len(ways) < 2:
continue
tags = ways[0]
try:
latitude, longitude = latlon_to_decimal(latitude, longitude)
except Exception:
continue
name_tag = 'name'
for t in all_base_name_tags:
if any((t in way for way in ways)):
base_name_tag = t
break
way_names = []
languages = set([None])
for way in ways:
names = defaultdict(list)
for tag in way:
base_tag = tag.rsplit(':', 1)[0]
if base_tag not in all_name_tags:
continue
lang = tag.rsplit(':')[-1] if ':' in tag else None
languages.add(lang)
names[lang].append(way[tag])
if base_name_tag in way:
names[None].append(way[base_name_tag])
if not names:
continue
way_names.append(names)
if not way_names or len(way_names) < 2:
continue
language_components = {}
base_name_tags = [t for t in all_base_name_tags if t in tags]
if not base_name_tags:
base_name_tag = None
else:
base_name_tag = base_name_tags[0]
country = None
for tag in tags:
if tag.rsplit(':', 1)[0] in all_name_tags:
way_names = [(w[tag], w.get(base_name_tag) if base_name_tag else None) for w in ways if tag in w]
if len(way_names) < 2:
continue
namespaced_language = None
if ':' in tag:
namespaced_language = tag.rsplit(':')[-1]
for namespaced_language in languages:
address_components, country, language = self.components.expanded({}, latitude, longitude, language=namespaced_language)
language_components[namespaced_language] = address_components
if namespaced_language not in language_components:
address_components, country, language = self.components.expanded({}, latitude, longitude, language=namespaced_language)
language_components[namespaced_language] = (address_components, country, language)
else:
address_components, country, language = language_components[namespaced_language]
if country is None:
continue
intersection_phrase = Intersection.phrase(language, country=country)
if not intersection_phrase:
continue
for way1, way2 in itertools.combinations(way_names):
formatted_intersections = []
for language in set(way1.keys()) & set(way2.keys()):
for w1, w2 in itertools.product(way1[language], way2[language]):
intersection_phrase = Intersection.phrase(language, country=country)
if not intersection_phrase:
continue
formatted_intersections = []
address_components = language_components[language]
for (w1, w1_base), (w2, w2_base) in itertools.combinations(way_names, 2):
w1 = self.components.cleaned_name(w1)
w2 = self.components.cleaned_name(w2)
intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
formatted_intersections.append(formatted)
if w1_base and not namespaced_language and random.random() < replace_with_base_name_prob:
w1 = self.components.cleand_name(w1_base)
intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
formatted_intersections.append(formatted)
if w2_base and not namespaced_language and random.random() < replace_with_base_name_prob:
w2 = self.components.cleaned_name(w2_base)
intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
formatted_intersections.append(formatted)
for formatted in formatted_intersections:
if not formatted or not formatted.strip():
continue