diff --git a/resources/parser/data_sets/osm.yaml b/resources/parser/data_sets/osm.yaml index 92f5fb83..747403d3 100644 --- a/resources/parser/data_sets/osm.yaml +++ b/resources/parser/data_sets/osm.yaml @@ -9,9 +9,6 @@ chains: sample_probability: 0.5 place_only_probability: 0.9 -intersections: - replace_with_base_name_probability: 0.5 - places: drop_postcode_probability: 0.6 cldr_country_probability: 0.5 diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index fbb0db35..090837ef 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -1054,63 +1054,81 @@ class OSMAddressFormatter(object): replace_with_base_name_prob = float(nested_get(self.config, ('intersections', 'replace_with_base_name_probability'), default=0.0)) for node_id, latitude, longitude, ways in OSMIntersectionReader.read_intersections(infile): + distinct_ways = set() + valid_ways = [] + for way in ways: + if way['id'] not in distinct_ways: + valid_ways.append(way) + distinct_ways.add(way['id']) + ways = valid_ways + if not ways or len(ways) < 2: continue - tags = ways[0] + try: + latitude, longitude = latlon_to_decimal(latitude, longitude) + except Exception: + continue + + name_tag = 'name' + + for t in all_base_name_tags: + if any((t in way for way in ways)): + base_name_tag = t + break + + way_names = [] + languages = set([None]) + + for way in ways: + names = defaultdict(list) + for tag in way: + base_tag = tag.rsplit(':', 1)[0] + if base_tag not in all_name_tags: + continue + lang = tag.rsplit(':')[-1] if ':' in tag else None + languages.add(lang) + + names[lang].append(way[tag]) + + if base_name_tag in way: + names[None].append(way[base_name_tag]) + + if not names: + continue + + way_names.append(names) + + if not way_names or len(way_names) < 2: + continue language_components = {} - base_name_tags = [t for t in all_base_name_tags if t in tags] - if not base_name_tags: - base_name_tag = None - else: - base_name_tag = base_name_tags[0] + country = None - for tag in tags: - if tag.rsplit(':', 1)[0] in all_name_tags: - way_names = [(w[tag], w.get(base_name_tag) if base_name_tag else None) for w in ways if tag in w] - if len(way_names) < 2: - continue - namespaced_language = None - if ':' in tag: - namespaced_language = tag.rsplit(':')[-1] + for namespaced_language in languages: + address_components, country, language = self.components.expanded({}, latitude, longitude, language=namespaced_language) + language_components[namespaced_language] = address_components - if namespaced_language not in language_components: - address_components, country, language = self.components.expanded({}, latitude, longitude, language=namespaced_language) - language_components[namespaced_language] = (address_components, country, language) - else: - address_components, country, language = language_components[namespaced_language] + if country is None: + continue - intersection_phrase = Intersection.phrase(language, country=country) - if not intersection_phrase: - continue + for way1, way2 in itertools.combinations(way_names): + formatted_intersections = [] + for language in set(way1.keys()) & set(way2.keys()): + for w1, w2 in itertools.product(way1[language], way2[language]): + intersection_phrase = Intersection.phrase(language, country=country) + if not intersection_phrase: + continue - formatted_intersections = [] + address_components = language_components[language] - for (w1, w1_base), (w2, w2_base) in itertools.combinations(way_names, 2): w1 = self.components.cleaned_name(w1) w2 = self.components.cleaned_name(w2) intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2) formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components) - formatted_intersections.append(formatted) - if w1_base and not namespaced_language and random.random() < replace_with_base_name_prob: - w1 = self.components.cleand_name(w1_base) - - intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2) - formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components) - formatted_intersections.append(formatted) - - if w2_base and not namespaced_language and random.random() < replace_with_base_name_prob: - w2 = self.components.cleaned_name(w2_base) - - intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2) - formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components) - formatted_intersections.append(formatted) - - for formatted in formatted_intersections: if not formatted or not formatted.strip(): continue