From 22be892635a030a997d1f7ea79127ae612617eea Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 1 Jun 2016 12:35:48 -0400 Subject: [PATCH] [dictionaries] Updates to German dictionaries --- .../dictionaries/de/ambiguous_expansions.txt | 2 + resources/dictionaries/de/entrances.txt | 1 + .../dictionaries/de/level_types_numbered.txt | 4 +- resources/dictionaries/de/staircases.txt | 3 +- resources/dictionaries/de/unit_directions.txt | 2 + .../dictionaries/de/unit_types_numbered.txt | 4 +- scripts/geodata/addresses/components.py | 58 ++++++++++--------- 7 files changed, 43 insertions(+), 31 deletions(-) create mode 100644 resources/dictionaries/de/entrances.txt create mode 100644 resources/dictionaries/de/unit_directions.txt diff --git a/resources/dictionaries/de/ambiguous_expansions.txt b/resources/dictionaries/de/ambiguous_expansions.txt index 45c3d48b..838ea42f 100644 --- a/resources/dictionaries/de/ambiguous_expansions.txt +++ b/resources/dictionaries/de/ambiguous_expansions.txt @@ -5,8 +5,10 @@ g h i k +l n o +r s u v diff --git a/resources/dictionaries/de/entrances.txt b/resources/dictionaries/de/entrances.txt new file mode 100644 index 00000000..06f2e515 --- /dev/null +++ b/resources/dictionaries/de/entrances.txt @@ -0,0 +1 @@ +eingang \ No newline at end of file diff --git a/resources/dictionaries/de/level_types_numbered.txt b/resources/dictionaries/de/level_types_numbered.txt index 58f99d00..535d0b4f 100644 --- a/resources/dictionaries/de/level_types_numbered.txt +++ b/resources/dictionaries/de/level_types_numbered.txt @@ -1 +1,3 @@ -obergeschoss|og|o g \ No newline at end of file +etage +obergeschoss|og|o g +stock \ No newline at end of file diff --git a/resources/dictionaries/de/staircases.txt b/resources/dictionaries/de/staircases.txt index b3ba5503..29f69507 100644 --- a/resources/dictionaries/de/staircases.txt +++ b/resources/dictionaries/de/staircases.txt @@ -1 +1,2 @@ -stiege|stg \ No newline at end of file +stiege|stg +treppe \ No newline at end of file diff --git a/resources/dictionaries/de/unit_directions.txt b/resources/dictionaries/de/unit_directions.txt new file mode 100644 index 00000000..f5a2dd16 --- /dev/null +++ b/resources/dictionaries/de/unit_directions.txt @@ -0,0 +1,2 @@ +links|l +rechts|r \ No newline at end of file diff --git a/resources/dictionaries/de/unit_types_numbered.txt b/resources/dictionaries/de/unit_types_numbered.txt index f7b3fdbf..986c8dcf 100644 --- a/resources/dictionaries/de/unit_types_numbered.txt +++ b/resources/dictionaries/de/unit_types_numbered.txt @@ -1,5 +1,5 @@ -abteilung|abt +appartement|apt|app büro|buro|buero top -wohnung|whg|w +wohnung|whg|w|/ w|/ / w zimmer|zi \ No newline at end of file diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 014fa0c4..da0c4c20 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -399,40 +399,44 @@ class AddressComponents(object): def combine_fields(self, address_components, language, country=None, generated_components=None): combo_config = address_config.get_property('components.combinations', language, country=country, default={}) - values = [] - probs = [] + combos = [] + probs = {} generated_components = generated_components or set() - for k, v in six.iteritems(combo_config): - values.append(v) - probs.append(v['probability']) - - if not isclose(sum(probs), 1.0): - values.append(None) - probs.append(1.0 - sum(probs)) - - probs = cdf(probs) - - combo = weighted_choice(values, probs) - if combo is not None: + for k, combo in six.iteritems(combo_config): components = OrderedDict.fromkeys(combo['components']).keys() if not all((c in address_components and (c in generated_components or self.is_numeric(address_components[c])) for c in components)): - return None + continue - values = [] - probs = [] - for s in combo['separators']: - values.append(s['separator']) - probs.append(s['probability']) + combos.append((len(components), combo)) - probs = cdf(probs) - separator = weighted_choice(values, probs) + if not combos: + return None - new_label = combo['label'] - new_value = separator.join([address_components.pop(c) for c in components]) - address_components[new_label] = new_value - return new_label - return None + combos.sort(key=operator.itemgetter(0), reverse=True) + + for num_components, combo in combos: + prob = combo['probability'] + if random.random() < prob: + break + else: + return None + + components = OrderedDict.fromkeys(combo['components']).keys() + + values = [] + probs = [] + for s in combo['separators']: + values.append(s['separator']) + probs.append(s['probability']) + + probs = cdf(probs) + separator = weighted_choice(values, probs) + + new_label = combo['label'] + new_value = separator.join([address_components.pop(c) for c in components]) + address_components[new_label] = new_value + return new_label def generated_type(self, component, existing_components, language, country=None): component_config = address_config.get_property('components.{}'.format(component), language, country=country)