diff --git a/resources/addresses/pt.yaml b/resources/addresses/pt.yaml index 34da6387..ae3570b4 100644 --- a/resources/addresses/pt.yaml +++ b/resources/addresses/pt.yaml @@ -19,10 +19,56 @@ components: unit: # If no unit number is specified - null_probability: 0.3 - alphanumeric_probability: 0.65 + null_probability: 0.7 + alphanumeric_probability: 0.25 standalone_probability: 0.05 + + combinations: + # For unit types like 2/34 (more common in Canada and Australia) + house_number_unit: + components: + - house_number + - unit + label: house_number + separators: + - separator: "-" + probability: 0.9 + - separator: " - " + probability: 0.05 + - separator: / + probability: 0.05 + probability: 0.005 + house_number_floor: + components: + - house_number + - unit + label: house_number + separators: + - separator: "-" + probability: 0.9 + - separator: " - " + probability: 0.05 + - separator: / + probability: 0.05 + probability: 0.005 + + house_number_staircase_unit: + components: + - house_number + - staircase + - unit + label: house_number + separators: + - separator: "-" + probability: 0.9 + - separator: " - " + probability: 0.05 + - separator: / + probability: 0.05 + probability: 0.005 + + numbers: default: &numero canonical: número @@ -148,6 +194,19 @@ levels: canonical_probability: 0.3 abbreviated_probability: 0.4 sample_probability: 0.3 + pavimento_terreo: &pavimento_terreo + canonical: pavimento terréo + abbreviated: pt + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + terreo: &terreo + canonical: terréo + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 baixos: &baixos canonical: baixos abbreviated: bxs @@ -663,8 +722,16 @@ units: sample_probability: 0.3 numeric: direction: left + conjunto: &conjunto + canonical: conjunto + abbreviated: conj + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.3 + sample_probability: 0.3 suite: &suite canonical: suite + sample: true canonical_probability: 0.7 sample_probability: 0.3 numeric: @@ -686,6 +753,13 @@ units: canonical: sala numeric: direction: left + unidade: &unidade + canonical: unidade + abbreviated: un + sample: true + canonical_probability: 0.7 + abbreviated_probability: 0.1 + sample_probability: 0.2 alphanumeric: &unit_alphanumeric default: *apartamento @@ -725,17 +799,19 @@ units: - alternative: *porta probability: 0.05 - alternative: *letra - probability: 0.02 + probability: 0.015 + - alternative: *unidade + probability: 0.005 zones: residential: *unit_alphanumeric commercial: - default: *suite + default: *sala probability: 0.6 alternatives: - alternative: *escritorio probability: 0.2 - - alternative: *sala + - alternative: *suite probability: 0.2 numeric_probability: 0.9 # e.g. escritório 1 @@ -748,12 +824,12 @@ units: whitespace_probability: 0.1 alpha: - default: *suite + default: *sala probability: 0.7 alternatives: - alternative: *escritorio probability: 0.15 - - alternative: *sala + - alternative: *suite probability: 0.1 - alternative: *letra probability: 0.05 @@ -830,16 +906,44 @@ countries: default: *andar_terreo probability: 0.4 alternatives: - - alternative: *rez_do_chao - probability: 0.34 + - alternative: *terreo + probability: 0.2 - alternative: *baixos probability: 0.2 + - alternative: *rez_do_chao + probability: 0.13 + - alternative: *pavimento_terreo + probability: 0.01 # Andar / Piso 0 is uncommon - alternative: *andar probability: 0.05 - alternative: *piso probability: 0.01 + + postcodes: + alphanumeric: + default: + canonical: código de endereçamento postal + abbreviated: cep + sample: true + canonical_probability: 0.001 + abbreviated_probability: 0.995 + sample_probability: 0.004 + + numeric: + # Postcodes in Brazil are sometimes prefixed by CEP + direction: left + + numeric_affix: + affix: cep + direction: left + # null_probability means the chance of doing nothing e.g. just the postal code + null_probability: 0.7 + numeric_probability: 0.18 + numeric_affix_probability: 0.12 + strict_numeric: true + po_boxes: alphanumeric: default: @@ -850,4 +954,30 @@ countries: abbreviated_probability: 0.6 sample_probability: 0.3 numeric: - direction: left \ No newline at end of file + direction: left + + units: + zones: + commercial: + default: *conjunto + probability: 0.8 + alternatives: + - alternative: *sala + probability: 0.1 + - alternative: *suite + probability: 0.05 + - alternative: *escritorio + probability: 0.05 + + alpha: + default: *conjunto + probability: 0.75 + alternatives: + - alternative: *sala + probability: 0.1 + - alternative: *suite + probability: 0.05 + - alternative: *escritorio + probability: 0.05 + - alternative: *letra + probability: 0.05 diff --git a/resources/dictionaries/pt/level_types_standalone.txt b/resources/dictionaries/pt/level_types_standalone.txt index d2103efe..d4da570b 100644 --- a/resources/dictionaries/pt/level_types_standalone.txt +++ b/resources/dictionaries/pt/level_types_standalone.txt @@ -1,3 +1,5 @@ andar terréo|at|a.t|a t|andar terreo baixos|bxs -rés-do-chão|res-do-chao|rés do chão|res do chao|résdochão|resdochao|rc|r / c|r.c|r c|rdc|r / d / c|r/r / chão|r / chao|r-d-c|r d c |r.d.c \ No newline at end of file +rés-do-chão|res-do-chao|rés do chão|res do chao|résdochão|resdochao|rc|r / c|r.c|r c|rdc|r / d / c|r/r / chão|r / chao|r-d-c|r d c |r.d.c +pavimento terréo|pt|p.t|p t|pavimento terreo +terréo|terreo diff --git a/resources/dictionaries/pt/no_number.txt b/resources/dictionaries/pt/no_number.txt index 8ec43d83..99965dac 100644 --- a/resources/dictionaries/pt/no_number.txt +++ b/resources/dictionaries/pt/no_number.txt @@ -1 +1 @@ -sem número|sem numero|sn|s.n.|s.n|s / n|s n \ No newline at end of file +sem número|sem numero|sn|s.n.|s.n|s / n|s n|s / nº|s.nº|snº|s / no \ No newline at end of file diff --git a/resources/dictionaries/pt/postcodes.txt b/resources/dictionaries/pt/postcodes.txt new file mode 100644 index 00000000..e201121f --- /dev/null +++ b/resources/dictionaries/pt/postcodes.txt @@ -0,0 +1 @@ +código de endereçamento postal|cep|c.e.p|c e p|c de e p|c de ep|codigo de enderecamento postal \ No newline at end of file diff --git a/resources/dictionaries/pt/unit_types_numbered.txt b/resources/dictionaries/pt/unit_types_numbered.txt index 6050b11b..b1e62ed9 100644 --- a/resources/dictionaries/pt/unit_types_numbered.txt +++ b/resources/dictionaries/pt/unit_types_numbered.txt @@ -1,8 +1,10 @@ apartamento|ap|apt|apto|apt.o casa +conjunto|conj|cj letra lote|lt parcela porta|pta moradia -sala \ No newline at end of file +sala +unidade|un \ No newline at end of file diff --git a/scripts/geodata/addresses/config.py b/scripts/geodata/addresses/config.py index 612ede70..104ed473 100644 --- a/scripts/geodata/addresses/config.py +++ b/scripts/geodata/addresses/config.py @@ -26,7 +26,7 @@ class AddressConfig(object): self.cache = {} for filename in os.listdir(config_dir): - if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml'): + if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml', 'pl.yaml'): continue config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))