From 6861c09caa6bf9e96fac7a8e46db1ac8e4f9ec39 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 2 Jun 2016 21:06:29 -0400 Subject: [PATCH] [addresses/dictionaries] Adding Catalan address config --- resources/addresses/ca.yaml | 856 ++++++++++++++++++ resources/dictionaries/ca/cross_streets.txt | 8 + .../ca/level_types_sub_basement.txt | 1 + resources/dictionaries/ca/near.txt | 12 + resources/dictionaries/ca/unit_directions.txt | 2 +- .../dictionaries/ca/unit_types_numbered.txt | 10 +- 6 files changed, 886 insertions(+), 3 deletions(-) create mode 100644 resources/addresses/ca.yaml create mode 100644 resources/dictionaries/ca/cross_streets.txt create mode 100644 resources/dictionaries/ca/level_types_sub_basement.txt create mode 100644 resources/dictionaries/ca/near.txt diff --git a/resources/addresses/ca.yaml b/resources/addresses/ca.yaml new file mode 100644 index 00000000..8b518b3b --- /dev/null +++ b/resources/addresses/ca.yaml @@ -0,0 +1,856 @@ +# ca.yaml +# ------- +# Note: make Latin-American conventions by default (country overrides for Spain +# as well as any other country-specific norms) + +components: + level: + # If no floor number is specified + null_probability: 0.6 + alphanumeric_probability: 0.35 + standalone_probability: 0.05 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + # If no unit number is specified + null_probability: 0.3 + alphanumeric_probability: 0.65 + standalone_probability: 0.05 + +numbers: + default: &numero + canonical: número + abbreviated: "nº" + sample: true + canonical_probability: 0.1 + abbreviated_probability: 0.7 + sample_probability: 0.2 + sample_exclude: + - "#" + numeric: + direction: left + numeric_affix: + affix: "#" # e.g. #3, #2F, etc. 
+ probability: 0.5 + alternatives: + - alternative: + direction: left # affix goes on the number's left + + # Probabilities for numbers + numeric_probability: 0.7 + numeric_affix_probability: 0.3 + +and: + default: &i + canonical: i + abbreviated: "&" + sample: true + canonical_probability: 0.5 + abbreviated_probability: 0.4 + sample_probability: 0.1 + +house_numbers: + # sense número (s/n) addresses + no_number: + default: + canonical: sense número + abbreviated: s/n + sample: true + canonical_probability: 0.1 + abbreviated_probability: 0.7 + sample_probability: 0.2 + alphanumeric: + default: *numero + + alphanumeric_phrase_probability: 0.01 + no_number_probability: 0.1 # With this probability, use sense número if no house_number is specified + + + +levels: + # Everywhere except Spain + floor: &pis + canonical: pis + abbreviated: p + sample: true + canonical_probability: 0.8 + abbreviated_probability: 0.1 + sample_probability: 0.1 + numeric: + direction: left + add_number_phrase: true # Occasionally add variation of "number", e.g. Pis No 2 + add_number_phrase_probability: 0.05 + numeric_affix: + affix: p + direction: left # P2 + # e.g. 2o piso + ordinal: + direction: right + direction_probability: 0.95 # Let it vary occasionally e.g. Pis 2o + standalone_probability: 0.2 # Let e.g. 5º be the entire floor string + # If ordinal is selected, chance of e.g. 
just using 2o without Piso + null_phrase_probability: 0.6 + numeric_probability: 0.2 + numeric_affix_probability: 0.05 + ordinal_probability: 0.75 + # Ground floor + baixos: &baixos + canonical: baixos + abbreviated: bxs + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.3 + sample_probability: 0.1 + pis_baix: &pis_baix + canonical: pis baix + abbreviated: pb + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + sota: &sota + canonical: sota + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + # Used when floor number is < 0 (starts at -1 in all countries) + soterrani: &soterrani + canonical: soterrani + abbreviated: so + sample: true + canonical_probability: 0.5 + abbreviated_probability: 0.3 + sample_probability: 0.2 + # e.g. soterrani 1 + numeric: + direction: left + numeric_affix: + affix: so + direction: left + # e.g. segon soterrani + ordinal: + direction: right + standalone_probability: 0.985 + number_abs_value: true + number_min_abs_value: 1 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + sub_soterrani: &sub_soterrani + canonical: sub soterrani + abbreviated: ss + sample: true + # e.g. sub soterrani 1 + numeric: + direction: left + numeric_affix: + affix: ss + direction: left + # e.g. segon sub soterrani + ordinal: + direction: right + number_abs_value: true + number_min_abs_value: 2 + # Soterrani 2 == Sub-soterrani 1 + number_subtract_abs_value: 1 + standalone_probability: 0.985 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + entresol: &entresol + canonical: entresòl + abbreviated: entl + half_floors: true + sample: true + canonical_probability: 0.7 + abbreviated_probability: 0.2 + sample_probability: 0.1 + # e.g. entresòl 2 + numeric: + direction: left + # e.g. ent2 + numeric_affix: + affix: ent + direction: left + # e.g. 
segon entresòl + ordinal: + direction: right + numeric_probability: 0.1 + numeric_affix_probability: 0.1 + ordinal_probability: 0.2 + standalone_probability: 0.6 + pis_principal: &pis_principal + canonical: pis principal + abbreviated: pis pral + sample: true + canonical_probability: 0.2 + abbreviated_probability: 0.3 + sample_probability: 0.5 + principal: &principal + canonical: principal + abbreviated: pral + sample: true + canonical_probability: 0.2 + abbreviated_probability: 0.6 + sample_probability: 0.2 + atic: &atic + canonical: àtic + abbreviated: át + sample: true + canonical_probability: 0.7 + abbreviated_probability: 0.1 + sample_probability: 0.2 + sobreatic: &sobreatic + canonical: sobreàtic + aliases: + "<-1": + default: *soterrani + probability: 0.6 + alternatives: + - alternative: *sub_soterrani + probability: 0.3995 + - alternative: *pis + probability: 0.0005 + "-1": + default: *soterrani + probability: 0.9995 + alternatives: + - alternative: *pis + probability: 0.0005 + # Special token for half-floors + half_floors: + default: *entresol + "0": + default: *baixos + probability: 0.495 + alternatives: + - alternative: *pis_baix + probability: 0.395 + - alternative: *sota + probability: 0.1 + - alternative: *pis + # Piso 0 is uncommon + probability: 0.01 + top: + default: *pis + probability: 0.85 + alternatives: + - alternative: *atic + probability: 0.1 + - alternative: *sobreatic + probability: 0.05 + + numbering_starts_at: 0 + + alphanumeric: + default: *pis + add_number_phrase: true + add_number_phrase_probability: 0.05 + numeric_probability: 0.99 + alpha_probability: 0.01 + +blocks: + default: + canonical: bloc + abbreviated: bl + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.2 + sample_probability: 0.2 + numeric: + direction: left + +categories: + near: + default: + canonical: a prop de + probability: 0.5 + alternatives: + - alternative: + canonical: prop de + probability: 0.2 + - alternative: + canonical: prop + 
probability: 0.1 + - alternative: + canonical: a prop + probability: 0.1 + - alternative: + canonical: proper + probability: 0.05 + - alternative: + canonical: proper a + probability: 0.05 + + nearby: + default: + canonical: proper + probability: 0.5 + alternatives: + - alternative: + canonical: a prop + probability: 0.1 + - alternative: + canonical: a prop d'aquí + probability: 0.1 + - alternative: + canonical: a prop d'aqui + probability: 0.1 + - alternative: + canonical: aquí + probability: 0.1 + - alternative: + canonical: aqui + probability: 0.1 + near_me: + default: + canonical: a prop meu + in: + default: + canonical: a + probability: 0.6 + alternatives: + - alternative: + canonical: dins + probability: 0.2 + - alternative: + canonical: en + probability: 0.2 + # Probabilities of each phrase + near_probability: 0.35 + nearby_probability: 0.2 + near_me_probability: 0.1 + in_probability: 0.35 + +cross_streets: + and: *i + amb: &amb + canonical: amb + a: &a + canonical: a + corner_of: &cantonada_de + canonical: cantonada de + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + at_the_corner_of: &a_la_cantonada_de + canonical: a la cantonada de + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + corner: &cantonada + canonical: cantonada + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + + intersection: + default: *i + probability: 0.55 + alternatives: + - alternative: *amb + probability: 0.2 + - alternative: *a + probability: 0.1 + - alternative: *cantonada_de + probability: 0.09 + - alternative: *a_la_cantonada_de + probability: 0.05 + - alternative: *cantonada + probability: 0.01 + + between: + canonical: entre + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probabililty: 0.5 + + +po_boxes: + apartat: &apartat + canonical: apartat + abbreviated: apt + sample: true + canonical_probability: 0.5 + abbreviated_probability: 0.3 + sample_probability: 0.2 + numeric: + 
direction: left + add_number_phrase: true + add_number_phrase_probability: 0.4 # Apt No 1234 + numeric_probability: 1.0 + alphanumeric: + sample: false + default: *apartat + numeric_probability: 0.9 # Apt 123 + alpha_probability: 0.05 # Apt A + numeric_plus_alpha_probability: 0.04 # Apt 123G + alpha_plus_numeric_probability: 0.01 # Apt A123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +postcodes: + alphanumeric: + default: + canonical: codi postal + abbreviated: cp + sample: true + canonical_probability: 0.01 + abbreviated_probability: 0.95 + sample_probability: 0.04 + + numeric: + # Postcodes in Spain and Latin America are sometimes prefixed by CP + direction: left + + numeric_affix: + affix: cp + direction: left + # null_probability means the chance of doing nothing e.g. 
just the postal code + null_probability: 0.7 + numeric_probability: 0.18 + numeric_affix_probability: 0.12 + strict_numeric: true + +directions: + right: &dreta + canonical: dreta + abbreviated: dta + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: right + numeric_affix: + affix: d + direction: right + whitespace_probability: 0.1 + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + left: &esquerra + canonical: esquerra + abbreviated: esq + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: right + numeric_affix: + affix: e + direction: right + whitespace_probability: 0.1 + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + rear: &posterior + canonical: posterior + abbreviated: pos + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.2 + sample_probability: 0.2 + numeric: + direction: right + front: &front + canonical: front + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *dreta + probability: 0.45 + - alternative: *esquerra + probability: 0.45 + - alternative: *posterior + probability: 0.05 + - alternative: *front + probability: 0.05 + + anteroposterior: + alternatives: + - alternative: *front + probability: 0.5 + - alternative: *posterior + probability: 0.5 + + lateral: + alternatives: + - alternative: *dreta + probability: 0.5 + - alternative: *esquerra + probability: 0.5 + + + + +cardinal_directions: + east: &est + canonical: est + abbreviated: e + canonical_probability: 0.4 + abbreviated_probability: 0.6 + numeric: + direction: right + numeric_affix: + affix: e + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + west: &oest + canonical: oest + abbreviated: w + canonical_probability: 0.4 + abbreviated_probability: 0.6 + numeric: + direction: right + numeric_affix: + 
affix: w + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + north: &nord + canonical: nord + abbreviated: n + canonical_probability: 0.4 + abbreviated_probability: 0.6 + numeric: + direction: right + numeric_affix: + affix: n + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + south: &sud + canonical: sud + abbreviated: s + canonical_probability: 0.4 + abbreviated_probability: 0.6 + numeric: + direction: right + numeric_affix: + affix: s + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + alternatives: + - alternative: *nord + probability: 0.25 + - alternative: *est + probability: 0.25 + - alternative: *sud + probability: 0.25 + - alternative: *oest + probability: 0.25 + +entrances: + entrada: &entrada + canonical: entrada + abbreviated: entr + sample: true + canonical_probability: 0.5 + abbreviated_probability: 0.2 + sample_probability: 0.3 + numeric: + direction: left + + # Entrance 1, Entrance A, etc. + alphanumeric: + default: *entrada + numeric_probability: 0.1 # e.g. Entrance 1 + alpha_probability: 0.85 # e.g. Entrance A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + modifier: + alternatives: + - alternative: *nord + - alternative: *sud + - alternative: *est + - alternative: *oest + - alternative: *dreta + - alternative: *esquerra + - alternative: *posterior + - alternative: *front + +staircases: + escala: &escala + canonical: escala + abbreviated: esc + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + alphanumeric: + # For alphanumerics, Stair A, Stair 1, etc. + default: *escala + numeric_probability: 0.6 # e.g. Escala 1 + alpha_probability: 0.35 # e.g. 
Escala A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: right # e.g. Escala Esq + direction_probability: 0.8 + modifier: + alternatives: + - alternative: *nord + - alternative: *sud + - alternative: *est + - alternative: *oest + - alternative: *dreta + - alternative: *esquerra + - alternative: *posterior + - alternative: *front + +units: + flat: &apartament + canonical: apartament + abbreviated: apmt + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + door: &porta + canonical: porta + abbreviated: pta + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + # If it's just porta B, many times it's just e.g. 3o B for "tercer pis porta B" + null_phrase_probability: 0.15 + ordinal: + direction: right + gender: f + direction_probability: 0.95 # Let it vary occasionally e.g. Porta 2a + null_phrase_probability: 0.8 # Let e.g. 
5a be the entire unit string + numeric_probability: 0.25 + ordinal_probability: 0.75 + lletra: &lletra + canonical: lletra + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + numeric: + direction: left + office: &oficina + canonical: oficina + abbreviated: of + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.3 + sample_probability: 0.3 + numeric: + direction: left + # Another word for unit, used more in Colombia + unitat: &unitat + canonical: unitat + abbreviated: un + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + lot: &lot + canonical: lot + abbreviated: lt + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.2 + sample_probability: 0.2 + parcel: &parcella + canonical: parcel·la + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + habitacio: &habitacio + canonical: habitació + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + casa: &casa + canonical: casa + numeric: + direction: left + room: &sala + canonical: sala + numeric: + direction: left + alphanumeric: &unit_alphanumeric + default: *porta + probability: 0.8 + sample: true + alternatives: + - alternative: *apartament + probability: 0.1 + - alternative: *casa + probability: 0.1 + + # Separate random probability for adding directions like 2o Izq, 2 Dcha, etc. + add_direction: true + add_direction_probability: 0.1 + add_direction_numeric: true # Only for numbers + add_direction_standalone: true # A unit can be as simple as "D" + + numeric_probability: 0.7 # e.g. Porta 1a + numeric_plus_alpha_probability: 0.01 # e.g. Porta 1A + alpha_plus_numeric_probability: 0.01 # e.g. Porta A1 + alpha_probability: 0.28 # e.g. 
Porta A + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + alpha: + default: *porta + probability: 0.8 + alternatives: + - alternative: *lletra + probability: 0.12 + - alternative: *apartament + probability: 0.05 + - alternative: *casa + probability: 0.01 + - alternative: *unitat + probability: 0.01 + - alternative: *habitacio + probability: 0.01 + + zones: + residential: *unit_alphanumeric + commercial: + default: *oficina + probability: 0.8 + alternatives: + - alternative: *sala + probability: 0.2 + + numeric_probability: 0.9 # e.g. Oficina 1 + numeric_plus_alpha_probability: 0.01 # e.g. Oficina 1A + alpha_plus_numeric_probability: 0.01 # e.g. Oficina A1 + alpha_probability: 0.08 # e.g. Oficina A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + alpha: + default: *oficina + probability: 0.8 + alternatives: + - alternative: *sala + probability: 0.15 + - alternative: *lletra + probability: 0.05 + + industrial: + default: *lot + probability: 0.5 + alternatives: + - alternative: *oficina + probability: 0.3 + - alternative: *unitat + probability: 0.19 + - alternative: *parcella + probability: 0.01 + + numeric_probability: 0.9 # e.g. Lot 1 + numeric_plus_alpha_probability: 0.01 # e.g. Lot 1A + alpha_plus_numeric_probability: 0.01 # e.g. Lot A1 + alpha_probability: 0.08 # e.g. Lot A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + university: + default: *sala + probability: 0.9 + alternatives: + - alternative: *porta + probability: 0.1 + + numeric_probability: 0.9 # e.g. Sala 1 + numeric_plus_alpha_probability: 0.01 # e.g. Sala 1A + alpha_plus_numeric_probability: 0.01 # e.g. Sala A1 + alpha_probability: 0.08 # e.g. 
Sala A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + alpha: + default: *sala + probability: 0.9 + alternatives: + - alternative: *porta + probability: 0.08 + - alternative: *lletra + probability: 0.02 + + allotments: + lot: + default: *lot + numeric_probability: 0.8 + alphanumeric_probability: 0.1 + alpha_probability: 0.1 + parcel: + default: *parcella + numeric_probability: 0.3 + alphanumeric_probability: 0.3 + alpha_probability: 0.4 + lot_probability: 0.9 + parcel_probability: 0.06 + lot_plus_parcel_probability: 0.02 + parcel_plus_lot_probability: 0.02 diff --git a/resources/dictionaries/ca/cross_streets.txt b/resources/dictionaries/ca/cross_streets.txt new file mode 100644 index 00000000..f3d8a939 --- /dev/null +++ b/resources/dictionaries/ca/cross_streets.txt @@ -0,0 +1,8 @@ +& +a +i +amb +cantonada +cantonada de +a la cantonada de +entre|entre /|e / \ No newline at end of file diff --git a/resources/dictionaries/ca/level_types_sub_basement.txt b/resources/dictionaries/ca/level_types_sub_basement.txt new file mode 100644 index 00000000..2f8b1be7 --- /dev/null +++ b/resources/dictionaries/ca/level_types_sub_basement.txt @@ -0,0 +1 @@ +sub soterrani|ss|s.s|s s \ No newline at end of file diff --git a/resources/dictionaries/ca/near.txt b/resources/dictionaries/ca/near.txt new file mode 100644 index 00000000..568b3687 --- /dev/null +++ b/resources/dictionaries/ca/near.txt @@ -0,0 +1,12 @@ +a +a prop de +a prop +a prop d'aquí|a prop d'aqui|a prop daquí|a prop daqui +a prop meu +aquí|aqui +dins +en +prop +prop de +proper +proper a \ No newline at end of file diff --git a/resources/dictionaries/ca/unit_directions.txt b/resources/dictionaries/ca/unit_directions.txt index 51d9a34b..d25c0d61 100644 --- a/resources/dictionaries/ca/unit_directions.txt +++ b/resources/dictionaries/ca/unit_directions.txt @@ -1,2 +1,2 @@ dreta|d|dta -esquerra|e \ No newline at end of file +esquerra|e|esq \ No newline at end of file 
diff --git a/resources/dictionaries/ca/unit_types_numbered.txt b/resources/dictionaries/ca/unit_types_numbered.txt index 61ec64d5..90d417e1 100644 --- a/resources/dictionaries/ca/unit_types_numbered.txt +++ b/resources/dictionaries/ca/unit_types_numbered.txt @@ -1,4 +1,10 @@ -apartament|apmt|apt +apartament|apmt +casa habitació|hab|habitacio +lletra +lot|lt +oficina|of parcel·la|parc|parcella -porta|pta \ No newline at end of file +porta|pta +sala +unitat|un