@@ -11,7 +11,7 @@ env:
|
||||
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
|
||||
- GH_REF=github.com/openvenues/libpostal
|
||||
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/*.txt" | wc -l)
|
||||
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex" | wc -l)
|
||||
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
|
||||
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
||||
compiler:
|
||||
- clang
|
||||
@@ -22,11 +22,11 @@ addons:
|
||||
- ubuntu-toolchain-r-test
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- libsnappy-dev
|
||||
- pkg-config
|
||||
before_script:
|
||||
- ./bootstrap.sh
|
||||
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
|
||||
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/i18n/numex.py; fi;
|
||||
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
|
||||
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
|
||||
install:
|
||||
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
|
||||
|
||||
56
CODE_OF_CONDUCT.md
Normal file
56
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# The libpostal Code of Conduct
|
||||
|
||||
libpostal is all about addresses, and addresses are all about how people communicate. The philosophy of this project has always been one of multiculturalism, of acknowledging and embracing the specifics of our experiences, the nuances of our cultures, languages, identities, and intersections thereof. This code of conduct expresses the values we hope our contributors will emulate and adopt. We further acknowledge that machine learning models like the ones used in libpostal can reflect and sometimes reinforce the defaults, biases, and injustices that are encoded in the underlying training data and it is urgent and necessary that we stay vigilant in ensuring that open mapping data sets are representative of the under-represented.
|
||||
|
||||
> "Maps are an abstraction of reality, but whose reality are we talking about?"
|
||||
>
|
||||
> -- Kate Chapman of the Humanitarian OpenStreetMap Team
|
||||
|
||||
## Our Pledge
|
||||
|
||||
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, primary language(s), race, religion, or sexual identity and orientation.
|
||||
|
||||
It's a core tenet of this project to represent different cultures, languages, and realities. This can only be accomplished through creating an inclusive community that goes beyond simply tolerating differences, and instead intrinsically values and seeks out the diverse experiences of people around the world. We pledge to encourage and support people who are directly and/or indirectly marginalized in the open source community and in our societies.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to creating a positive environment include:
|
||||
|
||||
- Using welcoming and inclusive language
|
||||
- Being respectful of differing viewpoints and experiences
|
||||
- Gracefully accepting constructive criticism
|
||||
- Acknowledging the privilege that one holds in a conversation
|
||||
- Focusing on what is best for the community
|
||||
- Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior by participants include:
|
||||
|
||||
- Hate speech, overt or coded racism, use or promotion of hate symbols or imagery
|
||||
- The use of sexualized language or imagery and unwelcome sexual attention or advances
|
||||
- Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
- Public or private harassment
|
||||
- Use of condescending language
|
||||
- Making assumptions about technical knowledge based on personal characteristics
|
||||
- "Othering" of languages, cultures, or groups of people and their lived experiences
|
||||
- Publishing others' private information, such as a physical or electronic address, without explicit permission
|
||||
- Other conduct which could reasonably be considered inappropriate in a professional setting
|
||||
|
||||
## Our Responsibilities
|
||||
|
||||
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at Contact email address. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
|
||||
|
||||
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at http://contributor-covenant.org/version/1/4
|
||||
36
CONTRIBUTING.md
Normal file
36
CONTRIBUTING.md
Normal file
@@ -0,0 +1,36 @@
|
||||
## Submitting Issues
|
||||
|
||||
When submitting issues to libpostal, please respect these guidelines:
|
||||
|
||||
- Be constructive. Try to help solve the problem.
|
||||
- Always search for existing issues before submitting one.
|
||||
- If you've written your own address parsing library/service, whether open-source or proprietary, don't raise issues simply to advertise for your project/solution. Write about it elsewhere, and save the issues page for people who are actually using libpostal.
|
||||
|
||||
### Bad parses
|
||||
|
||||
Libpostal's parser uses machine learning. It improves as the data improves, but contrary to the hype, that doesn't mean it can do everything a human brain can do. Addresses have many edge cases, and while we cover a substantial number of them, we may not be able to handle every bizarre edge case that comes up.
|
||||
|
||||
When reporting a parser issue, only submit one issue per problematic *pattern* of address, preferably with multiple addresses attached. For each address, please include at minimum:
|
||||
|
||||
- Input address
|
||||
- Expected result
|
||||
- Can you find the address in [OpenStreetMap](https://openstreetmap.org)?
|
||||
- If libpostal is getting a place name like a city, suburb, or state wrong, can the admin component(s) name be found in OSM?
|
||||
- What's the minimum form of the address that will parse correctly? For instance, if "123 Main St New York, NY" is the problem address, will "123 Main St" work? Does it work without abbreviations, using local language names, without sub-building information like units?
|
||||
|
||||
Note: we don't claim to handle all of the formatting mistakes that abound in address data sets, so sometimes the input needs to be preprocessed in some way before sending to libpostal. Sometimes there simply is no immediate solution, and many times the solution is simply to add your address or some part of it to OSM.
|
||||
|
||||
However, if there's a specific place or style of address that libpostal gets wrong, often we can do something to help libpostal train for and understand that address.
|
||||
|
||||
|
||||
### Bugs
|
||||
|
||||
When submitting bug reports, please be sure to give us as much context as possible so that we can reproduce the error you encountered. Be sure to include:
|
||||
|
||||
- System conditions (OS, etc.)
|
||||
- Steps to reproduce
|
||||
- Expected outcome
|
||||
- Actual outcome
|
||||
- Screenshots or traceback
|
||||
- Input or code that exposes the bug, if possible
|
||||
|
||||
470
README.md
470
README.md
@@ -1,21 +1,26 @@
|
||||
# libpostal: international street address NLP
|
||||
|
||||
[](https://travis-ci.org/openvenues/libpostal) [](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
||||
[](#sponsors)
|
||||
[](#backers)
|
||||
[](#sponsors)
|
||||
[](#backers)
|
||||
|
||||
:jp: :us: :gb: :ru: :fr: :kr: :it: :es: :cn: :de:
|
||||
libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere. For a more comprehensive overview of the research behind libpostal, be sure to check out the (lengthy) introductory blog posts:
|
||||
|
||||
libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. This [introductory blog post](https://medium.com/@albarrentine/statistical-nlp-on-openstreetmap-b9d573e6cc86) is a good overview of the research and thought process that went in.
|
||||
- **Original post**: [Statistical NLP on OpenStreetMap](https://medium.com/@albarrentine/statistical-nlp-on-openstreetmap-b9d573e6cc86)
|
||||
- **Follow-up for 1.0 release**: [Statistical NLP on OpenStreetMap: Part 2](https://medium.com/@albarrentine/statistical-nlp-on-openstreetmap-part-2-80405b988718)
|
||||
|
||||
<span>🇧🇷</span> <span>🇫🇮</span> <span>🇳🇬</span> :jp: <span>🇽🇰 </span> <span>🇧🇩 </span> <span>🇵🇱 </span> <span>🇻🇳 </span> <span>🇧🇪 </span> <span>🇲🇦 </span> <span>🇺🇦 </span> <span>🇯🇲 </span> :ru: <span>🇮🇳 </span> <span>🇱🇻 </span> <span>🇧🇴 </span> :de: <span>🇸🇳 </span> <span>🇦🇲 </span> :kr: <span>🇳🇴 </span> <span>🇲🇽 </span> <span>🇨🇿 </span> <span>🇹🇷 </span> :es: <span>🇸🇸 </span> <span>🇪🇪 </span> <span>🇧🇭 </span> <span>🇳🇱 </span> :cn: <span>🇵🇹 </span> <span>🇵🇷 </span> :gb: <span>🇵🇸 </span>
|
||||
|
||||
Addresses and the locations they represent are essential for any application dealing with maps (place search, transportation, on-demand/delivery services, check-ins, reviews). Yet even the simplest addresses are packed with local conventions, abbreviations and context, making them difficult to index/query effectively with traditional full-text search engines. This library helps convert the free-form addresses that humans use into clean normalized forms suitable for machine comparison and full-text indexing. Though libpostal is not itself a full geocoder, it can be used as a preprocessing step to make any geocoding application smarter, simpler, and more consistent internationally.
|
||||
|
||||
<span>🇷🇴 </span> <span>🇬🇭 </span> <span>🇦🇺 </span> <span>🇲🇾 </span> <span>🇭🇷 </span> <span>🇭🇹 </span> :us: <span>🇿🇦 </span> <span>🇷🇸 </span> <span>🇨🇱 </span> :it: <span>🇰🇪 </span> <span>🇨🇭 </span> <span>🇨🇺 </span> <span>🇸🇰 </span> <span>🇦🇴 </span> <span>🇩🇰 </span> <span>🇹🇿 </span> <span>🇦🇱 </span> <span>🇨🇴 </span> <span>🇮🇱 </span> <span>🇬🇹 </span> :fr: <span>🇵🇭 </span> <span>🇦🇹 </span> <span>🇱🇨 </span> <span>🇮🇸 </span> <span>🇮🇩 </span> <span>🇦🇪 </span> <span>🇸🇰 </span> <span>🇹🇳 </span> <span>🇰🇭 </span> <span>🇦🇷 </span> <span>🇭🇰 </span>
|
||||
|
||||
The core library is written in pure C. Language bindings for [Python](https://github.com/openvenues/pypostal), [Ruby](https://github.com/openvenues/ruby_postal), [Go](https://github.com/openvenues/gopostal), [Java](https://github.com/openvenues/jpostal), [PHP](https://github.com/openvenues/php-postal), and [NodeJS](https://github.com/openvenues/node-postal) are officially supported and it's easy to write bindings in other languages.
|
||||
|
||||
Sponsors
|
||||
------------
|
||||
--------
|
||||
|
||||
If your company is using libpostal, consider asking your organization to sponsor the project and help fund our continued research into geo + NLP. Interpreting what humans mean when they refer to locations is far from a solved problem, and sponsorships help us pursue new frontiers in machine geospatial intelligence. As a sponsor, your company logo will appear prominently on the Github repo page along with a link to your site. [Sponsorship info](https://opencollective.com/libpostal#sponsor)
|
||||
If your company is using libpostal, consider asking your organization to sponsor the project. Interpreting what humans mean when they refer to locations is far from a solved problem, and sponsorships help us pursue new frontiers in geospatial NLP. As a sponsor, your company logo will appear prominently on the Github repo page along with a link to your site. [Sponsorship info](https://opencollective.com/libpostal#sponsor)
|
||||
|
||||
<a href="https://opencollective.com/libpostal/sponsor/0/website" target="_blank"><img src="https://opencollective.com/libpostal/sponsor/0/avatar.svg"></a>
|
||||
<a href="https://opencollective.com/libpostal/sponsor/1/website" target="_blank"><img src="https://opencollective.com/libpostal/sponsor/1/avatar.svg"></a>
|
||||
@@ -84,142 +89,6 @@ Individual users can also help support open geo NLP research by making a monthly
|
||||
<a href="https://opencollective.com/libpostal/backer/28/website" target="_blank"><img src="https://opencollective.com/libpostal/backer/28/avatar.svg"></a>
|
||||
<a href="https://opencollective.com/libpostal/backer/29/website" target="_blank"><img src="https://opencollective.com/libpostal/backer/29/avatar.svg"></a>
|
||||
|
||||
Examples of parsing
|
||||
-------------------
|
||||
|
||||
libpostal implements the first statistical address parser that works well internationally,
|
||||
trained on ~50 million addresses in over 100 countries and as many
|
||||
languages. We use OpenStreetMap (anything with an addr:* tag) and the OpenCage
|
||||
address format templates at: https://github.com/OpenCageData/address-formatting
|
||||
to construct the training data, supplementing with containing polygons and
|
||||
perturbing the inputs in a number of ways to make the parser as robust as possible
|
||||
to messy real-world input.
|
||||
|
||||
These example parse results are taken from the interactive address_parser program
|
||||
that builds with libpostal when you run ```make```. Note that the parser is robust to
|
||||
commas vs. no commas, casing, different permutations of components (if the input
|
||||
is e.g. just city or just city/postcode).
|
||||
|
||||

|
||||
|
||||
The parser achieves very high accuracy on held-out data, currently 98.9%
|
||||
correct full parses (meaning a 1 in the numerator for getting *every* token
|
||||
in the address correct).
|
||||
|
||||
Usage (parser)
|
||||
--------------
|
||||
|
||||
Here's an example of the parser API using the Python bindings:
|
||||
|
||||
```python
|
||||
|
||||
from postal.parser import parse_address
|
||||
parse_address('The Book Club 100-106 Leonard St Shoreditch London EC2A 4RH, United Kingdom')
|
||||
```
|
||||
|
||||
And an example with the C API:
|
||||
|
||||
```c
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Setup (only called once at the beginning of your program)
|
||||
if (!libpostal_setup() || !libpostal_setup_parser()) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
address_parser_options_t options = get_libpostal_address_parser_default_options();
|
||||
address_parser_response_t *parsed = parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);
|
||||
|
||||
for (size_t i = 0; i < parsed->num_components; i++) {
|
||||
printf("%s: %s\n", parsed->labels[i], parsed->components[i]);
|
||||
}
|
||||
|
||||
// Free parse result
|
||||
address_parser_response_destroy(parsed);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_parser();
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Examples of normalization
|
||||
-------------------------
|
||||
|
||||
The expand_address API converts messy real-world addresses into normalized
|
||||
equivalents suitable for search indexing, hashing, etc.
|
||||
|
||||
Here's an interactive example using the Python binding:
|
||||
|
||||

|
||||
|
||||
libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given
|
||||
address so it can apply the appropriate normalizations. The only input needed is the raw address string.
|
||||
Here's a short list of some less straightforward normalizations in various languages.
|
||||
|
||||
| Input | Output (may be multiple in libpostal) |
|
||||
| ----------------------------------- |-----------------------------------------|
|
||||
| One-hundred twenty E 96th St | 120 east 96th street |
|
||||
| C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 |
|
||||
| V XX Settembre, 20 | via 20 settembre 20 |
|
||||
| Quatre vingt douze R. de l'Église | 92 rue de l' église |
|
||||
| ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 |
|
||||
| ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 |
|
||||
| Marktstrasse 14 | markt straße 14 |
|
||||
|
||||
libpostal currently supports these types of normalizations in *60+ languages*,
|
||||
and you can [add more](https://github.com/openvenues/libpostal/tree/master/resources/dictionaries)
|
||||
(without having to write any C).
|
||||
|
||||
For further reading and some bizarre address edge-cases, see:
|
||||
[Falsehoods Programmers Believe About Addresses](https://www.mjt.me.uk/posts/falsehoods-programmers-believe-about-addresses/).
|
||||
|
||||
Usage (normalization)
|
||||
---------------------
|
||||
|
||||
Here's an example using the Python bindings for succinctness (most of the higher-level language bindings are similar):
|
||||
|
||||
```python
|
||||
from postal.expand import expand_address
|
||||
expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées')
|
||||
|
||||
assert '92 avenue des champs-elysees' in set(expansions)
|
||||
```
|
||||
|
||||
The C API equivalent is a few more lines, but still fairly simple:
|
||||
|
||||
```c
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Setup (only called once at the beginning of your program)
|
||||
if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
size_t num_expansions;
|
||||
normalize_options_t options = get_libpostal_default_options();
|
||||
char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
|
||||
|
||||
for (size_t i = 0; i < num_expansions; i++) {
|
||||
printf("%s\n", expansions[i]);
|
||||
}
|
||||
|
||||
// Free expansions
|
||||
expansion_array_destroy(expansions, num_expansions);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_language_classifier();
|
||||
}
|
||||
```
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
@@ -227,17 +96,17 @@ Before you install, make sure you have the following prerequisites:
|
||||
|
||||
**On Ubuntu/Debian**
|
||||
```
|
||||
sudo apt-get install curl libsnappy-dev autoconf automake libtool pkg-config
|
||||
sudo apt-get install curl autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
**On CentOS/RHEL**
|
||||
```
|
||||
sudo yum install snappy snappy-devel autoconf automake libtool pkgconfig
|
||||
sudo yum install curl autoconf automake libtool pkgconfig
|
||||
```
|
||||
|
||||
**On Mac OSX**
|
||||
```
|
||||
brew install snappy autoconf automake libtool pkg-config
|
||||
brew install curl autoconf automake libtool pkg-config
|
||||
```
|
||||
|
||||
Then to install the C library:
|
||||
@@ -268,37 +137,159 @@ For example, if you write a program called app.c, you can compile it like this:
|
||||
gcc app.c `pkg-config --cflags --libs libpostal`
|
||||
```
|
||||
|
||||
Bindings
|
||||
--------
|
||||
Examples of parsing
|
||||
-------------------
|
||||
|
||||
Libpostal is designed to be used by higher-level languages. If you don't see your language of choice, or if you're writing a language binding, please let us know!
|
||||
libpostal's international address parser uses machine learning (Conditional Random Fields) and is trained on over 1 billion addresses in every inhabited country on Earth. We use [OpenStreetMap](https://openstreetmap.org) and [OpenAddresses](https://openaddresses.io) as sources of structured addresses, and the OpenCage address format templates at: https://github.com/OpenCageData/address-formatting to construct the training data, supplementing with containing polygons, and generating sub-building components like apartment/floor numbers and PO boxes. We also add abbreviations, drop out components at random, etc. to make the parser as robust as possible to messy real-world input.
|
||||
|
||||
**Officially supported language bindings**
|
||||
These example parse results are taken from the interactive address_parser program
|
||||
that builds with libpostal when you run ```make```. Note that the parser can handle
|
||||
commas vs. no commas as well as various casings and permutations of components (if the input
|
||||
is e.g. just city or just city/postcode).
|
||||
|
||||
- Python: [pypostal](https://github.com/openvenues/pypostal)
|
||||
- Ruby: [ruby_postal](https://github.com/openvenues/ruby_postal)
|
||||
- Go: [gopostal](https://github.com/openvenues/gopostal)
|
||||
- Java/JVM: [jpostal](https://github.com/openvenues/jpostal)
|
||||
- PHP: [php-postal](https://github.com/openvenues/php-postal)
|
||||
- NodeJS: [node-postal](https://github.com/openvenues/node-postal)
|
||||
- R: [poster](https://github.com/ironholds/poster)
|
||||

|
||||
|
||||
**Unofficial language bindings**
|
||||
The parser achieves very high accuracy on held-out data, currently 99.45%
|
||||
correct full parses (meaning a 1 in the numerator for getting *every* token
|
||||
in the address correct).
|
||||
|
||||
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
|
||||
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
|
||||
Usage (parser)
|
||||
--------------
|
||||
|
||||
**Database extensions**
|
||||
Here's an example of the parser API using the Python bindings:
|
||||
|
||||
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
|
||||
```python
|
||||
|
||||
**Unofficial REST API**
|
||||
from postal.parser import parse_address
|
||||
parse_address('The Book Club 100-106 Leonard St Shoreditch London EC2A 4RH, United Kingdom')
|
||||
```
|
||||
|
||||
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)
|
||||
And an example with the C API:
|
||||
|
||||
**Libpostal REST Docker**
|
||||
```c
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
|
||||
int main(int argc, char **argv) {
|
||||
// Setup (only called once at the beginning of your program)
|
||||
if (!libpostal_setup() || !libpostal_setup_parser()) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||
libpostal_address_parser_response_t *parsed = libpostal_parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);
|
||||
|
||||
for (size_t i = 0; i < parsed->num_components; i++) {
|
||||
printf("%s: %s\n", parsed->labels[i], parsed->components[i]);
|
||||
}
|
||||
|
||||
// Free parse result
|
||||
libpostal_address_parser_response_destroy(parsed);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_parser();
|
||||
}
|
||||
```
|
||||
|
||||
Parser labels
|
||||
-------------
|
||||
|
||||
The address parser can technically use any string labels that are defined in the training data, but these are the ones currently defined, based on the fields defined in [OpenCage's address-formatting library](https://github.com/OpenCageData/address-formatting), as well as a few added by libpostal to handle specific patterns:
|
||||
|
||||
- **house**: venue name e.g. "Brooklyn Academy of Music", and building names e.g. "Empire State Building"
|
||||
- **category**: for category queries like "restaurants", etc.
|
||||
- **near**: phrases like "in", "near", etc. used after a category phrase to help with parsing queries like "restaurants in Brooklyn"
|
||||
- **house_number**: usually refers to the external (street-facing) building number. In some countries this may be a compound, hyphenated number which also includes an apartment number, or a block number (a la Japan), but libpostal will just call it the house_number for simplicity.
|
||||
- **road**: street name(s)
|
||||
- **unit**: an apartment, unit, office, lot, or other secondary unit designator
|
||||
- **level**: expressions indicating a floor number e.g. "3rd Floor", "Ground Floor", etc.
|
||||
- **staircase**: numbered/lettered staircase
|
||||
- **entrance**: numbered/lettered entrance
|
||||
- **po_box**: post office box: typically found in non-physical (mail-only) addresses
|
||||
- **postcode**: postal codes used for mail sorting
|
||||
- **suburb**: usually an unofficial neighborhood name like "Harlem", "South Bronx", or "Crown Heights"
|
||||
- **city_district**: these are usually boroughs or districts within a city that serve some official purpose e.g. "Brooklyn" or "Hackney" or "Bratislava IV"
|
||||
- **city**: any human settlement including cities, towns, villages, hamlets, localities, etc.
|
||||
- **island**: named islands e.g. "Maui"
|
||||
- **state_district**: usually a second-level administrative division or county.
|
||||
- **state**: a first-level administrative division. Scotland, Northern Ireland, Wales, and England in the UK are mapped to "state" as well (convention used in OSM, GeoPlanet, etc.)
|
||||
- **country_region**: informal subdivision of a country without any political status
|
||||
- **country**: sovereign nations and their dependent territories, anything with an [ISO-3166 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2).
|
||||
- **world_region**: currently only used for appending “West Indies” after the country name, a pattern frequently used in the English-speaking Caribbean e.g. “Jamaica, West Indies”
|
||||
|
||||
Examples of normalization
|
||||
-------------------------
|
||||
|
||||
The expand_address API converts messy real-world addresses into normalized
|
||||
equivalents suitable for search indexing, hashing, etc.
|
||||
|
||||
Here's an interactive example using the Python binding:
|
||||
|
||||

|
||||
|
||||
libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given
|
||||
address so it can apply the appropriate normalizations. The only input needed is the raw address string.
|
||||
Here's a short list of some less straightforward normalizations in various languages.
|
||||
|
||||
| Input | Output (may be multiple in libpostal) |
|
||||
| ----------------------------------- |-----------------------------------------|
|
||||
| One-hundred twenty E 96th St | 120 east 96th street |
|
||||
| C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 |
|
||||
| V XX Settembre, 20 | via 20 settembre 20 |
|
||||
| Quatre vingt douze R. de l'Église | 92 rue de l eglise |
|
||||
| ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 |
|
||||
| ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 |
|
||||
| Marktstraße 14 | markt strasse 14 |
|
||||
|
||||
libpostal currently supports these types of normalizations in *60+ languages*,
|
||||
and you can [add more](https://github.com/openvenues/libpostal/tree/master/resources/dictionaries) (without having to write any C).
|
||||
|
||||
For further reading and some bizarre address edge-cases, see:
|
||||
[Falsehoods Programmers Believe About Addresses](https://www.mjt.me.uk/posts/falsehoods-programmers-believe-about-addresses/).
|
||||
|
||||
Usage (normalization)
|
||||
---------------------
|
||||
|
||||
Here's an example using the Python bindings for succinctness (most of the higher-level language bindings are similar):
|
||||
|
||||
```python
|
||||
from postal.expand import expand_address
|
||||
expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées')
|
||||
|
||||
assert '92 avenue des champs-elysees' in set(expansions)
|
||||
```
|
||||
|
||||
The C API equivalent is a few more lines, but still fairly simple:
|
||||
|
||||
```c
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Setup (only called once at the beginning of your program)
|
||||
if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
size_t num_expansions;
|
||||
libpostal_normalize_options_t options = libpostal_get_default_options();
|
||||
char **expansions = libpostal_expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
|
||||
|
||||
for (size_t i = 0; i < num_expansions; i++) {
|
||||
printf("%s\n", expansions[i]);
|
||||
}
|
||||
|
||||
// Free expansions
|
||||
libpostal_expansion_array_destroy(expansions, num_expansions);
|
||||
|
||||
// Teardown (only called once at the end of your program)
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_language_classifier();
|
||||
}
|
||||
```
|
||||
|
||||
Command-line usage (expand)
|
||||
---------------------------
|
||||
@@ -331,6 +322,45 @@ cd src/
|
||||
address_parser is an interactive shell. Just type addresses and libpostal will
|
||||
parse them and print the result.
|
||||
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
Libpostal is designed to be used by higher-level languages. If you don't see your language of choice, or if you're writing a language binding, please let us know!
|
||||
|
||||
**Officially supported language bindings**
|
||||
|
||||
- Python: [pypostal](https://github.com/openvenues/pypostal)
|
||||
- Ruby: [ruby_postal](https://github.com/openvenues/ruby_postal)
|
||||
- Go: [gopostal](https://github.com/openvenues/gopostal)
|
||||
- Java/JVM: [jpostal](https://github.com/openvenues/jpostal)
|
||||
- PHP: [php-postal](https://github.com/openvenues/php-postal)
|
||||
- NodeJS: [node-postal](https://github.com/openvenues/node-postal)
|
||||
- R: [poster](https://github.com/ironholds/poster)
|
||||
|
||||
**Unofficial language bindings**
|
||||
|
||||
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
|
||||
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
|
||||
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
|
||||
|
||||
**Database extensions**
|
||||
|
||||
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
|
||||
|
||||
**Unofficial REST API**
|
||||
|
||||
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)
|
||||
|
||||
**Libpostal REST Docker**
|
||||
|
||||
- Libpostal REST Docker: [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
|
||||
|
||||
**Libpostal ZeroMQ Docker**
|
||||
|
||||
- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/), Source: [GitHub](https://github.com/pasupulaphani/libpostal-docker)
|
||||
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
@@ -342,19 +372,18 @@ make check
|
||||
|
||||
Adding [test cases](https://github.com/openvenues/libpostal/tree/master/test) is easy, even if your C is rusty/non-existent, and we'd love contributions. We use mostly functional tests checking string input against string output.
|
||||
|
||||
libpostal also gets periodically battle-tested on tens of millions of addresses from OSM (clean) as well as anonymized queries from a production geocoder (not so clean). During this process we use valgrind to check for memory leaks and other errors.
|
||||
libpostal also gets periodically battle-tested on millions of addresses from OSM (clean) as well as anonymized queries from a production geocoder (not so clean). During this process we use valgrind to check for memory leaks and other errors.
|
||||
|
||||
Data files
|
||||
----------
|
||||
|
||||
libpostal needs to download some data files from S3. The basic files are on-disk
|
||||
representations of the data structures necessary to perform expansion. For address
|
||||
parsing, since model training takes about a day, we publish the fully trained model
|
||||
to S3 and will update it automatically as new addresses get added to OSM. Same goes for
|
||||
the language classifier model.
|
||||
parsing, since model training takes a few days, we publish the fully trained model
|
||||
to S3 and will update it automatically as new addresses get added to OSM, OpenAddresses, etc. Same goes for the language classifier model.
|
||||
|
||||
Data files are automatically downloaded when you run make. To check for and download
|
||||
any new data files, run:
|
||||
any new data files, you can either run ```make```, or run:
|
||||
|
||||
```
|
||||
libpostal_data download all $YOUR_DATA_DIR/libpostal
|
||||
@@ -367,6 +396,27 @@ Language dictionaries
|
||||
|
||||
libpostal contains a number of per-language dictionaries that influence expansion, the language classifier, and the parser. To explore the dictionaries or contribute abbreviations/phrases in your language, see [resources/dictionaries](https://github.com/openvenues/libpostal/tree/master/resources/dictionaries).
|
||||
|
||||
Training data
|
||||
-------------
|
||||
|
||||
In machine learning, large amounts of training data are often essential for getting good results. Many open-source machine learning projects either release only the model code (results reproducible if and only if you're Google), or a pre-baked model where the training conditions are unknown.
|
||||
|
||||
Libpostal is a bit different because it's trained on open data that's available to everyone, so we've released the entire training pipeline (the [geodata](https://github.com/openvenues/libpostal/tree/master/scripts/geodata) package in this repo), as well as the resulting training data itself on S3. It's over 100GB unzipped.
|
||||
|
||||
Training data are stored on S3 by the date they were created. There's also a file stored on S3 to point to the most recent training data. To always point to the latest data, use something like: ```latest=$(curl https://s3.amazonaws.com/libpostal/training_data/latest)``` and use that variable in place of the date.
|
||||
|
||||
### Parser training sets ###
|
||||
All files can be found at https://d1p366rbd94x8u.cloudfront.net/training_data/$YYYY-MM-DD/parser/$FILE as gzip'd tab-separated values (TSV) files formatted like: ```language\tcountry\taddress```.
|
||||
|
||||
- **formatted_addresses_tagged.random.tsv.gz** (ODBL): OSM addresses. Apartments, PO boxes, categories, etc. are added primarily to these examples
|
||||
- **formatted_places_tagged.random.tsv.gz** (ODBL): every toponym in OSM (even cities represented as points, etc.), reverse-geocoded to its parent admins, possibly including postal codes if they're listed on the point/polygon. Every place gets a base level of representation and places with higher populations get proportionally more.
|
||||
- **formatted_ways_tagged.random.tsv.gz** (ODBL): every street in OSM (ways with highway=*, with a few conditions), reverse-geocoded to its admins
|
||||
- **geoplanet_formatted_addresses_tagged.random.tsv.gz** (CC-BY): every postal code in Yahoo GeoPlanet (includes almost every postcode in the UK, Canada, etc.) and their parent admins. The GeoPlanet admins have been cleaned up and mapped to libpostal's tagset
|
||||
- **openaddresses_formatted_addresses_tagged.random.tsv.gz** (various licenses, mostly CC-BY): most of the address data sets from [OpenAddresses](https://openaddresses.io/), which in turn come directly from government sources
|
||||
- **uk_openaddresses_formatted_addresses_tagged.random.tsv.gz** (CC-BY): addresses from [OpenAddresses UK](https://alpha.openaddressesuk.org/)
|
||||
|
||||
If the parser doesn't perform as well as you'd hoped on a particular type of address, the best recourse is to use grep/awk to look through the training data and try to determine if there's some pattern/style of address that's not being captured.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
@@ -377,18 +427,18 @@ whitespace e.g. Chinese) are supported, as are Germanic languages where
|
||||
thoroughfare types are concatenated onto the end of the string, and may
|
||||
optionally be separated so Rosenstraße and Rosen Straße are equivalent.
|
||||
|
||||
- **International address parsing**: sequence model which parses
|
||||
- **International address parsing**: [Conditional Random Field](http://blog.echen.me/2012/01/03/introduction-to-conditional-random-fields/) which parses
|
||||
"123 Main Street New York New York" into {"house_number": 123, "road":
|
||||
"Main Street", "city": "New York", "state": "New York"}. The parser works
|
||||
for a wide variety of countries and languages, not just US/English.
|
||||
The model is trained on > 50M OSM addresses, using the
|
||||
The model is trained on over 1 billion addresses and address-like strings, using the
|
||||
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
|
||||
tagged training examples for most countries around the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/osm/osm_address_training_data.py)
|
||||
tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
|
||||
are performed to make the training data resemble real messy geocoder input as closely as possible.
|
||||
|
||||
- **Language classification**: multinomial logistic regression
|
||||
trained on all of OpenStreetMap ways, addr:* tags, toponyms and formatted
|
||||
addresses. Labels are derived using point-in-polygon tests in Quattroshapes
|
||||
trained (using the [FTRL-Proximal](https://research.google.com/pubs/archive/41159.pdf) method to induce sparsity) on all of OpenStreetMap ways, addr:* tags, toponyms and formatted
|
||||
addresses. Labels are derived using point-in-polygon tests for both OSM countries
|
||||
and official/regional languages for countries and admin 1 boundaries
|
||||
respectively. So, for example, Spanish is the default language in Spain but
|
||||
in different regions e.g. Catalunya, Galicia, the Basque region, the respective
|
||||
@@ -424,34 +474,11 @@ Latin scripts in the same address). In transliteration we can use all
|
||||
applicable transliterators for a given Unicode script (Greek can for instance
|
||||
be transliterated with Greek-Latin, Greek-Latin-BGN and Greek-Latin-UNGEGN).
|
||||
|
||||
Roadmap
|
||||
-------
|
||||
|
||||
- **Geographic name aliasing (coming soon)**: New York, NYC and Nueva York alias
|
||||
to New York City. Uses the crowd-sourced GeoNames (geonames.org) database, so alternate
|
||||
names added by contributors can automatically improve libpostal.
|
||||
|
||||
- **Geographic disambiguation (coming soon)**: There are several equally
|
||||
likely Springfields in the US (formally known as The Simpsons problem), and
|
||||
some context like a state is required to disambiguate. There are also > 1200
|
||||
distinct San Franciscos in the world but the term "San Francisco" almost always
|
||||
refers to the one in California. Williamsburg can refer to a neighborhood in
|
||||
Brooklyn or a city in Virginia. Geo disambiguation is a subset of Word Sense
|
||||
Disambiguation, and attempts to resolve place names in a string to GeoNames
|
||||
entities. This can be useful for city-level geocoding suitable for polygon/area
|
||||
lookup. By default, if there is no other context, as in the San Francisco case,
|
||||
the most populous entity will be selected.
|
||||
|
||||
- **Ambiguous token classification (coming soon)**: e.g. "dr" => "doctor" or
|
||||
"drive" for an English address depending on the context. Multiclass logistic
|
||||
regression trained on OSM addresses, where abbreviations are discouraged,
|
||||
giving us many examples of fully qualified addresses on which to train.
|
||||
|
||||
Non-goals
|
||||
---------
|
||||
|
||||
- Verifying that a location is a valid address
|
||||
- Street-level geocoding
|
||||
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
|
||||
|
||||
Raison d'être
|
||||
-------------
|
||||
@@ -532,8 +559,8 @@ needed to construct the training data.
|
||||
where we may be limited to < 1GB of RAM per process depending on the machine
|
||||
configuration. As much as possible libpostal uses contiguous arrays, tries
|
||||
(built on contiguous arrays), bloom filters and compressed sparse matrices to
|
||||
keep memory usage low. It's conceivable that libpostal could even be used on
|
||||
a mobile device, although that's not an explicit goal of the project.
|
||||
keep memory usage low. It's possible to use libpostal on a mobile device with
|
||||
models trained on a single country or a handful of countries.
|
||||
|
||||
3. **Performance**: this is last on the list for a reason. Most of the
|
||||
optimizations in libpostal are for memory usage rather than performance.
|
||||
@@ -546,8 +573,8 @@ isn't as important because everything's being done in parallel, but there are
|
||||
some streaming ingestion applications at Mapzen where this needs to
|
||||
run in-process.
|
||||
|
||||
C codebase
|
||||
----------
|
||||
C conventions
|
||||
-------------
|
||||
|
||||
libpostal is written in modern, legible C99 and uses the following conventions:
|
||||
|
||||
@@ -559,31 +586,30 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
|
||||
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
|
||||
- Data structures take advantage of sparsity as much as possible
|
||||
- Efficient double-array trie implementation for most string dictionaries
|
||||
- Tries to stay cross-platform as much as possible, particularly for *nix
|
||||
- Cross-platform as much as possible, particularly for *nix
|
||||
|
||||
Python codebase
|
||||
---------------
|
||||
Preprocessing (Python)
|
||||
----------------------
|
||||
|
||||
The [geodata](https://github.com/openvenues/libpostal/tree/master/scripts/geodata) package in the libpostal repo is a confederation of scripts for preprocessing the various geo
|
||||
data sets and building input files for the C lib to use during model training.
|
||||
Said scripts shouldn't be needed for most users unless you're rebuilding data
|
||||
files for the C lib.
|
||||
The [geodata](https://github.com/openvenues/libpostal/tree/master/scripts/geodata) Python package in the libpostal repo contains the pipeline for preprocessing the various geo
|
||||
data sets and building training data for the C models to use.
|
||||
This package shouldn't be needed for most users, but for those interested in generating new types of addresses or improving libpostal's training data, this is where to look.
|
||||
|
||||
Address parser accuracy
|
||||
-----------------------
|
||||
|
||||
On held-out test data (meaning labeled parses that the model has _not_ seen
|
||||
before), the address parser achieves 98.9% full parse accuracy.
|
||||
before), the address parser achieves 99.45% full parse accuracy.
|
||||
|
||||
For some tasks like named entity recognition it's preferable to use something
|
||||
like an F1 score or variants, mostly because there's a class bias problem (most
|
||||
tokens are non-entities, and a system that simply predicted non-entity for
|
||||
words are non-entities, and a system that simply predicted non-entity for
|
||||
every token would actually do fairly well in terms of accuracy). That is not
|
||||
the case for address parsing. Every token has a label and there are millions
|
||||
of examples of each class in the training data, so accuracy is preferable as it's
|
||||
a clean, simple and intuitive measure of performance.
|
||||
|
||||
Here we use full parse accuracy, meaning we only give the parser a "point" in
|
||||
Here we use full parse accuracy, meaning we only give the parser one "point" in
|
||||
the numerator if it gets every single token in the address correct. That should
|
||||
be a better measure than simply looking at whether each token was correct.
|
||||
|
||||
@@ -592,7 +618,7 @@ Improving the address parser
|
||||
|
||||
Though the current parser works quite well for most standard addresses, there
|
||||
is still room for improvement, particularly in making sure the training data
|
||||
we use is as close as possible to addresses in the wild. There are four primary
|
||||
we use is as close as possible to addresses in the wild. There are two primary
|
||||
ways the address parser can be improved even further (in order of difficulty):
|
||||
|
||||
1. Contribute addresses to OSM. Anything with an addr:housenumber tag will be
|
||||
@@ -600,27 +626,19 @@ ways the address parser can be improved even further (in order of difficulty):
|
||||
2. If the address parser isn't working well for a particular country, language
|
||||
or style of address, chances are that some name variations or places are being
|
||||
missed/mislabeled during training data creation. Sometimes the fix is to
|
||||
add more countries at: https://github.com/OpenCageData/address-formatting,
|
||||
update the formats at: https://github.com/OpenCageData/address-formatting,
|
||||
and in many other cases there are relatively simple tweaks we can make
|
||||
when creating the training data that will ensure the model is trained to
|
||||
handle your use case without you having to do any manual data entry.
|
||||
If you see a pattern of obviously bad address parses, the best thing to
|
||||
do is post an issue to GitHub.
|
||||
3. We currently don't have training data for things like apartment/flat numbers.
|
||||
The tags are fairly uncommon in OSM and the address-formatting templates
|
||||
don't use floor, level, apartment/flat number, etc. This would be a slightly
|
||||
more involved effort, but would be worth starting a discussion.
|
||||
4. We use a greedy averaged perceptron for the parser model primarily for its
|
||||
speed and relatively good performance compared to slower, fancier models.
|
||||
Viterbi inference using a linear-chain CRF may improve parser performance
|
||||
on certain classes of input since the score is the argmax over the entire
|
||||
label sequence not just the token. This may slow down training significantly
|
||||
although runtime performance would be relatively unaffected.
|
||||
|
||||
Contributing
|
||||
------------
|
||||
|
||||
Bug reports and pull requests are welcome on GitHub at https://github.com/openvenues/libpostal.
|
||||
Bug reports, issues and pull requests are welcome. Please read the [contributing guide](CONTRIBUTING.md) before submitting your issue, bug report, or pull request.
|
||||
|
||||
Submit issues at: https://github.com/openvenues/libpostal/issues.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
60
configure.ac
60
configure.ac
@@ -1,7 +1,13 @@
|
||||
# -*- Autoconf -*-
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_INIT([libpostal], [0.3])
|
||||
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
||||
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
|
||||
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
||||
|
||||
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
||||
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
|
||||
AM_INIT_AUTOMAKE([foreign subdir-objects])
|
||||
AC_CONFIG_SRCDIR([src])
|
||||
@@ -16,9 +22,6 @@ AC_PROG_INSTALL
|
||||
LDFLAGS="$LDFLAGS -L/usr/local/lib"
|
||||
|
||||
# Checks for libraries.
|
||||
AC_SEARCH_LIBS([snappy_compress],
|
||||
[snappy],,[AC_MSG_ERROR([Could not find snappy])
|
||||
])
|
||||
AC_SEARCH_LIBS([log],
|
||||
[m],,[AC_MSG_ERROR([Could not find math library])])
|
||||
|
||||
@@ -45,19 +48,58 @@ AC_TYPE_UINT8_T
|
||||
AC_CHECK_TYPES([ptrdiff_t])
|
||||
|
||||
# Checks for library functions.
|
||||
AC_FUNC_MALLOC
|
||||
AC_FUNC_MMAP
|
||||
AC_FUNC_REALLOC
|
||||
AC_CHECK_FUNCS([getcwd gettimeofday memmove memset munmap regcomp setlocale sqrt strdup strndup])
|
||||
AC_CHECK_FUNCS([malloc realloc getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
||||
|
||||
AC_CONFIG_FILES([Makefile
|
||||
libpostal.pc
|
||||
src/Makefile
|
||||
src/sparkey/Makefile
|
||||
test/Makefile])
|
||||
|
||||
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
||||
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
||||
|
||||
AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
|
||||
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Checks for SSE2 build
|
||||
# ------------------------------------------------------------------
|
||||
AC_ARG_ENABLE([sse2],
|
||||
AS_HELP_STRING(
|
||||
[--disable-sse2],
|
||||
[disable SSE2 optimization routines]
|
||||
)
|
||||
)
|
||||
|
||||
AS_IF([test "x$enable_sse2" != "xno"], [
|
||||
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
|
||||
])
|
||||
|
||||
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
||||
|
||||
AC_ARG_ENABLE([data-download],
|
||||
[ --disable-data-download Disable downloading data],
|
||||
[case "${enableval}" in
|
||||
yes) DOWNLOAD_DATA=true ;;
|
||||
no) DOWNLOAD_DATA=false ;;
|
||||
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
|
||||
esac], [DOWNLOAD_DATA=true])
|
||||
|
||||
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
|
||||
|
||||
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
|
||||
[
|
||||
if test "x$withval" = "xno"; then
|
||||
CFLAGS_SCANNER_EXTRA=""
|
||||
else
|
||||
CFLAGS_SCANNER_EXTRA="$withval"
|
||||
fi
|
||||
],
|
||||
[ CFLAGS_SCANNER_EXTRA="" ]
|
||||
)
|
||||
|
||||
AC_MSG_NOTICE([extra cflags for scanner.c: $CFLAGS_SCANNER_EXTRA])
|
||||
AC_SUBST(CFLAGS_SCANNER_EXTRA)
|
||||
AC_SUBST(LIBPOSTAL_SO_VERSION, LIBPOSTAL_MAJOR_VERSION:LIBPOSTAL_MINOR_VERSION:LIBPOSTAL_PATCH_VERSION)
|
||||
|
||||
AC_OUTPUT
|
||||
|
||||
172
m4/ax_cblas.m4
Normal file
172
m4/ax_cblas.m4
Normal file
@@ -0,0 +1,172 @@
|
||||
# ===========================================================================
|
||||
# http://autoconf-archive.cryp.to/acx_blas.html
|
||||
# ===========================================================================
|
||||
#
|
||||
# SYNOPSIS
|
||||
#
|
||||
# AX_CBLAS([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
|
||||
#
|
||||
# DESCRIPTION
|
||||
#
|
||||
# This macro looks for a library that implements the CBLAS linear-algebra
|
||||
# interface (see http://www.netlib.org/blas/). On success, it sets the
|
||||
# CBLAS_LIBS output variable to hold the requisite library linkages.
|
||||
#
|
||||
# To link with CBLAS, you should link with:
|
||||
#
|
||||
# $CBLAS_LIBS $LIBS
|
||||
#
|
||||
# in that order.
|
||||
#
|
||||
# Many libraries are searched for, from ATLAS to CXML to ESSL. The user
|
||||
# may also use --with-cblas=<lib> in order to use some specific CBLAS
|
||||
# library <lib>.
|
||||
#
|
||||
# ACTION-IF-FOUND is a list of shell commands to run if a BLAS library is
|
||||
# found, and ACTION-IF-NOT-FOUND is a list of commands to run if it is
|
||||
# not found. If ACTION-IF-FOUND is not specified, the default action will
|
||||
# define HAVE_CBLAS.
|
||||
#
|
||||
# This macro requires autoconf 2.50 or later.
|
||||
#
|
||||
# LAST MODIFICATION
|
||||
#
|
||||
# 2008-12-29
|
||||
#
|
||||
# COPYLEFT
|
||||
#
|
||||
# Copyright (c) 2008 Patrick O. Perry <patperry@stanfordalumni.org>
|
||||
# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it
|
||||
# under the terms of the GNU General Public License as published by the
|
||||
# Free Software Foundation, either version 3 of the License, or (at your
|
||||
# option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||
# Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
# As a special exception, the respective Autoconf Macro's copyright owner
|
||||
# gives unlimited permission to copy, distribute and modify the configure
|
||||
# scripts that are the output of Autoconf when processing the Macro. You
|
||||
# need not follow the terms of the GNU General Public License when using
|
||||
# or distributing such scripts, even though portions of the text of the
|
||||
# Macro appear in them. The GNU General Public License (GPL) does govern
|
||||
# all other use of the material that constitutes the Autoconf Macro.
|
||||
#
|
||||
# This special exception to the GPL applies to versions of the Autoconf
|
||||
# Macro released by the Autoconf Macro Archive. When you make and
|
||||
# distribute a modified version of the Autoconf Macro, you may extend this
|
||||
# special exception to the GPL to apply to your modified version as well.
|
||||
|
||||
AC_DEFUN([AX_CBLAS], [
|
||||
AC_PREREQ(2.50)
|
||||
ax_cblas_ok=no
|
||||
|
||||
AC_ARG_WITH(cblas,
|
||||
[AC_HELP_STRING([--with-cblas=<lib>], [use CBLAS library <lib>])])
|
||||
case $with_cblas in
|
||||
yes | "") ;;
|
||||
no) ax_cblas_ok=disable ;;
|
||||
-* | */* | *.a | *.so | *.so.* | *.o) CBLAS_LIBS="$with_cblas" ;;
|
||||
*) CBLAS_LIBS="-l$with_cblas" ;;
|
||||
esac
|
||||
|
||||
ax_cblas_save_LIBS="$LIBS"
|
||||
|
||||
# First, check CBLAS_LIBS environment variable
|
||||
if test $ax_cblas_ok = no; then
|
||||
if test "x$CBLAS_LIBS" != x; then
|
||||
save_LIBS="$LIBS"; LIBS="$CBLAS_LIBS $LIBS"
|
||||
AC_MSG_CHECKING([for cblas_dgemm in $CBLAS_LIBS])
|
||||
AC_TRY_LINK_FUNC(cblas_dgemm, [ax_cblas_ok=yes], [CBLAS_LIBS=""])
|
||||
AC_MSG_RESULT($ax_cblas_ok)
|
||||
LIBS="$save_LIBS"
|
||||
fi
|
||||
fi
|
||||
|
||||
# CBLAS linked to by default? (happens on some supercomputers)
|
||||
if test $ax_cblas_ok = no; then
|
||||
save_LIBS="$LIBS"; LIBS="$LIBS"
|
||||
AC_CHECK_FUNC(cblas_dgemm, [ax_cblas_ok=yes])
|
||||
LIBS="$save_LIBS"
|
||||
fi
|
||||
|
||||
# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/)
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(atlas, ATL_xerbla,
|
||||
[AC_CHECK_LIB(cblas, cblas_dgemm,
|
||||
[ax_cblas_ok=yes
|
||||
CBLAS_LIBS="-lcblas -latlas"],
|
||||
[], [-latlas])])
|
||||
fi
|
||||
|
||||
# BLAS in Intel MKL library?
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(mkl, cblas_dgemm, [ax_cblas_ok=yes;CBLAS_LIBS="-lmkl"])
|
||||
fi
|
||||
|
||||
# BLAS in Apple vecLib library?
|
||||
if test $ax_cblas_ok = no; then
|
||||
save_LIBS="$LIBS"; LIBS="-framework vecLib $LIBS"
|
||||
AC_CHECK_FUNC(cblas_dgemm, [ax_cblas_ok=yes;CBLAS_LIBS="-framework vecLib"])
|
||||
LIBS="$save_LIBS"
|
||||
fi
|
||||
|
||||
# BLAS in Alpha DXML library? (now called CXML, see above)
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(dxml, cblas_dgemm, [ax_cblas_ok=yes;CBLAS_LIBS="-ldxml"])
|
||||
fi
|
||||
|
||||
# BLAS in Sun Performance library?
|
||||
if test $ax_cblas_ok = no; then
|
||||
if test "x$GCC" != xyes; then # only works with Sun CC
|
||||
AC_CHECK_LIB(sunmath, acosp,
|
||||
[AC_CHECK_LIB(sunperf, cblas_dgemm,
|
||||
[CBLAS_LIBS="-xlic_lib=sunperf -lsunmath"
|
||||
ax_cblas_ok=yes],[],[-lsunmath])])
|
||||
fi
|
||||
fi
|
||||
|
||||
# BLAS in SCSL library? (SGI/Cray Scientific Library)
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(scs, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lscs"])
|
||||
fi
|
||||
|
||||
# BLAS in SGIMATH library?
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(complib.sgimath, cblas_dgemm,
|
||||
[ax_cblas_ok=yes; CBLAS_LIBS="-lcomplib.sgimath"])
|
||||
fi
|
||||
|
||||
# BLAS in IBM ESSL library? (requires generic BLAS lib, too)
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(blas, cblas_dgemm,
|
||||
[AC_CHECK_LIB(essl, cblas_dgemm,
|
||||
[ax_cblas_ok=yes; CBLAS_LIBS="-lessl -lblas"],
|
||||
[], [-lblas])])
|
||||
fi
|
||||
|
||||
# Generic CBLAS library?
|
||||
if test $ax_cblas_ok = no; then
|
||||
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
|
||||
fi
|
||||
|
||||
AC_SUBST(CBLAS_LIBS)
|
||||
|
||||
LIBS="$ax_cblas_save_LIBS"
|
||||
|
||||
# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
|
||||
if test x"$ax_cblas_ok" = xyes; then
|
||||
ifelse([$1],,AC_DEFINE(HAVE_CBLAS,1,[Define if you have a CBLAS library.]),[$1])
|
||||
:
|
||||
else
|
||||
ax_cblas_ok=no
|
||||
$2
|
||||
fi
|
||||
])dnl AX_CBLAS
|
||||
1001
resources/addresses/bg.yaml
Normal file
1001
resources/addresses/bg.yaml
Normal file
File diff suppressed because it is too large
Load Diff
585
resources/addresses/bs.yaml
Normal file
585
resources/addresses/bs.yaml
Normal file
@@ -0,0 +1,585 @@
|
||||
# bs.yaml
|
||||
# -------
|
||||
# Bosnian language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.7
|
||||
alphanumeric_probability: 0.3
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- staircase
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
# For unit types like 2/34
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
|
||||
|
||||
numbers:
|
||||
no_number:
|
||||
default:
|
||||
canonical: bez broja
|
||||
abbreviated: bb
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
|
||||
default: &broj
|
||||
canonical: broj
|
||||
abbreviated: br
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "br."
|
||||
whitespace_probability: 0.6
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
alphanumeric_phrase_probability: 0.05
|
||||
no_number_probability: 0.05
|
||||
|
||||
|
||||
and:
|
||||
default: &i
|
||||
canonical: i
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
cross_streets:
|
||||
i: *i
|
||||
at: &na
|
||||
canonical: na
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner: &ugao
|
||||
canonical: ugao
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner_of: &uglu
|
||||
canonical: uglu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
na_uglu: &na_uglu
|
||||
canonical: na uglu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *i
|
||||
probability: 0.65
|
||||
alternatives:
|
||||
- alternative: *na
|
||||
probability: 0.1
|
||||
- alternative: *uglu
|
||||
probability: 0.1
|
||||
- alternative: *na_uglu
|
||||
probability: 0.1
|
||||
- alternative: *ugao
|
||||
probability: 0.05
|
||||
|
||||
izmedu: &izmedu
|
||||
canonical: između
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
between:
|
||||
default: *izmedu
|
||||
|
||||
levels:
|
||||
sprat: &sprat
|
||||
canonical: sprat
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
kat: &kat
|
||||
canonical: kat
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
prizemlje: &prizemlje
|
||||
canonical: prizemlje
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
mezanino: &mezanin
|
||||
canonical: mezanin
|
||||
half_floors: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
sample: true
|
||||
# e.g. mezanin 2
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. 2. mezanin
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.1
|
||||
ordinal_probability: 0.2
|
||||
standalone_probability: 0.7 # numeric (0.1) + ordinal (0.2) + standalone must total 1.0
|
||||
podrum: &podrum
|
||||
canonical: podrum
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
# e.g. podrum 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. 1. podrum
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *podrum
|
||||
"-1":
|
||||
default: *podrum
|
||||
# Special token for half-floors
|
||||
half_floors:
|
||||
default: *mezanin
|
||||
"0":
|
||||
default: *prizemlje
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *parter
|
||||
probability: 0.4
|
||||
- alternative: *kat
|
||||
probability: 0.05
|
||||
- alternative: *sprat
|
||||
probability: 0.05
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *kat
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *sprat
|
||||
probability: 0.5
|
||||
numeric_probability: 0.69 # With this probability, pick an integer
|
||||
roman_numeral_probability: 0.3 # Pick a Roman numeral for the actual value
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: u blizini
|
||||
nearby:
|
||||
default:
|
||||
canonical: u blizini
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: u blizini ovdje
|
||||
probability: 0.3
|
||||
- alternative:
|
||||
canonical: ovdje
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: u blizini mene
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: u
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &desno
|
||||
canonical: desno
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &lijevo
|
||||
canonical: lijevo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *desno
|
||||
probability: 0.5
|
||||
- alternative: *lijevo
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &istok
|
||||
canonical: istok
|
||||
abbreviated: i
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: i
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zapad
|
||||
canonical: zapad
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &sjever
|
||||
canonical: sjever
|
||||
abbreviated: s
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &jug
|
||||
canonical: jug
|
||||
abbreviated: j
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: j
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives: # equal weights; the four probabilities below total 1.0
|
||||
- alternative: *sjever
|
||||
probability: 0.25
|
||||
- alternative: *istok
|
||||
probability: 0.25
|
||||
- alternative: *jug
|
||||
probability: 0.25
|
||||
- alternative: *zapad
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
ulaz: &ulaz
|
||||
canonical: ulaz
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Ulaz 1, Ulaz A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *ulaz
|
||||
numeric_probability: 0.1 # e.g. Ulaz 1
|
||||
alpha_probability: 0.85 # e.g. Ulaz A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
|
||||
staircases:
|
||||
stubiste: &stubiste
|
||||
canonical: stubište
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *stubiste
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *desno
|
||||
probability: 0.2
|
||||
- alternative: *lijevo
|
||||
probability: 0.2
|
||||
- alternative: *sjever
|
||||
probability: 0.15
|
||||
- alternative: *jug
|
||||
probability: 0.15
|
||||
- alternative: *istok
|
||||
probability: 0.15
|
||||
- alternative: *zapad
|
||||
probability: 0.15
|
||||
|
||||
po_boxes:
|
||||
postanski_pretinac: &postanski_pretinac
|
||||
canonical: poštanski pretinac
|
||||
abbreviated: p.p
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *postanski_pretinac
|
||||
numeric_probability: 0.9 # pp 123
|
||||
alpha_probability: 0.05 # p.p A
|
||||
numeric_plus_alpha_probability: 0.04 # pp 123G
|
||||
alpha_plus_numeric_probability: 0.01 # pp A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
stan: &stan
|
||||
canonical: stan
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
apartman: &apartman
|
||||
canonical: apartman
|
||||
abbreviated: ap
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
soba: &soba
|
||||
canonical: soba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ured: &ured
|
||||
canonical: ured
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *stan
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *apartman
|
||||
probability: 0.3
|
||||
- alternative: *soba
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. stan. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. stan A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *soba
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *ured
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *soba
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
856
resources/addresses/ca.yaml
Normal file
856
resources/addresses/ca.yaml
Normal file
@@ -0,0 +1,856 @@
|
||||
# ca.yaml
|
||||
# -------
|
||||
# Catalan language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
# If no floor number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.35
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.3
|
||||
alphanumeric_probability: 0.65
|
||||
standalone_probability: 0.05
|
||||
|
||||
numbers:
|
||||
default: &numero
|
||||
canonical: número
|
||||
abbreviated: "nº"
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#" # e.g. #3, #2F, etc.
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
direction: left # affix goes on the number's left
|
||||
|
||||
# Probabilities for numbers
|
||||
numeric_probability: 0.7
|
||||
numeric_affix_probability: 0.3
|
||||
|
||||
and:
|
||||
default: &i
|
||||
canonical: i
|
||||
abbreviated: "&"
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.1
|
||||
|
||||
house_numbers:
|
||||
# sense número (s/n) addresses
|
||||
no_number:
|
||||
default:
|
||||
canonical: sense número
|
||||
abbreviated: s/n
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
alphanumeric:
|
||||
default: *numero
|
||||
|
||||
alphanumeric_phrase_probability: 0.01
|
||||
no_number_probability: 0.1 # With this probability, use sense número if no house_number is specified
|
||||
|
||||
|
||||
|
||||
levels:
|
||||
# Everywhere except Spain
|
||||
floor: &pis
|
||||
canonical: pis
|
||||
abbreviated: p
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true # Occasionally add variation of "number", e.g. Pis No 2
|
||||
add_number_phrase_probability: 0.05
|
||||
numeric_affix:
|
||||
affix: p
|
||||
direction: left # P2
|
||||
# e.g. 2n pis
|
||||
ordinal:
|
||||
direction: right
|
||||
direction_probability: 0.95 # Let it vary occasionally e.g. Pis 2o
|
||||
standalone_probability: 0.2 # Let e.g. 5º be the entire floor string
|
||||
# If ordinal is selected, chance of e.g. just using 2n without "pis"
|
||||
null_phrase_probability: 0.6
|
||||
numeric_probability: 0.2
|
||||
numeric_affix_probability: 0.05
|
||||
ordinal_probability: 0.75
|
||||
# Ground floor
|
||||
baixos: &baixos
|
||||
canonical: baixos
|
||||
abbreviated: bxs
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.1
|
||||
pis_baix: &pis_baix
|
||||
canonical: pis baix
|
||||
abbreviated: pb
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
sota: &sota
|
||||
canonical: sota
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# Used when floor number is < 0 (starts at -1 in all countries)
|
||||
soterrani: &soterrani
|
||||
canonical: soterrani
|
||||
abbreviated: so
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
# e.g. soterrani 1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: so
|
||||
direction: left
|
||||
# e.g. segon soterrani
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
sub_soterrani: &sub_soterrani
|
||||
canonical: sub soterrani
|
||||
abbreviated: ss
|
||||
sample: true
|
||||
# e.g. sub soterrani 1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: ss
|
||||
direction: left
|
||||
# e.g. segon sub soterrani
|
||||
ordinal:
|
||||
direction: right
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 2
|
||||
# Soterrani 2 == Sub-soterrani 1
|
||||
number_subtract_abs_value: 1
|
||||
standalone_probability: 0.985
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
entresol: &entresol
|
||||
canonical: entresòl
|
||||
abbreviated: entl
|
||||
half_floors: true
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
# e.g. entresòl 2
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. ent2
|
||||
numeric_affix:
|
||||
affix: ent
|
||||
direction: left
|
||||
# e.g. segon entresòl
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.1
|
||||
numeric_affix_probability: 0.1
|
||||
ordinal_probability: 0.2
|
||||
standalone_probability: 0.6
|
||||
pis_principal: &pis_principal
|
||||
canonical: pis principal
|
||||
abbreviated: pis pral
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.5
|
||||
principal: &principal
|
||||
canonical: principal
|
||||
abbreviated: pral
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
atic: &atic
|
||||
canonical: àtic
|
||||
abbreviated: àt
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.2
|
||||
sobreatic: &sobreatic
|
||||
canonical: sobreàtic
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *soterrani
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *sub_soterrani
|
||||
probability: 0.3995
|
||||
- alternative: *pis
|
||||
probability: 0.0005
|
||||
"-1":
|
||||
default: *soterrani
|
||||
probability: 0.9995
|
||||
alternatives:
|
||||
- alternative: *pis
|
||||
probability: 0.0005
|
||||
# Special token for half-floors
|
||||
half_floors:
|
||||
default: *entresol
|
||||
"0":
|
||||
default: *baixos
|
||||
probability: 0.495
|
||||
alternatives:
|
||||
- alternative: *pis_baix
|
||||
probability: 0.395
|
||||
- alternative: *sota
|
||||
probability: 0.1
|
||||
- alternative: *pis
|
||||
# "Pis 0" is uncommon
|
||||
probability: 0.01
|
||||
top:
|
||||
default: *pis
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *atic
|
||||
probability: 0.1
|
||||
- alternative: *sobreatic
|
||||
probability: 0.05
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *pis
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
numeric_probability: 0.99
|
||||
alpha_probability: 0.01
|
||||
|
||||
blocks:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: bloc
|
||||
abbreviated: bl
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: a prop de
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: prop de
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: prop
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: a prop
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: proper
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: proper a
|
||||
probability: 0.05
|
||||
|
||||
nearby:
|
||||
default:
|
||||
canonical: proper
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: a prop
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: a prop d'aquí
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: a prop d'aqui
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: aquí
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: aqui
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: a prop meu
|
||||
in:
|
||||
default:
|
||||
canonical: a
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: dins
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: en
|
||||
probability: 0.2
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
cross_streets:
|
||||
and: *i
|
||||
amb: &amb
|
||||
canonical: amb
|
||||
a: &a
|
||||
canonical: a
|
||||
corner_of: &cantonada_de
|
||||
canonical: cantonada de
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
at_the_corner_of: &a_la_cantonada_de
|
||||
canonical: a la cantonada de
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
corner: &cantonada
|
||||
canonical: cantonada
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
|
||||
intersection:
|
||||
default: *i
|
||||
probability: 0.55
|
||||
alternatives:
|
||||
- alternative: *amb
|
||||
probability: 0.2
|
||||
- alternative: *a
|
||||
probability: 0.1
|
||||
- alternative: *cantonada_de
|
||||
probability: 0.09
|
||||
- alternative: *a_la_cantonada_de
|
||||
probability: 0.05
|
||||
- alternative: *cantonada
|
||||
probability: 0.01
|
||||
|
||||
between:
|
||||
canonical: entre
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
|
||||
po_boxes:
|
||||
apartat: &apartat
|
||||
canonical: apartat
|
||||
abbreviated: apt
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.4 # Apt No 1234
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *apartat
|
||||
numeric_probability: 0.9 # Apt 123
|
||||
alpha_probability: 0.05 # Apt A
|
||||
numeric_plus_alpha_probability: 0.04 # Apt 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Apt A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: codi postal
|
||||
abbreviated: cp
|
||||
sample: true
|
||||
canonical_probability: 0.01
|
||||
abbreviated_probability: 0.95
|
||||
sample_probability: 0.04
|
||||
|
||||
numeric:
|
||||
# Postcodes in Spain and Latin America are sometimes prefixed by CP
|
||||
direction: left
|
||||
|
||||
numeric_affix:
|
||||
affix: cp
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.7
|
||||
numeric_probability: 0.18
|
||||
numeric_affix_probability: 0.12
|
||||
strict_numeric: true
|
||||
|
||||
directions:
|
||||
right: &dreta
|
||||
canonical: dreta
|
||||
abbreviated: dta
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
left: &esquerra
|
||||
canonical: esquerra
|
||||
abbreviated: esq
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
rear: &posterior
|
||||
canonical: posterior
|
||||
abbreviated: pos
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
front: &front
|
||||
canonical: front
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *dreta
|
||||
probability: 0.45
|
||||
- alternative: *esquerra
|
||||
probability: 0.45
|
||||
- alternative: *posterior
|
||||
probability: 0.05
|
||||
- alternative: *front
|
||||
probability: 0.05
|
||||
|
||||
anteroposterior:
|
||||
alternatives:
|
||||
- alternative: *front
|
||||
probability: 0.5
|
||||
- alternative: *posterior
|
||||
probability: 0.5
|
||||
|
||||
lateral:
|
||||
alternatives:
|
||||
- alternative: *dreta
|
||||
probability: 0.5
|
||||
- alternative: *esquerra
|
||||
probability: 0.5
|
||||
|
||||
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &est
|
||||
canonical: est
|
||||
abbreviated: e
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &oest
|
||||
canonical: oest
|
||||
abbreviated: w
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: w
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &sud
|
||||
canonical: sud
|
||||
abbreviated: s
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *est
|
||||
probability: 0.25
|
||||
- alternative: *sud
|
||||
probability: 0.25
|
||||
- alternative: *oest
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
entrada: &entrada
|
||||
canonical: entrada
|
||||
abbreviated: entr
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Entrance 1, Entrance A, etc.
|
||||
alphanumeric:
|
||||
default: *entrada
|
||||
numeric_probability: 0.1 # e.g. Entrance 1
|
||||
alpha_probability: 0.85 # e.g. Entrance A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *oest
|
||||
- alternative: *dreta
|
||||
- alternative: *esquerra
|
||||
- alternative: *posterior
|
||||
- alternative: *front
|
||||
|
||||
staircases:
|
||||
escala: &escala
|
||||
canonical: escala
|
||||
abbreviated: esc
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, Stair A, Stair 1, etc.
|
||||
default: *escala
|
||||
numeric_probability: 0.6 # e.g. Escala 1
|
||||
alpha_probability: 0.35 # e.g. Escala A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right # e.g. Escalera Izq
|
||||
direction_probability: 0.8
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *oest
|
||||
- alternative: *dreta
|
||||
- alternative: *esquerra
|
||||
- alternative: *posterior
|
||||
- alternative: *front
|
||||
|
||||
units:
|
||||
flat: &apartament
|
||||
canonical: apartament
|
||||
abbreviated: apmt
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
door: &porta
|
||||
canonical: porta
|
||||
abbreviated: pta
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
# If it's just porta B, it is often written simply as e.g. 3r B for "tercer pis porta B"
|
||||
null_phrase_probability: 0.15
|
||||
ordinal:
|
||||
direction: right
|
||||
gender: f
|
||||
direction_probability: 0.95 # Let it vary occasionally e.g. Porta 2a
|
||||
null_phrase_probability: 0.8 # Let e.g. 5a be the entire unit string
|
||||
numeric_probability: 0.25
|
||||
ordinal_probability: 0.75
|
||||
lletra: &lletra
|
||||
canonical: lletra
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
office: &oficina
|
||||
canonical: oficina
|
||||
abbreviated: of
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
# Another word for unit, used more in Colombia
|
||||
unitat: &unitat
|
||||
canonical: unitat
|
||||
abbreviated: un
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
lot: &lot
|
||||
canonical: lot
|
||||
abbreviated: lt
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
parcel: &parcella
|
||||
canonical: parcel·la
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
habitacio: &habitacio
|
||||
canonical: habitació
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
casa: &casa
|
||||
canonical: casa
|
||||
numeric:
|
||||
direction: left
|
||||
room: &sala
|
||||
canonical: sala
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *porta
|
||||
probability: 0.8
|
||||
sample: true
|
||||
alternatives:
|
||||
- alternative: *apartament
|
||||
probability: 0.1
|
||||
- alternative: *casa
|
||||
probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2o Izq, 2 Dcha, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
add_direction_numeric: true # Only for numbers
|
||||
add_direction_standalone: true # A unit can be as simple as "D"
|
||||
|
||||
numeric_probability: 0.7 # e.g. Porta 1a
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Porta 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Porta A1
|
||||
alpha_probability: 0.28 # e.g. Porta A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
alpha:
|
||||
default: *porta
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *lletra
|
||||
probability: 0.12
|
||||
- alternative: *apartament
|
||||
probability: 0.05
|
||||
- alternative: *casa
|
||||
probability: 0.01
|
||||
- alternative: *unitat
|
||||
probability: 0.01
|
||||
- alternative: *habitacio
|
||||
probability: 0.01
|
||||
|
||||
zones:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *oficina
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *sala
|
||||
probability: 0.2
|
||||
|
||||
numeric_probability: 0.9 # e.g. Oficina 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Oficina 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Oficina A1
|
||||
alpha_probability: 0.08 # e.g. Oficina A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
alpha:
|
||||
default: *oficina
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *sala
|
||||
probability: 0.15
|
||||
- alternative: *lletra
|
||||
probability: 0.05
|
||||
|
||||
industrial:
|
||||
default: *lot
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *oficina
|
||||
probability: 0.3
|
||||
- alternative: *unitat
|
||||
probability: 0.19
|
||||
- alternative: *parcella
|
||||
probability: 0.01
|
||||
|
||||
numeric_probability: 0.9 # e.g. Lot 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Lot 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Lot A1
|
||||
alpha_probability: 0.08 # e.g. Lot A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
university:
|
||||
default: *sala
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *porta
|
||||
probability: 0.1
|
||||
|
||||
numeric_probability: 0.9 # e.g. Sala 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Sala 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Sala A1
|
||||
alpha_probability: 0.08 # e.g. Sala A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
alpha:
|
||||
default: *sala
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *porta
|
||||
probability: 0.08
|
||||
- alternative: *lletra
|
||||
probability: 0.02
|
||||
|
||||
allotments:
|
||||
lot:
|
||||
default: *lot
|
||||
numeric_probability: 0.8
|
||||
alphanumeric_probability: 0.1
|
||||
alpha_probability: 0.1
|
||||
parcel:
|
||||
default: *parcella
|
||||
numeric_probability: 0.3
|
||||
alphanumeric_probability: 0.3
|
||||
alpha_probability: 0.4
|
||||
lot_probability: 0.9
|
||||
parcel_probability: 0.06
|
||||
lot_plus_parcel_probability: 0.02
|
||||
parcel_plus_lot_probability: 0.02
|
||||
570
resources/addresses/cs.yaml
Normal file
570
resources/addresses/cs.yaml
Normal file
@@ -0,0 +1,570 @@
|
||||
# cs.yaml
|
||||
# -------
|
||||
# Czech language specification
|
||||
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.04
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
# Note: no combinations because of the house numbering scheme
|
||||
|
||||
numbers:
|
||||
default: &cislo
|
||||
canonical: číslo
|
||||
abbreviated: č
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "č."
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
and:
|
||||
default: &a
|
||||
canonical: a
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
conscription_numbers:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: číslo popisné
|
||||
abbreviated: "č.p."
|
||||
canonical_probability: 0.05
|
||||
abbreviated_probability: 0.85
|
||||
sample: true
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
cross_streets:
|
||||
and: *a
|
||||
at: &na
|
||||
canonical: na
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner_of: &rohu
|
||||
canonical: rohu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner: &roh
|
||||
canonical: roh
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &na_rohu
|
||||
canonical: na rohu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *a
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *na
|
||||
probability: 0.1
|
||||
- alternative: *rohu
|
||||
probability: 0.1
|
||||
- alternative: *roh
|
||||
probability: 0.1
|
||||
- alternative: *na_rohu
|
||||
probability: 0.1
|
||||
|
||||
between:
|
||||
canonical: mezi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &patro
|
||||
canonical: patro
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
nadzemni_podlazi: &nadzemni_podlazi
|
||||
canonical: nadzemní podlaží
|
||||
abbreviated: np
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.8
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
etaz: &etaz
|
||||
canonical: etáž
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
prizemi: &prizemi
|
||||
canonical: přízemí
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
podzemni_podlazi: &podzemni_podlazi
|
||||
canonical: podzemní podlaží
|
||||
abbreviated: pp
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.3
|
||||
# e.g. podzemní podlaží 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. pp1
|
||||
numeric_affix:
|
||||
affix: pp
|
||||
direction: left
|
||||
# e.g. 1. podzemní podlaží
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *podzemni_podlazi
|
||||
"-1":
|
||||
default: *podzemni_podlazi
|
||||
"0":
|
||||
default: *prizemi
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *patro
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *patro
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *nadzemni_podlazi
|
||||
probability: 0.19
|
||||
- alternative: *etaz
|
||||
probability: 0.01
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: poblíž
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: v blízkém okolí
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: u
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: kolem
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
probability: 0.05
|
||||
nearby:
|
||||
default:
|
||||
canonical: poblíž
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.45
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: blízko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: v blízkosti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tady poblíž
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tady
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: okolo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: v okolí
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
near_me:
|
||||
default:
|
||||
canonical: v blízkosti mně
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: v
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: ve
|
||||
probability: 0.3
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &prava
|
||||
canonical: pravá
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &leva
|
||||
canonical: levá
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *prava
|
||||
probability: 0.5
|
||||
- alternative: *leva
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &vychod
|
||||
canonical: východ
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zapad
|
||||
canonical: západ
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &sever
|
||||
canonical: sever
|
||||
abbreviated: s
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &jih
|
||||
canonical: jih
|
||||
abbreviated: j
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: j
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *sever
|
||||
probability: 0.25
|
||||
- alternative: *vychod
|
||||
probability: 0.25
|
||||
- alternative: *jih
|
||||
probability: 0.25
|
||||
- alternative: *zapad
|
||||
probability: 0.25
|
||||
entrances:
|
||||
vchod: &vchod
|
||||
canonical: vchod
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Vchod 1, Vchod A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *vchod
|
||||
numeric_probability: 0.1 # e.g. Vchod 1
|
||||
alpha_probability: 0.85 # e.g. Vchod A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
schodiste: &schodiste
|
||||
canonical: schodiště
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *schodiste
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *sever
|
||||
- alternative: *jih
|
||||
- alternative: *vychod
|
||||
- alternative: *zapad
|
||||
|
||||
po_boxes:
|
||||
postovni_prihradka: &postovni_prihradka
|
||||
canonical: poštovní přihrádka
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # poštovní přihrádka 1234
|
||||
alphanumeric:
|
||||
default: *postovni_prihradka
|
||||
numeric_probability: 0.9 # poštovní přihrádka 123
|
||||
alpha_probability: 0.05 # poštovní přihrádka A
|
||||
numeric_plus_alpha_probability: 0.04 # poštovní přihrádka 123G
|
||||
alpha_plus_numeric_probability: 0.01 # poštovní přihrádka A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
apartaman: &apartaman
|
||||
canonical: apartmán
|
||||
abbreviated: apt
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
pokoj: &pokoj
|
||||
canonical: pokoj
|
||||
abbreviated: pok
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
kancelar: &kancelar
|
||||
canonical: kancelář
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *apartaman
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *pokoj
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. apt. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. apt. A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.01
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *pokoj
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *kancelar
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. pokoj 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. pokoj 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. pokoj A1
|
||||
alpha_probability: 0.03 # e.g. pokoj A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *pokoj
|
||||
numeric_probability: 0.95 # e.g. pokoj 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. pok 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. pokoj A1
|
||||
alpha_probability: 0.03 # e.g. pokoj A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
599
resources/addresses/da.yaml
Normal file
599
resources/addresses/da.yaml
Normal file
@@ -0,0 +1,599 @@
|
||||
# da.yaml
|
||||
# -------
|
||||
# Danish language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85
|
||||
alphanumeric_probability: 0.1
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- level
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.9
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- entrance
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.9
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.001
|
||||
|
||||
|
||||
numbers:
|
||||
default: &nummer
|
||||
canonical: nummer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *nummer
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
and:
|
||||
default: &og
|
||||
canonical: og
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *og
|
||||
corner_of: &hjorne_af
|
||||
canonical: hjørne af
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &pa_hjornet_af
|
||||
canonical: på hjørnet af
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *og
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *hjorne_af
|
||||
probability: 0.15
|
||||
- alternative: *pa_hjornet_af
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: mellem
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &sal
|
||||
canonical: sal
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
etage: &etage
|
||||
canonical: etage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
stuen: &stuen
|
||||
canonical: stuen
|
||||
abbreviated: st
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
stueetage: &stueetage
|
||||
canonical: stueetage
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
sample_probability: 0.7
|
||||
kaelderen: &kaelderen
|
||||
canonical: kælderen
|
||||
abbreviated: kl
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
# e.g. 1 kælderen
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.8
|
||||
# e.g. k1
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: left
|
||||
# e.g. 1. kl
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kaelderen
|
||||
"-1":
|
||||
default: *kaelderen
|
||||
"0":
|
||||
default: *stuen
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *stueetage
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *sal
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *etage
|
||||
probability: 0.3
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: i nærheden af
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: tæt på
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: tæt ved
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
nearby:
|
||||
default:
|
||||
canonical: i nærheden
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.4
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: rundt her
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: nær her
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nær
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: omkring her
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tæt på her
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: nær mig
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: i nærheden af mig
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tæt på mig
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: i
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: om
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: på
|
||||
probability: 0.1
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
directions:
|
||||
right: &til_hojre
|
||||
canonical: til højre
|
||||
abbreviated: t.h
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: t.h
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &til_venstre
|
||||
canonical: til venstre
|
||||
abbreviated: t.v
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: t.v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
middle: &midt_for
|
||||
canonical: midt for
|
||||
abbreviated: m.f
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: m.f
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
alternatives:
|
||||
- alternative: *til_hojre
|
||||
probability: 0.45
|
||||
- alternative: *til_venstre
|
||||
probability: 0.45
|
||||
- alternative: *midt_for
|
||||
probability: 0.1
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &ost
|
||||
canonical: øst
|
||||
abbreviated: ø
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: ø
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &vest
|
||||
canonical: vest
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &syd
|
||||
canonical: syd
|
||||
abbreviated: s
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *ost
|
||||
probability: 0.25
|
||||
- alternative: *syd
|
||||
probability: 0.25
|
||||
- alternative: *vest
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
indgang: &indgang
|
||||
canonical: indgang
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Indgang 1, Indgang A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *indgang
|
||||
numeric_probability: 0.1 # e.g. Indgang 1
|
||||
alpha_probability: 0.85 # e.g. Indgang A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
stiege: &stiege
|
||||
canonical: stiege
|
||||
abbreviated: stg
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
trappe: &trappe
|
||||
canonical: trappe
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *trappe
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *stiege
|
||||
probability: 0.2
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *syd
|
||||
- alternative: *ost
|
||||
- alternative: *vest
|
||||
|
||||
po_boxes:
|
||||
postboks: &postboks
|
||||
canonical: postboks
|
||||
abbreviated: pb
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Pb No 1234
|
||||
boks: &boks
|
||||
canonical: boks
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Boks No 1234
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postboks
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *boks
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # Pb 123
|
||||
alpha_probability: 0.05 # Pb A
|
||||
numeric_plus_alpha_probability: 0.04 # Pb 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Pb A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
lejlighed: &lejlighed
|
||||
canonical: lejlighed
|
||||
abbreviated: ljd
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.5
|
||||
# Lejlighed nummer 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
hus: &hus
|
||||
canonical: hus
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
vaerelse: &vaerelse
|
||||
canonical: værelse
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *lejlighed
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *hus
|
||||
probability: 0.1
|
||||
- alternative: *vaerelse
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. Lejlighed 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. Lejl A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2R, 2L, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.5
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Lejlighed Højre
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
715
resources/addresses/de.yaml
Normal file
715
resources/addresses/de.yaml
Normal file
@@ -0,0 +1,715 @@
|
||||
# de.yaml
|
||||
# -------
|
||||
# Note: this will only apply to the German language code, which encompasses Germany,
|
||||
# Austria, Switzerland (but not Swiss-German, which has its own language code),
|
||||
# Liechtenstein, Luxembourg (Luxembourgish has its own language code), and part of Belgium.
|
||||
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85
|
||||
alphanumeric_probability: 0.1
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
combinations:
|
||||
# e.g. 2/34, more common way to specify a unit number in German
|
||||
# if unit exists in the first place
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.8
|
||||
- separator: "-"
|
||||
probability: 0.1
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.05
|
||||
|
||||
|
||||
numbers:
|
||||
default: &nummer
|
||||
canonical: nummer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *nummer
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
conscription_numbers:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: konskriptionsnummer
|
||||
abbreviated: konskr. nr
|
||||
canonical_probability: 0.15
|
||||
abbreviated_probability: 0.65
|
||||
sample: true
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
and:
|
||||
default: &und
|
||||
canonical: und
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *und
|
||||
corner_of: &ecke_von
|
||||
canonical: ecke von
|
||||
at_the_corner_of: &an_der_ecke_von
|
||||
canonical: an der ecke von
|
||||
intersection:
|
||||
default: *und
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *ecke_von
|
||||
probability: 0.15
|
||||
- alternative: *an_der_ecke_von
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: zwischen
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &obergeschoss
|
||||
canonical: obergeschoss
|
||||
abbreviated: og
|
||||
sample: true
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: og
|
||||
direction: right
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.3
|
||||
numeric_affix_probability: 0.5
|
||||
ordinal_probability: 0.2
|
||||
etage: &etage
|
||||
canonical: etage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
stock: &stock
|
||||
canonical: stock
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.1
|
||||
ordinal_probability: 0.9
|
||||
erdgeschoss: &erdgeschoss
|
||||
canonical: erdgeschoss
|
||||
abbreviated: eg
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
untergeschoss: &untergeschoss
|
||||
canonical: untergeschoss
|
||||
abbreviated: ug
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
# e.g. Basement 1
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. 1ug
|
||||
numeric_affix:
|
||||
affix: ug
|
||||
direction: left
|
||||
# e.g. 1. UG
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
unterste_etage: &unterste_etage
|
||||
canonical: unterste etage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
oberste_etage: &oberste_etage
|
||||
canonical: oberste etage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *untergeschoss
|
||||
"-1":
|
||||
default: *untergeschoss
|
||||
"0":
|
||||
default: *erdgeschoss
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *unterste_etage
|
||||
probability: 0.1
|
||||
"top":
|
||||
default: *obergeschoss
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *stock
|
||||
probability: 0.1
|
||||
- alternative: *etage
|
||||
probability: 0.05
|
||||
- alternative: *oberste_etage
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *obergeschoss
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *stock
|
||||
probability: 0.1
|
||||
- alternative: *etage
|
||||
probability: 0.05
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: nähe
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: bei
|
||||
probability: 0.3
|
||||
- alternative:
|
||||
canonical: nah
|
||||
probability: 0.15
|
||||
- alternative:
|
||||
canonical: nahe an
|
||||
probability: 0.05
|
||||
nearby:
|
||||
default:
|
||||
canonical: hier in der nähe
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.4
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: in der nähe
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.3
|
||||
- alternative:
|
||||
canonical: in der nähe hier
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: in der nähe von
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nahe gelegen
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: hier in der gegend
|
||||
probability: 0.05
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: in meiner nähe
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: in der nähe zu mir
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: in
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: im
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: um
|
||||
probability: 0.2
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
directions:
|
||||
right: &rechts
|
||||
canonical: rechts
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: r
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &links
|
||||
canonical: links
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: l
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *rechts
|
||||
probability: 0.5
|
||||
- alternative: *links
|
||||
probability: 0.5
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &ost
|
||||
canonical: ost
|
||||
abbreviated: o
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &west
|
||||
canonical: west
|
||||
abbreviated: w
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: w
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &sud
|
||||
canonical: süd
|
||||
abbreviated: s
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *ost
|
||||
probability: 0.25
|
||||
- alternative: *sud
|
||||
probability: 0.25
|
||||
- alternative: *west
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
eingang: &eingang
|
||||
canonical: eingang
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Eingang 1, Eingang A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *eingang
|
||||
numeric_probability: 0.1 # e.g. Eingang 1
|
||||
alpha_probability: 0.85 # e.g. Eingang A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
stiege: &stiege
|
||||
canonical: stiege
|
||||
abbreviated: stg
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
treppe: &treppe
|
||||
canonical: treppe
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *stiege
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *treppe
|
||||
probability: 0.4
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *ost
|
||||
- alternative: *west
|
||||
|
||||
po_boxes:
|
||||
postfach: &postfach
|
||||
canonical: postfach
|
||||
abbreviated: pf
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # PF No 1234
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postfach
|
||||
numeric_probability: 0.9 # PF 123
|
||||
alpha_probability: 0.05 # PF A
|
||||
numeric_plus_alpha_probability: 0.04 # PF 123G
|
||||
alpha_plus_numeric_probability: 0.01 # PF A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
halle: &halle
|
||||
canonical: halle
|
||||
numeric:
|
||||
direction: left
|
||||
wohnung: &wohnung
|
||||
canonical: wohnung
|
||||
abbreviated: whg
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.3
|
||||
plural:
|
||||
canonical: wohnungen
|
||||
numeric:
|
||||
direction: left
|
||||
# Wohnung nummer 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
haus: &haus
|
||||
canonical: haus
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
wohnungsnummer: &wohnungsnummer
|
||||
canonical: wohnungsnummer
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
appartement: &appartement
|
||||
canonical: appartement
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
buro: &buro
|
||||
canonical: büro
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
zimmer: &zimmer
|
||||
canonical: zimmer
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *wohnung
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *wohnungsnummer
|
||||
probability: 0.1
|
||||
- alternative: *appartement
|
||||
probability: 0.05
|
||||
- alternative: *haus
|
||||
probability: 0.05
|
||||
|
||||
numeric_probability: 0.9 # e.g. Wohnung 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. Wohnung A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2R, 2L, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Wohnung Rechts
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
|
||||
zone:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *buro
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *zimmer
|
||||
probability: 0.1
|
||||
university:
|
||||
default: *halle
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *zimmer
|
||||
probability: 0.1
|
||||
|
||||
|
||||
countries:
|
||||
# Austria
|
||||
at:
|
||||
# Staircase and entrance numbers more common
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.3
|
||||
standalone_probability: 0.1
|
||||
staircase:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
entrance:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
unit:
|
||||
null_probability: 0.4
|
||||
alphanumeric_probability: 0.6
|
||||
|
||||
# Combined apartment numbers are very common
|
||||
combinations:
|
||||
# e.g. Neubaugasse 55/A/1/5
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- entrance
|
||||
- staircase
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.98
|
||||
- separator: "-"
|
||||
probability: 0.02
|
||||
probability: 0.9
|
||||
# e.g. Neubaugasse 55/1/5
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- staircase
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.98
|
||||
- separator: "-"
|
||||
probability: 0.02
|
||||
probability: 0.8
|
||||
# e.g. Neubaugasse 55/5
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
probability: 0.7
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.98
|
||||
- separator: "-"
|
||||
probability: 0.02
|
||||
|
||||
units:
|
||||
top: &top
|
||||
canonical: top
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &austria_units_alphanumeric
|
||||
default: *top
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *haus
|
||||
probability: 0.15
|
||||
- alternative: *wohnung
|
||||
probability: 0.05
|
||||
- alternative: *wohnungsnummer
|
||||
probability: 0.025
|
||||
- alternative: *appartement
|
||||
probability: 0.025
|
||||
368
resources/addresses/el.yaml
Normal file
368
resources/addresses/el.yaml
Normal file
@@ -0,0 +1,368 @@
|
||||
# el.yaml
|
||||
# -------
|
||||
# Greek language specification
|
||||
|
||||
|
||||
alphabet: αβγδεζηθικλμνξοπρστυφχψω
|
||||
alphabet_probability: 0.8
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.05
|
||||
|
||||
entrance:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
unit:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
|
||||
levels:
|
||||
orofos: &orofos
|
||||
canonical: όροφος
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
orofos_latin: &orofos_latin
|
||||
canonical: órofos
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
isogelo: &isogelo
|
||||
canonical: ισόγειο
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
isogelo_latin: &isogelo_latin
|
||||
canonical: isógeio
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
imiorofos: &imiorofos
|
||||
canonical: ημιώροφος
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
imiorofos_latin: &imiorofos_latin
|
||||
canonical: imiórofos
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
|
||||
ypogeio: &ypogeio
|
||||
canonical: υπόγειο
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.985
|
||||
numeric_probability: 0.01
|
||||
ordinal_probability: 0.005
|
||||
ypogeio_latin: &ypogeio_latin
|
||||
canonical: ypógeio
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.985
|
||||
numeric_probability: 0.01
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *ypogeio
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *ypogeio_latin
|
||||
probability: 0.1
|
||||
"-1":
|
||||
default: *ypogeio
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *ypogeio_latin
|
||||
probability: 0.1
|
||||
|
||||
half_floors:
|
||||
default: *imiorofos
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *imiorofos_latin
|
||||
probability: 0.1
|
||||
|
||||
"0":
|
||||
default: *isogelo
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *isogelo_latin
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *orofos
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *orofos_latin
|
||||
probability: 0.1
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
entrances:
|
||||
eisodos: &eisodos
|
||||
canonical: είσοδος
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
eisodos_latin: &eisodos_latin
|
||||
canonical: eísodos
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# είσοδος 1, etc.
|
||||
alphanumeric:
|
||||
default: *eisodos
|
||||
probability: 0.99
|
||||
alternatives:
|
||||
- alternative: *eisodos_latin
|
||||
probability: 0.01
|
||||
numeric_probability: 0.1
|
||||
alpha_probability: 0.9
|
||||
|
||||
staircases:
|
||||
skala: &skala
|
||||
canonical: σκάλα
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
skala_latin: &skala_latin
|
||||
canonical: skála
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, skála A, skála 1, etc.
|
||||
default: *skala
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *skala_latin
|
||||
probability: 0.1
|
||||
numeric_probability: 0.6 # e.g. skála 1
|
||||
alpha_probability: 0.35 # e.g. skála A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
po_boxes:
|
||||
tachydromiki_thyrida: &tachydromiki_thyrida
|
||||
canonical: ταχυδρομική θυρίδα
|
||||
abbreviated: τ.θ
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
tachydromiki_thyrida_latin: &tachydromiki_thyrida_latin
|
||||
canonical: tachydromikí thyrída
|
||||
abbreviated: t.th
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
default: *tachydromiki_thyrida
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *tachydromiki_thyrida_latin
|
||||
probability: 0.2
|
||||
numeric_probability: 0.9 # t.th 123
|
||||
alpha_probability: 0.05 # t.th А
|
||||
numeric_plus_alpha_probability: 0.04 # t.th 123А
|
||||
alpha_plus_numeric_probability: 0.01 # t.th А123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
diamerisma: &diamerisma
|
||||
canonical: διαμέρισμα
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
diamerisma_latin: &diamerisma_latin
|
||||
canonical: diamérisma
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
|
||||
domatio: &domatio
|
||||
canonical: δωμάτιο
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
domatio_latin: &domatio_latin
|
||||
canonical: domátio
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
|
||||
grafeiou: &grafeiou
|
||||
canonical: γραφείου
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
grafeiou_latin: &grafeiou_latin
|
||||
canonical: grafeíou
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *diamerisma
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *diamerisma_latin
|
||||
probability: 0.1
|
||||
- alternative: *domatio
|
||||
probability: 0.09
|
||||
- alternative: *domatio_latin
|
||||
probability: 0.01
|
||||
|
||||
numeric_probability: 0.9 # e.g. diamérisma 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1А
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. διαμέρισμα А
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
|
||||
zone:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *grafeiou
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *grafeiou_latin
|
||||
probability: 0.1
|
||||
university:
|
||||
default: *domatio
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *domatio_latin
|
||||
probability: 0.1
|
||||
1468
resources/addresses/en.yaml
Normal file
1468
resources/addresses/en.yaml
Normal file
File diff suppressed because it is too large
Load Diff
1189
resources/addresses/es.yaml
Normal file
1189
resources/addresses/es.yaml
Normal file
File diff suppressed because it is too large
Load Diff
470
resources/addresses/et.yaml
Normal file
470
resources/addresses/et.yaml
Normal file
@@ -0,0 +1,470 @@
|
||||
# et.yaml
|
||||
# -------
|
||||
# Estonian language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.97
|
||||
alphanumeric_probability: 0.02
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.95
|
||||
- separator: " - "
|
||||
probability: 0.05
|
||||
probability: 0.7
|
||||
|
||||
|
||||
numbers:
|
||||
default: &number
|
||||
canonical: number
|
||||
abbreviated: nbr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *number
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
and:
|
||||
default: &ja
|
||||
canonical: ja
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *ja
|
||||
corner_of: &nurgas
|
||||
canonical: nurgas
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &nurgal
|
||||
canonical: nurgal
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *ja
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *nurgas
|
||||
probability: 0.15
|
||||
- alternative: *nurgal
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: vahel
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &korrusel
|
||||
canonical: korrusel
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
kelder: &kelder
|
||||
canonical: kelder
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
standalone_probability: 1.0
|
||||
keldris: &keldris
|
||||
canonical: keldris
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. 1 keldris
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.8
|
||||
# e.g. k1
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: left
|
||||
# e.g. 1. keldris
|
||||
ordinal:
|
||||
direction: right
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.05
|
||||
numeric_affix_probability: 0.9
|
||||
ordinal_probability: 0.05
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kelder
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *keldris
|
||||
probability: 0.15
|
||||
"-1":
|
||||
default: *kelder
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *keldris
|
||||
probability: 0.1
|
||||
- alternative: *korrusel
|
||||
probability: 0.05
|
||||
"1":
|
||||
default: *parter
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *korrusel
|
||||
probability: 0.5
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *korrusel
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: lähedal
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
nearby:
|
||||
default:
|
||||
canonical: lähedal
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: siin lähedal
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: siinkandis
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: lähedal mulle
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.7
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
|
||||
directions:
|
||||
right: &paremal
|
||||
canonical: paremal
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: p
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
paramale: &paremale
|
||||
canonical: paremale
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: p
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &vasakul
|
||||
canonical: vasakul
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
vasakule: &vasakule
|
||||
canonical: vasakule
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
alternatives:
|
||||
- alternative: *paremal
|
||||
probability: 0.25
|
||||
- alternative: *paremale
|
||||
probability: 0.25
|
||||
- alternative: *vasakul
|
||||
probability: 0.25
|
||||
- alternative: *vasakule
|
||||
probability: 0.25
|
||||
|
||||
cardinal_directions:
|
||||
east: &ida
|
||||
canonical: ida
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
west: &laas
|
||||
canonical: lääs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
north: &pohi
|
||||
canonical: põhi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
south: &louna
|
||||
canonical: lõuna
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
alternatives:
|
||||
- alternative: *pohi
|
||||
probability: 0.25
|
||||
- alternative: *ida
|
||||
probability: 0.25
|
||||
- alternative: *louna
|
||||
probability: 0.25
|
||||
- alternative: *laas
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
sissepaas: &sissepaas
|
||||
canonical: sissepääs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Sissepääs 1, Sissepääs A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *sissepaas
|
||||
numeric_probability: 0.1 # e.g. Sissepääs 1
|
||||
alpha_probability: 0.85 # e.g. Sissepääs A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
trepikoda: &trepikoda
|
||||
canonical: trepikoda
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *trepikoda
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *pohi
|
||||
- alternative: *louna
|
||||
- alternative: *ida
|
||||
- alternative: *laas
|
||||
|
||||
po_boxes:
|
||||
postboks: &abonementpostkast
|
||||
canonical: abonementpostkast
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # abonementpostkast #1234
|
||||
kast: &kast
|
||||
canonical: kast
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Kast #1234
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *abonementpostkast
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *kast
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # 123
|
||||
alpha_probability: 0.05 # A
|
||||
numeric_plus_alpha_probability: 0.04 # 123G
|
||||
alpha_plus_numeric_probability: 0.01 # A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
korter: &korter
|
||||
canonical: korter
|
||||
abbreviated: k
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.3
|
||||
# e.g. korter number 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
ruumi: &ruumi
|
||||
canonical: ruumi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *korter
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *ruumi
|
||||
probability: 0.1
|
||||
numeric_probability: 1.0 # e.g. korter 1
|
||||
|
||||
# Separate random probability for adding directions like 2P, 2V, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.005
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Korter vasakule
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
375
resources/addresses/eu.yaml
Normal file
375
resources/addresses/eu.yaml
Normal file
@@ -0,0 +1,375 @@
|
||||
# eu.yaml
|
||||
# -------
|
||||
# Basque language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
# If no floor number is specified
|
||||
null_probability: 0.8
|
||||
alphanumeric_probability: 0.2
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.4
|
||||
alphanumeric_probability: 0.6
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- level
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.85
|
||||
- separator: "/"
|
||||
probability: 0.15
|
||||
probability: 0.7
|
||||
|
||||
|
||||
and:
|
||||
default: &eta
|
||||
canonical: eta
|
||||
abbreviated: "&"
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.1
|
||||
|
||||
house_numbers:
|
||||
# zenbakirik gabe (zk.g) addresses
|
||||
no_number:
|
||||
default:
|
||||
canonical: zenbakirik gabe
|
||||
abbreviated: zk.g
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.3
|
||||
|
||||
no_number_probability: 0.1 # With this probability, use zenbakirik gabe if no house_number is specified
|
||||
|
||||
levels:
|
||||
floor: &solairua
|
||||
canonical: solairua
|
||||
abbreviated: sol
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. 2. solairua
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.25
|
||||
ordinal_probability: 0.75
|
||||
# Ground floor
|
||||
beheko_solairua: &beheko_solairua
|
||||
canonical: beheko solairua
|
||||
abbreviated: beheko sol
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.1
|
||||
behe_solairua: &behe_solairua
|
||||
canonical: behe-solairua
|
||||
abbreviated: behe-sol
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.1
|
||||
aliases:
|
||||
"0":
|
||||
default: *beheko_solairua
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *behe_solairua
|
||||
probability: 0.4
|
||||
- alternative: *solairua
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *solairua
|
||||
numeric_probability: 0.99
|
||||
alpha_probability: 0.01
|
||||
|
||||
blocks:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: blokea
|
||||
abbreviated: bl
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.2
|
||||
ordinal_probability: 0.8
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: gertu
|
||||
|
||||
nearby:
|
||||
default:
|
||||
canonical: gertuko
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: hemen gertu
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: hemen
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: me gertu
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.7
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
|
||||
cross_streets:
|
||||
and: *eta
|
||||
txoko: &txoko
|
||||
canonical: txoko
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
|
||||
intersection:
|
||||
default: *eta
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *txoko
|
||||
probability: 0.2
|
||||
|
||||
between: # phrase for a location between two cross streets
|
||||
canonical: arteko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5 # same key name as in the other language configs
|
||||
|
||||
|
||||
po_boxes:
|
||||
posta_kutxa: &posta_kutxa
|
||||
canonical: posta-kutxa
|
||||
abbreviated: p.-ku
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *posta_kutxa
|
||||
numeric_probability: 0.9 # P.-Ku 123
|
||||
alpha_probability: 0.05 # P.-Ku A
|
||||
numeric_plus_alpha_probability: 0.04 # P.-Ku 123G
|
||||
alpha_plus_numeric_probability: 0.01 # P.-Ku A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: posta-kodea
|
||||
abbreviated: p.-k
|
||||
sample: true
|
||||
canonical_probability: 0.01
|
||||
abbreviated_probability: 0.9
|
||||
sample_probability: 0.09
|
||||
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
numeric_affix:
|
||||
affix: p.-k.
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.7
|
||||
numeric_probability: 0.18
|
||||
numeric_affix_probability: 0.12
|
||||
strict_numeric: true
|
||||
|
||||
directions:
|
||||
right: &eskuina
|
||||
canonical: eskuina
|
||||
abbreviated: esk
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: esk.
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.9
|
||||
numeric_affix_probability: 0.1
|
||||
left: &ezkerkada
|
||||
canonical: ezkerkada
|
||||
abbreviated: ezk
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: ezk.
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.9
|
||||
numeric_affix_probability: 0.1
|
||||
ezkerreko: &ezkerreko
|
||||
canonical: ezkerreko
|
||||
abbreviated: ezk.-ko
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alternatives:
|
||||
- alternative: *eskuina
|
||||
probability: 0.5
|
||||
- alternative: *ezkerkada
|
||||
probability: 0.5
|
||||
|
||||
|
||||
entrances:
|
||||
sarrera: &sarrera
|
||||
canonical: sarrera
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Sarrera 1, Sarrera A, etc.
|
||||
alphanumeric:
|
||||
default: *sarrera
|
||||
numeric_probability: 0.1 # e.g. Sarrera 1
|
||||
alpha_probability: 0.85 # e.g. Sarrera A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *eskuina
|
||||
- alternative: *ezkerreko
|
||||
|
||||
staircases:
|
||||
eskailera: &eskailera
|
||||
canonical: eskailera
|
||||
abbreviated: eskra
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, Eskra A, Eskra 1, etc.
|
||||
default: *eskailera
|
||||
numeric_probability: 0.6 # e.g. Eskra 1
|
||||
alpha_probability: 0.35 # e.g. Eskra A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left # e.g. Ezk.-ko Eskra
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *eskuina
|
||||
- alternative: *ezkerreko
|
||||
|
||||
units:
|
||||
flat: &apartamentu
|
||||
canonical: apartamentu
|
||||
abbreviated: aptu
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
# If it's just apartamentu B, many times the phrase is dropped and only the floor and letter are written, e.g. 3. B
|
||||
null_phrase_probability: 0.15
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.6
|
||||
ordinal_probability: 0.4
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *apartamentu
|
||||
|
||||
# Separate random probability for adding directions like 2. Ezk, 2 Esk, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
add_direction_numeric: true # Only for numbers
|
||||
add_direction_standalone: true # A unit can be as simple as "D"
|
||||
|
||||
numeric_probability: 0.7 # e.g. 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. A1
|
||||
alpha_probability: 0.28 # e.g. A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
403
resources/addresses/fi.yaml
Normal file
403
resources/addresses/fi.yaml
Normal file
@@ -0,0 +1,403 @@
|
||||
# fi.yaml
|
||||
# -------
|
||||
# Finnish language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.97
|
||||
alphanumeric_probability: 0.02
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- staircase
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: " "
|
||||
probability: 0.8
|
||||
- separator: "-"
|
||||
probability: 0.1
|
||||
- separator: "/"
|
||||
probability: 0.05
|
||||
- separator: " - "
|
||||
probability: 0.05
|
||||
probability: 0.85
|
||||
|
||||
numbers:
|
||||
default: &numero
|
||||
canonical: numero
|
||||
abbreviated: nro
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.4
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.7
|
||||
numeric_affix_probability: 0.3
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *numero
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
and:
|
||||
default: &ja
|
||||
canonical: ja
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *ja
|
||||
corner_of: &kulmassa
|
||||
canonical: kulmassa
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *ja
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *kulmassa
|
||||
probability: 0.3
|
||||
|
||||
between:
|
||||
canonical: välillä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &kerros
|
||||
canonical: kerros
|
||||
abbreviated: krs
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *kerros
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: lähellä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
nearby:
|
||||
default:
|
||||
canonical: lähistöllä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: lähellä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tässä lähellä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: täällä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: lähellä minua
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.7
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
|
||||
directions:
|
||||
right: &oikea
|
||||
canonical: oikea
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
oikealla: &oikealla
|
||||
canonical: oikealla
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &vasen
|
||||
canonical: vasen
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
vasemmalla: &vasemmalla
|
||||
canonical: vasemmalla
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
alternatives:
|
||||
- alternative: *oikea
|
||||
probability: 0.25
|
||||
- alternative: *oikealla
|
||||
probability: 0.25
|
||||
- alternative: *vasen
|
||||
probability: 0.25
|
||||
- alternative: *vasemmalla
|
||||
probability: 0.25
|
||||
|
||||
cardinal_directions:
|
||||
east: &itaan
|
||||
canonical: itään
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
west: &lansi
|
||||
canonical: länsi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
north: &pohja
|
||||
canonical: pohja
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
south: &etela
|
||||
canonical: etelä
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
alternatives:
|
||||
- alternative: *pohja
|
||||
probability: 0.25
|
||||
- alternative: *itaan
|
||||
probability: 0.25
|
||||
- alternative: *etela
|
||||
probability: 0.25
|
||||
- alternative: *lansi
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
sissepaas: &sisaankaynti
|
||||
canonical: sisäänkäynti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Sisäänkäynti 1, Sisäänkäynti A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *sisaankaynti
|
||||
numeric_probability: 0.1 # e.g. Sisäänkäynti 1
|
||||
alpha_probability: 0.85 # e.g. Sisäänkäynti A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
portaikko: &portaikko
|
||||
canonical: portaikko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *portaikko
|
||||
alpha_probability: 1.0
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *pohja
|
||||
- alternative: *etela
|
||||
- alternative: *itaan
|
||||
- alternative: *lansi
|
||||
|
||||
po_boxes:
|
||||
postilokero: &postilokero
|
||||
canonical: postilokero
|
||||
abbreviated: pl
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # PL #1234
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postilokero
|
||||
numeric_probability: 0.9 # 123
|
||||
alpha_probability: 0.05 # A
|
||||
numeric_plus_alpha_probability: 0.04 # 123G
|
||||
alpha_plus_numeric_probability: 0.01 # A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
asunto: &asunto
|
||||
canonical: asunto
|
||||
abbreviated: as
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.3
|
||||
# as nro 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
ruumi: &huone
|
||||
canonical: huone
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *asunto
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *huone
|
||||
probability: 0.1
|
||||
numeric_probability: 1.0 # e.g. as 1
|
||||
|
||||
# Separate random probability for adding directions like 2O, 2V, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.005
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. asunto vasemmalla
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
951
resources/addresses/fr.yaml
Normal file
951
resources/addresses/fr.yaml
Normal file
@@ -0,0 +1,951 @@
|
||||
# Note: default config is for France. Canadian, Swiss, Belgian, and other
|
||||
# conventions go in country overrides
|
||||
|
||||
components:
|
||||
level:
|
||||
# If no floor number is specified
|
||||
null_probability: 0.8
|
||||
alphanumeric_probability: 0.2
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.8
|
||||
alphanumeric_probability: 0.2
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.8
|
||||
- separator: "-"
|
||||
probability: 0.1
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.005
|
||||
|
||||
numbers:
|
||||
default: &numero
|
||||
canonical: numéro
|
||||
abbreviated: "nº"
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
sample_exclude:
|
||||
- "#" # Used in numeric affix. Needs to be quoted, otherwise it's a comment
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
# Probabilities for numbers
|
||||
numeric_probability: 0.7
|
||||
numeric_affix_probability: 0.3
|
||||
|
||||
and:
|
||||
default: &and
|
||||
canonical: et
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.25
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
house_numbers:
|
||||
# sans numéro (s/n) addresses
|
||||
no_number:
|
||||
canonical: sans numéro
|
||||
abbreviated: s/n
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *numero
|
||||
|
||||
alphanumeric_phrase_probability: 0.01
|
||||
no_number_probability: 0.05 # With this probability, use "sans numéro" if no house_number is specified
|
||||
|
||||
levels:
|
||||
floor: &etage
|
||||
canonical: étage
|
||||
abbreviated: ét
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.75
|
||||
ordinal_probability: 0.25
|
||||
niveau: &niveau
|
||||
canonical: niveau
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.75
|
||||
ordinal_probability: 0.25
|
||||
bel_etage: &bel_etage
|
||||
canonical: bel étage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
etage_noble: &etage_noble
|
||||
canonical: étage noble
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
dernier_etage: &dernier_etage
|
||||
canonical: dernier étage
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
basement: &sous_sol
|
||||
canonical: sous-sol
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.99
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
sub_basement: &soubassement
|
||||
canonical: soubassement
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 2
|
||||
number_subtract_abs_value: 1
|
||||
standalone_probability: 0.99
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
mezzanine: &entresol
|
||||
canonical: entresol
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# Ground floor
|
||||
rez_de_chaussee: &rez_de_chaussee
|
||||
canonical: rez-de-chaussée
|
||||
abbreviated: rdc
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.3
|
||||
rez_de_chaussee_bas: &rez_de_chaussee_bas
|
||||
canonical: rez-de-chaussée bas
|
||||
abbreviated: rcb
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
rez_de_chaussee_haut: &rez_de_chaussee_haut
|
||||
canonical: rez-de-chaussée haut
|
||||
abbreviated: rch
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
parterre: &parterre
|
||||
canonical: parterre
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
rez_de_jardin: &rez_de_jardin
|
||||
canonical: rez-de-jardin
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *sous_sol
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *soubassement
|
||||
probability: 0.3995
|
||||
- alternative: *etage
|
||||
probability: 0.0005
|
||||
"-1":
|
||||
default: *sous_sol
|
||||
probability: 0.9995
|
||||
alternatives:
|
||||
- alternative: *etage
|
||||
probability: 0.0005
|
||||
half_floors:
|
||||
default: *entresol
|
||||
"0":
|
||||
default: *rez_de_chaussee
|
||||
probability: 0.74
|
||||
alternatives:
|
||||
- alternative: *rez_de_jardin
|
||||
probability: 0.01
|
||||
- alternative: *rez_de_chaussee_bas
|
||||
probability: 0.1
|
||||
- alternative: *rez_de_chaussee_haut
|
||||
probability: 0.1
|
||||
- alternative: *etage
|
||||
probability: 0.05
|
||||
"1":
|
||||
default: *etage
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *bel_etage
|
||||
probability: 0.1
|
||||
- alternative: *etage_noble
|
||||
probability: 0.1
|
||||
top:
|
||||
default: *etage
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *dernier_etage
|
||||
probability: 0.1
|
||||
|
||||
alphanumeric:
|
||||
default: *etage
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *niveau
|
||||
probability: 0.05
|
||||
numeric_probability: 0.99
|
||||
alpha_probability: 0.01
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
|
||||
cross_streets:
|
||||
# 26th & 6th Avenue
|
||||
and: *and
|
||||
# 26th @ Broadway
|
||||
a: &a
|
||||
canonical: à
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
au: &au
|
||||
canonical: au
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
corner_of: &langle_de
|
||||
canonical: l'angle de
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
at_the_corner_of: &a_langle_de
|
||||
canonical: à l'angle de
|
||||
|
||||
intersection:
|
||||
default: *and
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *a
|
||||
probability: 0.025
|
||||
- alternative: *au
|
||||
probability: 0.025
|
||||
- alternative: *langle_de
|
||||
probability: 0.15
|
||||
- alternative: *a_langle_de
|
||||
probability: 0.1
|
||||
|
||||
# 26th betw 5th Ave and 6th Ave
|
||||
between:
|
||||
canonical: entre
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
|
||||
|
||||
directions:
|
||||
right: &droit
|
||||
canonical: droit
|
||||
abbreviated: dr
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.7
|
||||
numeric_affix_probability: 0.3
|
||||
left: &gauche
|
||||
canonical: gauche
|
||||
abbreviated: g
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: g
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
rear: &arriere
|
||||
canonical: arrière
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
front: &avant
|
||||
canonical: avant
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *droit
|
||||
probability: 0.49
|
||||
- alternative: *gauche
|
||||
probability: 0.49
|
||||
- alternative: *arriere
|
||||
probability: 0.01
|
||||
- alternative: *avant
|
||||
probability: 0.01
|
||||
|
||||
anteroposterior:
|
||||
alternatives:
|
||||
- alternative: *avant
|
||||
probability: 0.5
|
||||
- alternative: *arriere
|
||||
probability: 0.5
|
||||
|
||||
lateral:
|
||||
alternatives:
|
||||
- alternative: *droit
|
||||
probability: 0.5
|
||||
- alternative: *gauche
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &est
|
||||
canonical: est
|
||||
abbreviated: e
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &ouest
|
||||
canonical: ouest
|
||||
abbreviated: o
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &sud
|
||||
canonical: sud
|
||||
abbreviated: s
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *est
|
||||
probability: 0.25
|
||||
- alternative: *sud
|
||||
probability: 0.25
|
||||
- alternative: *ouest
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
entrance: &entrance
|
||||
canonical: entrance
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Entrance 1, Entrance A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *entrance
|
||||
numeric_probability: 0.1 # e.g. Entrance 1
|
||||
alpha_probability: 0.85 # e.g. Entrance A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
modifier:
|
||||
direction: right # e.g. Entrance Nord
|
||||
direction_probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *ouest
|
||||
- alternative: *droit
|
||||
- alternative: *gauche
|
||||
- alternative: *arriere
|
||||
- alternative: *avant
|
||||
|
||||
staircases:
|
||||
escalier: &escalier
|
||||
canonical: escalier
|
||||
abbreviated: esc
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, Escalier A, Esc 1, etc.
|
||||
default: *escalier
|
||||
numeric_probability: 0.6 # e.g. Escalier 1
|
||||
alpha_probability: 0.35 # e.g. Escalier A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right # e.g. Escalier Gauche
|
||||
direction_probability: 0.9
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *ouest
|
||||
- alternative: *droit
|
||||
- alternative: *gauche
|
||||
- alternative: *arriere
|
||||
- alternative: *avant
|
||||
|
||||
|
||||
po_boxes:
|
||||
boite_postal: &boite_postal
|
||||
canonical: boîte postale
|
||||
abbreviated: bp
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # BP No 1234
|
||||
numeric_probability: 1.0
|
||||
case_postal: &case_postal
|
||||
canonical: case postale
|
||||
abbreviated: cp
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # CP No 1234
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *boite_postal
|
||||
numeric_probability: 0.9 # BP 123
|
||||
alpha_probability: 0.05 # BP A
|
||||
numeric_plus_alpha_probability: 0.04 # BP 123G
|
||||
alpha_plus_numeric_probability: 0.01 # BP A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
units:
|
||||
flat: &appartement
|
||||
canonical: appartement
|
||||
abbreviated: app
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
unit: &unite
|
||||
canonical: unité
|
||||
abbreviated: u
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
suite: &suite
|
||||
canonical: suite
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.4
|
||||
office: &bureau
|
||||
canonical: bureau
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.3
|
||||
door: &porte
|
||||
canonical: porte
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
room: &salle
|
||||
canonical: salle
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
chambre: &chambre
|
||||
canonical: chambre
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
boite: &boite
|
||||
canonical: boîte
|
||||
abbreviated: bte
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
lot: &lotissement
|
||||
canonical: lotissement
|
||||
abbreviated: lot
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
parcelle: &parcelle
|
||||
canonical: parcelle
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
allotments:
|
||||
lot:
|
||||
default: *lotissement
|
||||
numeric_probability: 0.8
|
||||
alphanumeric_probability: 0.1
|
||||
alpha_probability: 0.1
|
||||
parcel:
|
||||
default: *parcelle
|
||||
numeric_probability: 0.3
|
||||
alphanumeric_probability: 0.3
|
||||
alpha_probability: 0.4
|
||||
lot_probability: 0.9
|
||||
parcel_probability: 0.06
|
||||
lot_plus_parcel_probability: 0.02
|
||||
parcel_plus_lot_probability: 0.02
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *appartement
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
# e.g. just plain #3 or No. 4
|
||||
- alternative: *numero
|
||||
probability: 0.05
|
||||
- alternative: *porte
|
||||
probability: 0.095
|
||||
- alternative: *chambre
|
||||
probability: 0.005
|
||||
numeric_probability: 0.9 # e.g. Appartement 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. Appartement A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2D, 2G, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Unité Gauche
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
|
||||
zones:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *bureau
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *suite
|
||||
probability: 0.2
|
||||
|
||||
numeric_probability: 0.9 # e.g. Bureau 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Bureau 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Bureau A1
|
||||
alpha_probability: 0.08 # e.g. Bureau A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
industrial:
|
||||
default: *lotissement
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *bureau
|
||||
probability: 0.3
|
||||
- alternative: *unite
|
||||
probability: 0.19
|
||||
- alternative: *parcelle
|
||||
probability: 0.01
|
||||
|
||||
numeric_probability: 0.9 # e.g. Lot 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Lot 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Lot A1
|
||||
alpha_probability: 0.08 # e.g. Lot A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
university:
|
||||
default: *salle
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *porte
|
||||
probability: 0.1
|
||||
|
||||
numeric_probability: 0.9 # e.g. Salle 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Salle 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Salle A1
|
||||
alpha_probability: 0.08 # e.g. Salle A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: près de
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: à coté de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: proche de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: proches de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: a cote de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: pres de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: aux environs de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: à proximité de
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: a proximite de
|
||||
probability: 0.05
|
||||
nearby:
|
||||
default:
|
||||
canonical: proche
|
||||
probability: 0.4
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: à coté
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: a cote
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: près d'ici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: près dici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: pres d'ici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: pres dici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: près de là
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: pres de la
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: par ici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: dans les alentours
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: à proximité de là
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: a proximite de la
|
||||
probability: 0.05
|
||||
near_me:
|
||||
default:
|
||||
canonical: proche de chez moi
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: près de moi
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: pres de moi
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: à proximité de moi
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: a proximite de moi
|
||||
probability: 0.1
|
||||
in:
|
||||
default:
|
||||
canonical: à
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: en
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: a
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: dans
|
||||
probability: 0.1
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
countries:
|
||||
# Belgium
|
||||
be:
|
||||
units:
|
||||
alphanumeric:
|
||||
default: *boite
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *appartement
|
||||
probability: 0.1
|
||||
# e.g. just plain #3 or No. 4
|
||||
- alternative: *numero
|
||||
probability: 0.05
|
||||
- alternative: *porte
|
||||
probability: 0.095
|
||||
- alternative: *chambre
|
||||
probability: 0.005
|
||||
# Canada
|
||||
ca:
|
||||
components:
|
||||
|
||||
unit:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- unit
|
||||
- house_number
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.04
|
||||
- separator: "-"
|
||||
probability: 0.95
|
||||
- separator: " - "
|
||||
probability: 0.01
|
||||
probability: 0.1
|
||||
levels:
|
||||
numbering_starts_at: 1
|
||||
aliases:
|
||||
"1":
|
||||
# Have to do this because etage is numeric
|
||||
# and has keys like "numeric_probability" which
|
||||
# we don't want to infect rez_de_chaussee when doing
|
||||
# a recursive merge
|
||||
default: *etage
|
||||
probability: 0.1
|
||||
alternatives:
|
||||
- alternative: *rez_de_chaussee
|
||||
probability: 0.8
|
||||
- alternative: *bel_etage
|
||||
probability: 0.05
|
||||
- alternative: *etage_noble
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
alphanumeric:
|
||||
# More common to use in Canada, as in the US
|
||||
use_floor_probability: 0.35
|
||||
|
||||
po_boxes:
|
||||
alphanumeric:
|
||||
default: *case_postal
|
||||
# Switzerland
|
||||
ch:
|
||||
levels:
|
||||
aliases:
|
||||
"0":
|
||||
default: *parterre
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *rez_de_chaussee
|
||||
probability: 0.05
|
||||
- alternative: *etage
|
||||
probability: 0.05
|
||||
po_boxes:
|
||||
alphanumeric:
|
||||
default: *case_postal
|
||||
269
resources/addresses/he.yaml
Normal file
269
resources/addresses/he.yaml
Normal file
@@ -0,0 +1,269 @@
|
||||
# he.yaml
|
||||
# -------
|
||||
# Hebrew language specification
|
||||
|
||||
|
||||
alphabet: אבגדהוזחטיכךלמםנןסעפףצץקרשת
|
||||
alphabet_probability: 0.8
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.05
|
||||
|
||||
entrance:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
unit:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- entrance
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.7
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- entrance
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: " "
|
||||
probability: 0.5
|
||||
- separator: ""
|
||||
probability: 0.2
|
||||
- separator: "/"
|
||||
probability: 0.1
|
||||
- separator: "-"
|
||||
probability: 0.1
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.7
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
|
||||
levels:
|
||||
koma: &koma
|
||||
canonical: קומה
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
koma_latin: &koma_latin
|
||||
canonical: koma
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
komat_karka: &komat_karka
|
||||
canonical: קומת קרקע
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
komat_karka_latin: &komat_karka_latin
|
||||
canonical: komát karká
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
|
||||
martef: &martef
|
||||
canonical: מרתף
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.985
|
||||
numeric_probability: 0.01
|
||||
ordinal_probability: 0.005
|
||||
martef_latin: &martef_latin
|
||||
canonical: martef
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: left
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.985
|
||||
numeric_probability: 0.01
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *martef
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *martef_latin
|
||||
probability: 0.1
|
||||
"-1":
|
||||
default: *martef
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *martef_latin
|
||||
probability: 0.1
|
||||
"0":
|
||||
default: *komat_karka
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *komat_karka_latin
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *koma
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *koma_latin
|
||||
probability: 0.1
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
entrances:
|
||||
knisa: &knisa
|
||||
canonical: כניסה
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
knisa_latin: &knisa_latin
|
||||
canonical: knisa
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# knisa 1, knisa A, etc.
|
||||
alphanumeric:
|
||||
default: *knisa
|
||||
probability: 0.99
|
||||
alternatives:
|
||||
- alternative: *knisa_latin
|
||||
probability: 0.01
|
||||
numeric_probability: 0.1
|
||||
alpha_probability: 0.9
|
||||
|
||||
po_boxes:
|
||||
ta_doar: &ta_doar
|
||||
canonical: תיבת דואר
|
||||
abbreviated: ת.ד.
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ta_doar_latin: &ta_doar_latin
|
||||
canonical: abonementnyy pochtovyy yashchik
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
default: *ta_doar
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *ta_doar_latin
|
||||
probability: 0.2
|
||||
numeric_probability: 0.9 # ta doar 123
|
||||
alpha_probability: 0.05 # ta doar А
|
||||
numeric_plus_alpha_probability: 0.04 # ta doar 123А
|
||||
alpha_plus_numeric_probability: 0.01 # ta doar А123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
dira: &dira
|
||||
canonical: דירה
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
dira_latin: &dira_latin
|
||||
canonical: dira
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *dira
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *dira_latin
|
||||
probability: 0.1
|
||||
|
||||
numeric_probability: 0.9 # e.g. dira 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1А
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. dira А
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
586
resources/addresses/hr.yaml
Normal file
586
resources/addresses/hr.yaml
Normal file
@@ -0,0 +1,586 @@
|
||||
# hr.yaml
|
||||
# -------
|
||||
# Croatian language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.7
|
||||
alphanumeric_probability: 0.3
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- staircase
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
# For unit types like 2/34
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
|
||||
|
||||
numbers:
|
||||
no_number:
|
||||
default:
|
||||
canonical: bez broja
|
||||
abbreviated: bb
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
|
||||
default: &broj
|
||||
canonical: broj
|
||||
abbreviated: br
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "br."
|
||||
whitespace_probability: 0.6
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
alphanumeric_phrase_probability: 0.05
|
||||
no_number_probability: 0.05
|
||||
|
||||
|
||||
and:
|
||||
default: &i
|
||||
canonical: i
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
|
||||
cross_streets:
|
||||
i: *i
|
||||
at: &na
|
||||
canonical: na
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner: &ugao
|
||||
canonical: ugao
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner_of: &uglu
|
||||
canonical: uglu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
na_uglu: &na_uglu
|
||||
canonical: na uglu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *i
|
||||
probability: 0.65
|
||||
alternatives:
|
||||
- alternative: *na
|
||||
probability: 0.1
|
||||
- alternative: *uglu
|
||||
probability: 0.1
|
||||
- alternative: *na_uglu
|
||||
probability: 0.1
|
||||
- alternative: *ugao
|
||||
probability: 0.05
|
||||
|
||||
izmedu: &izmedu
|
||||
canonical: između
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
between:
|
||||
default: *izmedu
|
||||
|
||||
levels:
|
||||
kat: &kat
|
||||
canonical: kat
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
etaza: &etaza
|
||||
canonical: etaža
|
||||
abbreviated: et
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
prizemlje: &prizemlje
|
||||
canonical: prizemlje
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
mezanino: &polukat
|
||||
canonical: polukat
|
||||
half_floors: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
sample: true
|
||||
# e.g. polukat 2
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. 2. polukat
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.1
|
||||
ordinal_probability: 0.2
|
||||
standalone_probability: 0.6
|
||||
podrum: &podrum
|
||||
canonical: podrum
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
# e.g. podrum 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. 1. podrum
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *podrum
|
||||
"-1":
|
||||
default: *podrum
|
||||
# Special token for half-floors
|
||||
half_floors:
|
||||
default: *polukat
|
||||
"0":
|
||||
default: *prizemlje
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *parter
|
||||
probability: 0.4
|
||||
- alternative: *kat
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *kat
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *etaza
|
||||
probability: 0.05
|
||||
numeric_probability: 0.69 # With this probability, pick an integer
|
||||
roman_numeral_probability: 0.3 # Pick a Roman numeral for the actual value
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: u blizini
|
||||
nearby:
|
||||
default:
|
||||
canonical: u blizini
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: u blizini ovdje
|
||||
probability: 0.3
|
||||
- alternative:
|
||||
canonical: oko ovdje
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: u blizini mene
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: u
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &desno
|
||||
canonical: desno
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &lijevo
|
||||
canonical: lijevo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *desno
|
||||
probability: 0.5
|
||||
- alternative: *lijevo
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &istok
|
||||
canonical: istok
|
||||
abbreviated: i
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: i
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zapad
|
||||
canonical: zapad
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &sjever
|
||||
canonical: sjever
|
||||
abbreviated: s
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &jug
|
||||
canonical: jug
|
||||
abbreviated: j
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: j
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *sjever
|
||||
probability: 0.25
|
||||
- alternative: *istok
|
||||
probability: 0.23
|
||||
- alternative: *jug
|
||||
probability: 0.23
|
||||
- alternative: *zapad
|
||||
probability: 0.23
|
||||
|
||||
entrances:
|
||||
ulaz: &ulaz
|
||||
canonical: ulaz
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Ulaz 1, Ulaz A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *ulaz
|
||||
numeric_probability: 0.1 # e.g. Ulaz 1
|
||||
alpha_probability: 0.85 # e.g. Ulaz A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
|
||||
staircases:
|
||||
stubiste: &stubiste
|
||||
canonical: stubište
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *stubiste
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *desno
|
||||
probability: 0.2
|
||||
- alternative: *lijevo
|
||||
probability: 0.2
|
||||
- alternative: *sjever
|
||||
probability: 0.15
|
||||
- alternative: *jug
|
||||
probability: 0.15
|
||||
- alternative: *istok
|
||||
probability: 0.15
|
||||
- alternative: *zapad
|
||||
probability: 0.15
|
||||
|
||||
po_boxes:
|
||||
postanski_pretinac: &postanski_pretinac
|
||||
canonical: poštanski pretinac
|
||||
abbreviated: p.p
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *postanski_pretinac
|
||||
numeric_probability: 0.9 # pp 123
|
||||
alpha_probability: 0.05 # p.p A
|
||||
numeric_plus_alpha_probability: 0.04 # pp 123G
|
||||
alpha_plus_numeric_probability: 0.01 # pp A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
stan: &stan
|
||||
canonical: stan
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
apartman: &apartman
|
||||
canonical: apartman
|
||||
abbreviated: ap
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
soba: &soba
|
||||
canonical: soba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ured: &ured
|
||||
canonical: ured
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *stan
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *apartman
|
||||
probability: 0.3
|
||||
- alternative: *soba
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. stan. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. stan A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *soba
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *ured
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *soba
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
439
resources/addresses/hu.yaml
Normal file
439
resources/addresses/hu.yaml
Normal file
@@ -0,0 +1,439 @@
|
||||
# hu.yaml
|
||||
# -------
|
||||
# Hungarian language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.2
|
||||
standalone_probability: 0.05
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- level
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.55
|
||||
- separator: " "
|
||||
probability: 0.4
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.8
|
||||
|
||||
|
||||
numbers:
|
||||
default: &szam
|
||||
canonical: szám
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
and:
|
||||
default: &es
|
||||
canonical: és
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: &es_a
|
||||
canonical: és a
|
||||
canonical_probability: 0.9
|
||||
sample: true
|
||||
sample_probability: 0.1
|
||||
probability: 0.2
|
||||
- alternative: &es_az
|
||||
canonical: és az
|
||||
canonical_probability: 0.9
|
||||
sample: true
|
||||
sample_probability: 0.1
|
||||
probability: 0.2
|
||||
|
||||
cross_streets:
|
||||
and: *es
|
||||
corner_of: &sarkan
|
||||
canonical: sarkán
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *es
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *es_a
|
||||
probability: 0.1
|
||||
- alternative: *es_az
|
||||
probability: 0.1
|
||||
- alternative: *sarkan
|
||||
probability: 0.2
|
||||
|
||||
between:
|
||||
canonical: között
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &emelet
|
||||
canonical: emelet
|
||||
abbreviated: em
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.85
|
||||
sample_probability: 0.05
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.2
|
||||
roman_numeral_probability: 0.8
|
||||
numeric_probability: 0.1
|
||||
ordinal_probability: 0.9
|
||||
foldszint: &foldszint
|
||||
canonical: földszint
|
||||
abbreviated: fszt
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
felemelet: &felemelet
|
||||
canonical: félemelet
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
magasfoldszint: &magasfoldszint
|
||||
canonical: magasföldszint
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
pince: &pince
|
||||
canonical: pince
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
alagsor: &alagsor
|
||||
canonical: alagsor
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
felszuteren: &felszuteren
|
||||
canonical: félszuterén
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
szuteren: &szuteren
|
||||
canonical: szuterén
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *alagsor
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *pince
|
||||
probability: 0.3
|
||||
- alternative: *szuteren
|
||||
probability: 0.1
|
||||
"-1":
|
||||
default: *alagsor
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *pince
|
||||
probability: 0.3
|
||||
- alternative: *szuteren
|
||||
probability: 0.1
|
||||
- alternative: *felszuteren
|
||||
probability: 0.1
|
||||
|
||||
"0":
|
||||
default: *foldszint
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *emelet
|
||||
probability: 0.1
|
||||
|
||||
"1":
|
||||
default: *emelet
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *felemelet
|
||||
probability: 0.1
|
||||
|
||||
"2":
|
||||
default: *emelet
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *magasfoldszint
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *emelet
|
||||
numeric_probability: 0.59 # With this probability, pick an integer
|
||||
roman_numeral_probability: 0.4 # Pick a Roman numeral for the actual value
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: közelében
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
nearby:
|
||||
default:
|
||||
canonical: közelben
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: közelemben
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.7
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
|
||||
directions:
|
||||
right: &jobb
|
||||
canonical: jobb
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &bal
|
||||
canonical: bal
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *jobb
|
||||
probability: 0.5
|
||||
- alternative: *bal
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &kelet
|
||||
canonical: kelet
|
||||
abbreviated: k
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &nyugat
|
||||
canonical: nyugat
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &eszak
|
||||
canonical: észak
|
||||
abbreviated: e
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &del
|
||||
canonical: dél
|
||||
abbreviated: d
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *eszak
|
||||
probability: 0.25
|
||||
- alternative: *kelet
|
||||
probability: 0.25
|
||||
- alternative: *del
|
||||
probability: 0.25
|
||||
- alternative: *nyugat
|
||||
probability: 0.25
|
||||
|
||||
|
||||
po_boxes:
|
||||
postafiok: &postafiok
|
||||
canonical: postafiók
|
||||
abbreviated: pf
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric:
|
||||
default: *postafiok
|
||||
numeric_probability: 0.9 # Pf 123
|
||||
alpha_probability: 0.05 # Pf A
|
||||
numeric_plus_alpha_probability: 0.04 # Pf 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Pf A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
lakas: &lakas
|
||||
canonical: lakás
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.3
|
||||
ordinal_probability: 0.7
|
||||
iroda: &iroda
|
||||
canonical: iroda
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
szoba: &szoba
|
||||
canonical: szoba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *lakas
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *szoba
|
||||
probability: 0.1
|
||||
numeric_probability: 0.95 # e.g. lakás 1
|
||||
numeric_plus_alpha_probability: 0.005 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.005 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. lakás A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.2
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *iroda
|
||||
numeric_probability: 0.95 # e.g. iroda 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. iroda 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. iroda A1
|
||||
alpha_probability: 0.03 # e.g. iroda A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university: *commercial_unit_types
|
||||
459
resources/addresses/is.yaml
Normal file
459
resources/addresses/is.yaml
Normal file
@@ -0,0 +1,459 @@
|
||||
# is.yaml
|
||||
# -------
|
||||
# Icelandic language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- level
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.9
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- entrance
|
||||
- unit
|
||||
label: unit
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.9
|
||||
- separator: " - "
|
||||
probability: 0.1
|
||||
probability: 0.001
|
||||
|
||||
|
||||
numbers:
|
||||
default: &numer
|
||||
canonical: númer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *numer
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
and:
|
||||
default: &og
|
||||
canonical: og
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *og
|
||||
corner_of: &horn_of
|
||||
canonical: horn af
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &a_horinu_a
|
||||
canonical: á horninu á
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *og
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *horn_of
|
||||
probability: 0.15
|
||||
- alternative: *a_horinu_a
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: milli
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &haeo
|
||||
canonical: hæð
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
jarohaeo: &jarohaeo
|
||||
canonical: jarðhæð
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
sample_probability: 0.7
|
||||
kjallara: &kjallara
|
||||
canonical: kjallara
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. 1 kjallara
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.8
|
||||
# e.g. k1
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: left
|
||||
# e.g. 1. kjallara
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kjallara
|
||||
"-1":
|
||||
default: *kjallara
|
||||
"0":
|
||||
default: *jarohaeo
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *haeo
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: nálægt
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
nearby:
|
||||
default:
|
||||
canonical: nálægt
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: nálægt hér
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: hérna
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: hér
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: nálægt mér
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: í
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
|
||||
directions:
|
||||
right: &til_haegri
|
||||
canonical: til hægri
|
||||
abbreviated: t.h
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: t.h
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &til_vinstri
|
||||
canonical: til vinstri
|
||||
abbreviated: t.v
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: t.v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
alternatives:
|
||||
- alternative: *til_haegri
|
||||
probability: 0.5
|
||||
- alternative: *til_vinstri
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &austur
|
||||
canonical: austur
|
||||
abbreviated: a
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: a
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &vestur
|
||||
canonical: vestur
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &norour
|
||||
canonical: norður
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &suour
|
||||
canonical: suður
|
||||
abbreviated: s
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *norour
|
||||
probability: 0.25
|
||||
- alternative: *austur
|
||||
probability: 0.25
|
||||
- alternative: *suour
|
||||
probability: 0.25
|
||||
- alternative: *vestur
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
inngangur: &inngangur
|
||||
canonical: inngangur
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Inngangur 1, Inngangur A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *inngangur
|
||||
numeric_probability: 0.1 # e.g. Inngangur 1
|
||||
alpha_probability: 0.85 # e.g. Inngangur A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
stiege: &stigi
|
||||
canonical: stigi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *stigi
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *norour
|
||||
- alternative: *suour
|
||||
- alternative: *austur
|
||||
- alternative: *vestur
|
||||
|
||||
po_boxes:
|
||||
postholf: &postholf
|
||||
canonical: pósthólf
|
||||
abbreviated: ph
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Ph Nr 1234
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postholf
|
||||
numeric_probability: 0.9 # Ph 123
|
||||
alpha_probability: 0.05 # Ph A
|
||||
numeric_plus_alpha_probability: 0.04 # Ph 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Ph A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
ibuo: &ibuo
|
||||
canonical: íbúð
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.5
|
||||
# íbúð númer 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *ibuo
|
||||
numeric_probability: 0.9 # e.g. íbúð 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. íbúð A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2R, 2L, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
673
resources/addresses/it.yaml
Normal file
673
resources/addresses/it.yaml
Normal file
@@ -0,0 +1,673 @@
|
||||
# it.yaml
|
||||
# -------
|
||||
# Italian language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
# If no floor number is specified
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.8
|
||||
alphanumeric_probability: 0.2
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 1.0
|
||||
probability: 0.5
|
||||
|
||||
numbers:
|
||||
default: &numero
|
||||
canonical: numero
|
||||
abbreviated: "nº"
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.5
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "n."
|
||||
direction: left
|
||||
# Probabilities for numbers
|
||||
numeric_probability: 0.7
|
||||
numeric_affix_probability: 0.3
|
||||
|
||||
and:
|
||||
default: &e
|
||||
canonical: e
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.25
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
house_numbers:
|
||||
# senza numero civico (snc) addresses
|
||||
no_number:
|
||||
canonical: senza numero civico
|
||||
abbreviated: snc
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *numero
|
||||
|
||||
alphanumeric_phrase_probability: 0.01
|
||||
no_number_probability: 0.05 # With this probability, use "senza numero civico" if no house_number is specified
|
||||
|
||||
levels:
|
||||
floor: &piano
|
||||
canonical: piano
|
||||
abbreviated: pº
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.15
|
||||
sample_probability: 0.25
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.95
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
digits:
|
||||
ascii_probability: 0.9
|
||||
roman_numeral_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.55
|
||||
ordinal_probability: 0.45
|
||||
livello: &livello
|
||||
canonical: livello
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.75
|
||||
ordinal_probability: 0.25
|
||||
piano_nobile: &piano_nobile
|
||||
canonical: piano nobile
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
piano_terra: &piano_terra
|
||||
canonical: piano terra
|
||||
abbreviated: p.t
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.25
|
||||
sample_probability: 0.25
|
||||
basement: &seminterrato
|
||||
canonical: seminterrato
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
standalone_probability: 0.99
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *seminterrato
|
||||
probability: 0.995
|
||||
alternatives:
|
||||
- alternative: *piano
|
||||
probability: 0.005
|
||||
"-1":
|
||||
default: *seminterrato
|
||||
probability: 0.9995
|
||||
alternatives:
|
||||
- alternative: *piano
|
||||
probability: 0.0005
|
||||
"0":
|
||||
default: *piano_terra
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *piano
|
||||
probability: 0.05
|
||||
"1":
|
||||
default: *piano
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *piano_nobile
|
||||
probability: 0.1
|
||||
|
||||
alphanumeric:
|
||||
default: *piano
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *livello
|
||||
probability: 0.05
|
||||
numeric_probability: 0.99
|
||||
alpha_probability: 0.01
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
cross_streets:
|
||||
# 26th & 6th Avenue
|
||||
and: *e
|
||||
# 26th @ Broadway
|
||||
a: &a
|
||||
canonical: a
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
corner_of: &angolo_di
|
||||
canonical: angolo di
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
corner: &angolo
|
||||
canonical: angolo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
at_the_corner_of: &all_angolo_tra
|
||||
canonical: all'angolo tra
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
intersection:
|
||||
default: *e
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *a
|
||||
probability: 0.05
|
||||
- alternative: *angolo_di
|
||||
probability: 0.15
|
||||
- alternative: *all_angolo_tra
|
||||
probability: 0.1
|
||||
|
||||
# 26th betw 5th Ave and 6th Ave
|
||||
between:
|
||||
canonical: tra
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: vicino a
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: presso a
|
||||
probability: 0.25
|
||||
nearby:
|
||||
default:
|
||||
canonical: vicino
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: qui vicino
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nelle vicinanze
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: intorno a qui
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: vicino a me
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: a
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: ad
|
||||
probability: 0.15
|
||||
- alternative:
|
||||
canonical: in
|
||||
probability: 0.15
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
directions:
|
||||
right: &destra
|
||||
canonical: destra
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
left: &sinistra
|
||||
canonical: sinistra
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
rear: &posteriore
|
||||
canonical: posteriore
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
front: &anteriore
|
||||
canonical: anteriore
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *destra
|
||||
probability: 0.49
|
||||
- alternative: *sinistra
|
||||
probability: 0.49
|
||||
- alternative: *posteriore
|
||||
probability: 0.01
|
||||
- alternative: *anteriore
|
||||
probability: 0.01
|
||||
|
||||
anteroposterior:
|
||||
alternatives:
|
||||
- alternative: *anteriore
|
||||
probability: 0.5
|
||||
- alternative: *posteriore
|
||||
probability: 0.5
|
||||
|
||||
lateral:
|
||||
alternatives:
|
||||
- alternative: *destra
|
||||
probability: 0.5
|
||||
- alternative: *sinistra
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &est
|
||||
canonical: est
|
||||
abbreviated: e
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &ovest
|
||||
canonical: ovest
|
||||
abbreviated: o
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &sud
|
||||
canonical: sud
|
||||
abbreviated: s
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *est
|
||||
probability: 0.25
|
||||
- alternative: *sud
|
||||
probability: 0.25
|
||||
- alternative: *ovest
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
entrance: &ingresso
|
||||
canonical: ingresso
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Ingresso 1, Ingresso A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *ingresso
|
||||
numeric_probability: 0.1 # e.g. Ingresso 1
|
||||
alpha_probability: 0.85 # e.g. Ingresso A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
modifier:
|
||||
direction: right # e.g. Ingresso Nord
|
||||
direction_probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *ovest
|
||||
- alternative: *destra
|
||||
- alternative: *sinistra
|
||||
- alternative: *posteriore
|
||||
- alternative: *anteriore
|
||||
|
||||
staircases:
|
||||
scala: &scala
|
||||
canonical: scala
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, Scala A, Scala 1, etc.
|
||||
default: *scala
|
||||
numeric_probability: 0.6 # e.g. Scala 1
|
||||
alpha_probability: 0.35 # e.g. Scala A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right # e.g. Scala Destra
|
||||
direction_probability: 0.9
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *ovest
|
||||
- alternative: *destra
|
||||
- alternative: *sinistra
|
||||
- alternative: *posteriore
|
||||
- alternative: *anteriore
|
||||
|
||||
|
||||
po_boxes:
|
||||
casella_postale: &casella_postale
|
||||
canonical: casella postale
|
||||
abbreviated: cp
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # CP No 1234
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
default: *casella_postale
|
||||
numeric_probability: 0.9 # CP 123
|
||||
alpha_probability: 0.05 # CP A
|
||||
numeric_plus_alpha_probability: 0.04 # CP 123G
|
||||
alpha_plus_numeric_probability: 0.01 # CP A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
units:
|
||||
flat: &appartamento
|
||||
canonical: appartamento
|
||||
abbreviated: app
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
casa: &casa
|
||||
canonical: casa
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
unit: &unita
|
||||
canonical: unità
|
||||
abbreviated: u
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
office: &officina
|
||||
canonical: officina
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.3
|
||||
lotto: &lotto
|
||||
canonical: lotto
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
door: &porta
|
||||
canonical: porta
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
interno: &interno
|
||||
canonical: interno
|
||||
abbreviated: int
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
room: &sala
|
||||
canonical: sala
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *appartamento
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *interno
|
||||
probability: 0.1
|
||||
# e.g. just plain #3 or No. 4
|
||||
- alternative: *numero
|
||||
probability: 0.05
|
||||
- alternative: *casa
|
||||
probability: 0.05
|
||||
- alternative: *porta
|
||||
probability: 0.045
|
||||
- alternative: *sala
|
||||
probability: 0.005
|
||||
numeric_probability: 0.9 # e.g. Appartamento 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. Appartamento A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions, e.g. 2 Destra, 2 Sinistra
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Unità Sinistra
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
|
||||
zones:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *officina
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *sala
|
||||
probability: 0.2
|
||||
|
||||
numeric_probability: 0.9 # e.g. Officina 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Officina 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Officina A1
|
||||
alpha_probability: 0.08 # e.g. Officina A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
industrial:
|
||||
default: *lotto
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *officina
|
||||
probability: 0.3
|
||||
- alternative: *unita
|
||||
probability: 0.2
|
||||
|
||||
numeric_probability: 0.9 # e.g. Lotto 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Lotto 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Lotto A1
|
||||
alpha_probability: 0.08 # e.g. Lotto A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
university:
|
||||
default: *sala
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *porta
|
||||
probability: 0.1
|
||||
|
||||
numeric_probability: 0.9 # e.g. Sala 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Sala 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Sala A1
|
||||
alpha_probability: 0.08 # e.g. Sala A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
161
resources/addresses/ja.yaml
Normal file
161
resources/addresses/ja.yaml
Normal file
@@ -0,0 +1,161 @@
|
||||
# ja.yaml
|
||||
# -------
|
||||
# Japanese language specification
|
||||
|
||||
whitespace: false
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.05
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 1.0
|
||||
conditional:
|
||||
- component: level
|
||||
probabilities:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.05
|
||||
- component: house_number
|
||||
probabilities:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
combinations:
|
||||
# Unit is just appended onto the house number
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 1.0
|
||||
probability: 1.0
|
||||
|
||||
numbers:
|
||||
default: &go
|
||||
canonical: 号
|
||||
numeric_affix:
|
||||
affix: 号
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
|
||||
blocks:
|
||||
alphanumeric:
|
||||
default: &ban
|
||||
canonical: 番
|
||||
numeric_affix:
|
||||
affix: 番
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: &banchi
|
||||
canonical: 番地
|
||||
numeric_affix:
|
||||
affix: 番地
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.1
|
||||
- alternative: &banchi_no
|
||||
canonical: 番地の
|
||||
numeric_affix:
|
||||
affix: 番地の
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.05
|
||||
numeric_probability: 1.0
|
||||
alphanumeric_phrase_probability: 0.4
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *go
|
||||
alphanumeric_phrase_probability: 0.4
|
||||
|
||||
levels:
|
||||
kai: &kai
|
||||
canonical: 階
|
||||
numeric_affix:
|
||||
affix: 階
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *kai
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
shishobako: &shishobako
|
||||
canonical: 私書箱
|
||||
numeric_affix:
|
||||
affix: 私書箱
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.3 # explicit ASCII share so the three digit styles sum to 1.0, matching the digits of the "kai" level phrase
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *shishobako
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
metro_stations:
|
||||
alphanumeric:
|
||||
default: &eki
|
||||
canonical: 駅
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: 駅
|
||||
direction: right
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
alphanumeric_phrase_probability: 1.0
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: 〒
|
||||
numeric_affix:
|
||||
affix: 〒
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.1
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 0.9
|
||||
|
||||
units:
|
||||
alphanumeric:
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
180
resources/addresses/ja_rm.yaml
Normal file
180
resources/addresses/ja_rm.yaml
Normal file
@@ -0,0 +1,180 @@
|
||||
# ja_rm.yaml
|
||||
# ----------
|
||||
# Romaji (Romanized Japanese) language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.05
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 1.0
|
||||
conditional:
|
||||
- component: level
|
||||
probabilities:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.05
|
||||
- component: house_number
|
||||
probabilities:
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
|
||||
combinations:
|
||||
# Unit is just appended onto the house number
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 1.0
|
||||
probability: 1.0
|
||||
|
||||
numbers:
|
||||
default: &go
|
||||
canonical: go
|
||||
numeric_affix:
|
||||
affix: -go
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
blocks:
|
||||
alphanumeric:
|
||||
default: &ban
|
||||
canonical: ban
|
||||
numeric_affix:
|
||||
affix: -ban
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: &banchi
|
||||
canonical: banchi
|
||||
numeric_affix:
|
||||
affix: -banchi # must match the canonical "banchi"; "-ban" duplicated the default "ban" phrase
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.1
|
||||
- alternative: &banchi_no
|
||||
canonical: banchi-no
|
||||
numeric_affix:
|
||||
affix: -banchi-no
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.05
|
||||
numeric_probability: 1.0
|
||||
alphanumeric_phrase_probability: 0.4
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *go
|
||||
alphanumeric_phrase_probability: 0.4
|
||||
|
||||
levels:
|
||||
kai: &kai
|
||||
canonical: kai
|
||||
numeric_affix:
|
||||
affix: -kai
|
||||
upper_case: false
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
gai: &gai
|
||||
canonical: gai
|
||||
numeric_affix:
|
||||
affix: -gai
|
||||
upper_case: false
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *kai
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *gai
|
||||
probability: 0.4
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
shishobako: &shishobako
|
||||
canonical: shishobako
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *shishobako
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
metro_stations:
|
||||
alphanumeric:
|
||||
default: &eki
|
||||
canonical: eki
|
||||
numeric:
|
||||
direction: right
|
||||
title_case: false
|
||||
numeric_affix:
|
||||
affix: -eki
|
||||
title_case: false
|
||||
direction: right
|
||||
numeric_affix_probability: 1.0
|
||||
alphanumeric_phrase_probability: 1.0
|
||||
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
# This should still be the default in Romaji
|
||||
default:
|
||||
canonical: 〒
|
||||
numeric_affix:
|
||||
affix: 〒
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.1
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 0.9
|
||||
|
||||
units:
|
||||
alphanumeric:
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
122
resources/addresses/ko.yaml
Normal file
122
resources/addresses/ko.yaml
Normal file
@@ -0,0 +1,122 @@
|
||||
# ko.yaml
|
||||
# -------
|
||||
# Korean language specification
|
||||
|
||||
whitespace: false
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.15
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
numbers:
|
||||
combinations:
|
||||
# Unit is just appended onto the house number
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 1.0
|
||||
probability: 1.0
|
||||
|
||||
numbers:
|
||||
default: &ho
|
||||
canonical: 호
|
||||
numeric_affix:
|
||||
affix: 호
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: &ho_traditional
|
||||
canonical: 號
|
||||
numeric_affix:
|
||||
affix: 號
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.1
|
||||
|
||||
levels:
|
||||
cheung: &cheung
|
||||
canonical: 층
|
||||
numeric_affix:
|
||||
affix: 층
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *cheung
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
saseoham: &saseoham
|
||||
canonical: 사서함
|
||||
numeric_affix:
|
||||
affix: 사서함
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *saseoham
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default: &upyeon_beonho
|
||||
canonical: 우편번호
|
||||
numeric_affix:
|
||||
affix: 우편번호
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.9
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 0.1
|
||||
|
||||
units:
|
||||
alphanumeric:
|
||||
default: *ho
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *ho_traditional
|
||||
probability: 0.1
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
90
resources/addresses/ko_rm.yaml
Normal file
90
resources/addresses/ko_rm.yaml
Normal file
@@ -0,0 +1,90 @@
|
||||
# ko_rm.yaml
|
||||
# ----------
|
||||
# Romanized Korean language specification
|
||||
|
||||
whitespace: false
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.15
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
numbers:
|
||||
combinations:
|
||||
# Unit is just appended onto the house number
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 1.0
|
||||
probability: 1.0
|
||||
|
||||
numbers:
|
||||
default: &ho
|
||||
canonical: ho
|
||||
numeric_affix:
|
||||
affix: -ho
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
levels:
|
||||
cheung: &cheung
|
||||
canonical: cheung
|
||||
numeric_affix:
|
||||
affix: -cheung
|
||||
upper_case: false
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *cheung
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
saseoham: &saseoham
|
||||
canonical: saseoham
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
default: *saseoham
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
alphanumeric:
|
||||
default: *ho
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
391
resources/addresses/lt.yaml
Normal file
391
resources/addresses/lt.yaml
Normal file
@@ -0,0 +1,391 @@
|
||||
# lt.yaml
|
||||
# -------
|
||||
# Lithuanian language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.97
|
||||
alphanumeric_probability: 0.02
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.95
|
||||
- separator: " - "
|
||||
probability: 0.05
|
||||
probability: 0.8
|
||||
|
||||
|
||||
numbers:
|
||||
default: &numeris
|
||||
canonical: numeris
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
and:
|
||||
default: &ir
|
||||
canonical: ir
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
|
||||
cross_streets:
|
||||
and: *ir
|
||||
corner_of: &kampelis
|
||||
canonical: kampelis
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *ir
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *kampelis
|
||||
probability: 0.3
|
||||
|
||||
between:
|
||||
canonical: nuo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
|
||||
levels:
|
||||
aukstas: &aukstas
|
||||
canonical: aukštas
|
||||
abbreviated: auk
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
numeric_probability: 0.2
|
||||
ordinal_probability: 0.8
|
||||
aukste: &aukste
|
||||
<<: *aukstas
|
||||
canonical: aukšte
|
||||
# Ground floor
|
||||
pirmas_aukstas: &pirmas_aukstas
|
||||
canonical: pirmas aukštas
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
rusys: &rusys
|
||||
canonical: rūsys
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
standalone_probability: 1.0
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
rusyje: &rusyje
|
||||
canonical: rūsyje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. rūsyje 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. r1
|
||||
numeric_affix:
|
||||
affix: r
|
||||
direction: left
|
||||
# e.g. 1. rūsyje
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.1
|
||||
ordinal_probability: 0.4
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *rusyje
|
||||
"-1":
|
||||
default: *rusys
|
||||
"0": &ground_floor
|
||||
default: *pirmas_aukstas
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *aukste
|
||||
probability: 0.3
|
||||
- alternative: *aukstas
|
||||
probability: 0.1
|
||||
"1": *ground_floor
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *aukstas
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
directions:
|
||||
right: &desineje
|
||||
canonical: dešinėje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &kaireje
|
||||
canonical: kairėje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *desineje
|
||||
probability: 0.5
|
||||
- alternative: *kaireje
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &rytai
|
||||
canonical: rytai
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
west: &vakarai
|
||||
canonical: vakarai
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
north: &siaure
|
||||
canonical: šiaurė
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
south: &pietus
|
||||
canonical: pietūs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
alternatives:
|
||||
- alternative: *siaure
|
||||
probability: 0.25
|
||||
- alternative: *rytai
|
||||
probability: 0.25
|
||||
- alternative: *pietus
|
||||
probability: 0.25
|
||||
- alternative: *vakarai
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
wejscie: &iejimas
|
||||
canonical: įėjimas
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# įėjimas 1, įėjimas A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *iejimas
|
||||
numeric_probability: 0.1 # e.g. įėjimas 1
|
||||
alpha_probability: 0.85 # e.g. įėjimas A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
laiptai: &laiptai
|
||||
canonical: laiptai
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *laiptai
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *siaure
|
||||
- alternative: *rytai
|
||||
- alternative: *pietus
|
||||
- alternative: *vakarai
|
||||
|
||||
|
||||
po_boxes:
|
||||
pasto_dezute: &pasto_dezute
|
||||
canonical: pašto dėžutė
|
||||
abbreviated: p d
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.5
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # pašto dėžutė 1234
|
||||
alphanumeric:
|
||||
default: *pasto_dezute
|
||||
numeric_probability: 0.95 # P. d. 123
|
||||
alpha_probability: 0.01 # pašto dėžutė A
|
||||
numeric_plus_alpha_probability: 0.03 # P. d. 123G
|
||||
alpha_plus_numeric_probability: 0.01 # P. d. A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
butas: &butas
|
||||
canonical: butas
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
biuro: &biuro
|
||||
canonical: biuro
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
kambarys: &kambarys
|
||||
canonical: kambarys
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *butas
|
||||
numeric_probability: 0.9 # e.g. butas 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. butas A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.01
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *biuro
|
||||
numeric_probability: 0.95 # e.g. biuro 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. biuro 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. biuro A1
|
||||
alpha_probability: 0.03 # e.g. biuro A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *kambarys
|
||||
numeric_probability: 0.95 # e.g. kambarys 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. kambarys 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. kambarys A1
|
||||
alpha_probability: 0.03 # e.g. kambarys A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
403
resources/addresses/lv.yaml
Normal file
403
resources/addresses/lv.yaml
Normal file
@@ -0,0 +1,403 @@
|
||||
# lv.yaml
|
||||
# -------
|
||||
# Latvian language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.97
|
||||
alphanumeric_probability: 0.02
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.95
|
||||
- separator: " - "
|
||||
probability: 0.05
|
||||
probability: 0.2
|
||||
|
||||
|
||||
numbers:
|
||||
default: &numurs
|
||||
canonical: numurs
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
and:
|
||||
default: &un
|
||||
canonical: un
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
|
||||
cross_streets:
|
||||
and: *un
|
||||
corner_of: &sturis
|
||||
canonical: stūris
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &sturi
|
||||
canonical: stūrī
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *un
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *sturi
|
||||
probability: 0.2
|
||||
- alternative: *sturis
|
||||
probability: 0.1
|
||||
|
||||
between:
|
||||
canonical: starp
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
|
||||
levels:
|
||||
stavs: &stavs
|
||||
canonical: stāvs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
whitespace_probability: 0.5 # sometimes should be 2.stāvs
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
# Needs to be 1.0 so we don't get e.g. IIstāvs
|
||||
ordinal_suffix_probability: 1.0
|
||||
numeric_probability: 0.2
|
||||
ordinal_probability: 0.8
|
||||
|
||||
# Ground floor
|
||||
pirmais_stavs: &pirmais_stavs
|
||||
canonical: pirmais stāvs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
pagrabs: &pagrabs
|
||||
canonical: pagrabs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
standalone_probability: 1.0
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
pagraba: &pagraba
|
||||
canonical: pagraba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. pagraba 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. p1
|
||||
numeric_affix:
|
||||
affix: p
|
||||
direction: left
|
||||
# e.g. 1. pagraba
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.1
|
||||
ordinal_probability: 0.4
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *pagraba
|
||||
"-1":
|
||||
default: *pagrabs
|
||||
"0": &ground_floor
|
||||
default: *pirmais_stavs
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *stavs
|
||||
probability: 0.4
|
||||
"1": *ground_floor
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *stavs
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
directions:
|
||||
right: &pa_labi
|
||||
canonical: pa labi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &pa_kreisi
|
||||
canonical: pa kreisi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *pa_labi
|
||||
probability: 0.5
|
||||
- alternative: *pa_kreisi
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &austrumu
|
||||
canonical: austrumu
|
||||
abbreviated: a
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.05
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: a
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &rietumu
|
||||
canonical: rietumu
|
||||
abbreviated: r
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.05
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: r
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &ziemelu
|
||||
canonical: ziemeļu
|
||||
abbreviated: z
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.05
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
|
||||
south: &dienvidu
|
||||
canonical: dienvidu
|
||||
abbreviated: d
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.05
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *ziemelu
|
||||
probability: 0.25
|
||||
- alternative: *dienvidu
|
||||
probability: 0.25
|
||||
- alternative: *austrumu
|
||||
probability: 0.25
|
||||
- alternative: *rietumu
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
ieeja: &ieeja
|
||||
canonical: ieeja
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# ieeja 1, ieeja A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *ieeja
|
||||
numeric_probability: 0.1 # e.g. ieeja 1
|
||||
alpha_probability: 0.85 # e.g. ieeja A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
kapnu: &kapnu
|
||||
canonical: kāpņu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
kapnu_telpa: &kapnu_telpa
|
||||
canonical: kāpņu telpa
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *kapnu
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *kapnu_telpa
|
||||
probability: 0.4
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *pa_labi
|
||||
- alternative: *pa_kreisi
|
||||
- alternative: *ziemelu
|
||||
- alternative: *dienvidu
|
||||
- alternative: *austrumu
|
||||
- alternative: *rietumu
|
||||
|
||||
|
||||
units:
|
||||
dzivoklis: &dzivoklis
|
||||
canonical: dzīvoklis
|
||||
abbreviated: dz
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.8
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
birojs: &birojs
|
||||
canonical: birojs
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
istaba: &istaba
|
||||
canonical: istaba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *dzivoklis
|
||||
numeric_probability: 0.9 # e.g. dz. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. dz. A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.01
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *birojs
|
||||
numeric_probability: 0.95 # e.g. birojs 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. birojs 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. birojs A1
|
||||
alpha_probability: 0.03 # e.g. birojs A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *istaba
|
||||
numeric_probability: 0.95 # e.g. istaba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. istaba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. istaba A1
|
||||
alpha_probability: 0.03 # e.g. istaba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
563
resources/addresses/nb.yaml
Normal file
563
resources/addresses/nb.yaml
Normal file
@@ -0,0 +1,563 @@
|
||||
# nb.yaml
|
||||
# -------
|
||||
# Norwegian language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85
|
||||
alphanumeric_probability: 0.1
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
# Bolignummer
|
||||
-
|
||||
components:
|
||||
- level
|
||||
- unit
|
||||
label: unit
|
||||
zero_pad_digits: 2
|
||||
separators:
|
||||
- separator: ""
|
||||
probability: 1.0
|
||||
probability: 0.05
|
||||
|
||||
|
||||
numbers:
|
||||
default: &nummer
|
||||
canonical: nummer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *nummer
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
and:
|
||||
default: &og
|
||||
canonical: og
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *og
|
||||
corner_of: &hjorne_av
|
||||
canonical: hjørne av
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &pa_hjornet_av
|
||||
canonical: på hjørnet av
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *og
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *hjorne_av
|
||||
probability: 0.15
|
||||
- alternative: *pa_hjornet_av
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: mellom
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &etasje
|
||||
canonical: etasje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.9
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
hovedetasje: &hovedetasje
|
||||
canonical: hovedetasje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: h
|
||||
direction: left
|
||||
zero_pad: 2
|
||||
numeric_probability: 0.1
|
||||
numeric_affix_probability: 0.9
|
||||
underetasje: &underetasje
|
||||
canonical: underetasje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: u
|
||||
direction: left
|
||||
zero_pad: 2
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.1
|
||||
numeric_affix_probability: 0.9
|
||||
loftsetasje: &loftsetasje
|
||||
canonical: loftsetasje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: l
|
||||
direction: left
|
||||
zero_pad: 2
|
||||
numeric_probability: 0.1
|
||||
numeric_affix_probability: 0.9
|
||||
loft: &loft
|
||||
canonical: loft
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
kjeller: &kjeller
|
||||
canonical: kjeller
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. 1 kjeller
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.8
|
||||
# e.g. k01
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: left
|
||||
zero_pad: 2
|
||||
# e.g. 1. k
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.9
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.09
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kjeller
|
||||
"-1":
|
||||
default: *kjeller
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *etasje
|
||||
probability: 0.05
|
||||
- alternative: *underetasje
|
||||
probability: 0.1
|
||||
|
||||
"top":
|
||||
default: *etasje
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *loftsetasje
|
||||
probability: 0.1
|
||||
- alternative: *loft
|
||||
probability: 0.05
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *etasje
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *hovedetasje
|
||||
probability: 0.05
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: i nærheten av
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: nær
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
nearby:
|
||||
default:
|
||||
canonical: i nærheten
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: rundt her
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: nær
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: nær meg
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: i nærheten av meg
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.4
|
||||
|
||||
in:
|
||||
default:
|
||||
canonical: i
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &hoyre
|
||||
canonical: høyre
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
sample_probability: 0.9
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: h
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &venstre
|
||||
canonical: venstre
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
sample_probability: 0.9
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
alternatives:
|
||||
- alternative: *hoyre
|
||||
probability: 0.5
|
||||
- alternative: *venstre
|
||||
probability: 0.5
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &ost
|
||||
canonical: øst
|
||||
abbreviated: ø
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: ø
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &vest
|
||||
canonical: vest
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &syd
|
||||
canonical: syd
|
||||
abbreviated: s
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *ost
|
||||
probability: 0.25
|
||||
- alternative: *syd
|
||||
probability: 0.25
|
||||
- alternative: *vest
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
inngang: &inngang
|
||||
canonical: inngang
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Inngang 1, Inngang A, etc.
# The four *_probability values below select the shape of the entrance
# identifier and sum to 1.0; letters are by far the most likely form.
alphanumeric: &entrance_alphanumeric
|
||||
default: *inngang
|
||||
numeric_probability: 0.1 # e.g. Inngang 1
|
||||
alpha_probability: 0.85 # e.g. Inngang A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
stiege: &stiege
|
||||
canonical: stiege
|
||||
abbreviated: stg
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
trapp: &trapp
|
||||
canonical: trapp
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *trapp
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *stiege
|
||||
probability: 0.2
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *syd
|
||||
- alternative: *ost
|
||||
- alternative: *vest
|
||||
|
||||
po_boxes:
|
||||
postboks: &postboks
|
||||
canonical: postboks
|
||||
abbreviated: pb
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Pb No 1234
|
||||
boks: &boks
|
||||
canonical: boks
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Boks No 1234
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postboks
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *boks
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # Pb 123
|
||||
alpha_probability: 0.05 # Pb A
|
||||
numeric_plus_alpha_probability: 0.04 # Pb 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Pb A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
leilighet: &leilighet
|
||||
canonical: leilighet
|
||||
abbreviated: leil
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.3
|
||||
# Leilighet nummer 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
hus: &hus
|
||||
canonical: hus
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
vaerelse: &vaerelse
|
||||
canonical: værelse
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
# Unit-number generation settings for Norwegian. The phrase is chosen among
# leilighet (0.8), hus (0.1) and værelse (0.1); those three weights sum to 1.0.
alphanumeric: &unit_alphanumeric
|
||||
default: *leilighet
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *hus
|
||||
probability: 0.1
|
||||
- alternative: *vaerelse
|
||||
probability: 0.1
|
||||
numeric_probability: 0.95 # e.g. Leilighet 1
|
||||
alpha_probability: 0.05 # e.g. Leil A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2H, 2V, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.005
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Leilighet Venstre
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.2
|
||||
|
||||
# Use the actual floor phrase as long as the whole phrase is numeric
|
||||
# Has the effect of creating Bolignummer-style units
|
||||
use_floor_affix_unit_num_digits: 2
|
||||
572
resources/addresses/nl.yaml
Normal file
572
resources/addresses/nl.yaml
Normal file
@@ -0,0 +1,572 @@
|
||||
# nl.yaml
|
||||
# -------
|
||||
# Note: base config covers Dutch as spoken in the Netherlands
|
||||
# Belgium overrides go in country configs
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85
|
||||
alphanumeric_probability: 0.1
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.8
|
||||
alphanumeric_probability: 0.2
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: /
|
||||
probability: 0.9
|
||||
- separator: "-"
|
||||
probability: 0.1
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "-"
|
||||
probability: 0.9
|
||||
- separator: /
|
||||
probability: 0.1
|
||||
probability: 0.01
|
||||
|
||||
|
||||
and:
|
||||
default: &en
|
||||
canonical: en
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
numbers:
|
||||
default: &nummer
|
||||
canonical: nummer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *nummer
|
||||
alphanumeric_phrase_probability: 0.01
|
||||
|
||||
levels:
|
||||
verdieping: &verdieping
|
||||
canonical: verdieping
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
roman_numeral_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.5
|
||||
roman_numeral_probability: 0.3
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.7
|
||||
ordinal_probability: 0.3
|
||||
etage: &etage
|
||||
canonical: etage
|
||||
abbreviated: et
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
roman_numeral_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.5
|
||||
roman_numeral_probability: 0.3
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.7
|
||||
ordinal_probability: 0.3
|
||||
begane_grond: &begane_grond
|
||||
canonical: begane grond
|
||||
abbreviated: bg
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.3
|
||||
benedenverdieping: &benedenverdieping
|
||||
canonical: benedenverdieping
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parterre: &parterre
|
||||
canonical: parterre
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
gelijkvloers: &gelijkvloers
|
||||
canonical: gelijkvloers
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
het_gelijkvloers: &het_gelijkvloers
|
||||
canonical: het gelijkvloers
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
aliases:
|
||||
"0":
|
||||
default: *begane_grond
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *benedenverdieping
|
||||
probability: 0.35
|
||||
- alternative: *parterre
|
||||
probability: 0.04
|
||||
- alternative: *het_gelijkvloers
|
||||
probability: 0.005
|
||||
- alternative: *gelijkvloers
|
||||
probability: 0.005
|
||||
alphanumeric:
|
||||
default: *verdieping
|
||||
probability: 0.99
|
||||
alternatives:
|
||||
- alternative: *etage
|
||||
probability: 0.01
|
||||
numeric_probability: 0.79 # With this probability, pick an integer
|
||||
roman_numeral_probability: 0.2 # Pick a Roman numeral for the actual value
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: in de buurt van
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: bij
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nabij
|
||||
probability: 0.1
|
||||
nearby:
|
||||
default:
|
||||
canonical: in de buurt
|
||||
near_me:
|
||||
default:
|
||||
canonical: in de buurt van me
|
||||
|
||||
in:
|
||||
default:
|
||||
canonical: in
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: te
|
||||
probability: 0.4
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
|
||||
cross_streets:
|
||||
and: *en
|
||||
corner_of: &hoek_van
|
||||
canonical: hoek van
|
||||
at_the_corner_of: &op_de_hoek_van
|
||||
canonical: op de hoek van
|
||||
intersection:
|
||||
default: *en
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *hoek_van
|
||||
probability: 0.15
|
||||
- alternative: *op_de_hoek_van
|
||||
probability: 0.15
|
||||
|
||||
between:
|
||||
canonical: tussen
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
|
||||
entrances:
|
||||
ingang: &ingang
|
||||
canonical: ingang
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Ingang 1, Ingang A, etc.
# The four *_probability values below select the shape of the entrance
# identifier and sum to 1.0; letters are by far the most likely form.
alphanumeric: &entrance_alphanumeric
|
||||
default: *ingang
|
||||
numeric_probability: 0.1 # e.g. Ingang 1
|
||||
alpha_probability: 0.85 # e.g. Ingang A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
po_boxes:
|
||||
postbus: &postbus
|
||||
canonical: postbus
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
antwoordnummer: &antwoordnummer
|
||||
canonical: antwoordnummer
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *postbus
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *antwoordnummer
|
||||
probability: 0.2
|
||||
numeric_probability: 0.9 # 123
|
||||
alpha_probability: 0.05 # A
|
||||
numeric_plus_alpha_probability: 0.04 # 123G
|
||||
alpha_plus_numeric_probability: 0.01 # A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
directions:
|
||||
right: &rechts
|
||||
canonical: rechts
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: r
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &links
|
||||
canonical: links
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: l
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *rechts
|
||||
probability: 0.5
|
||||
- alternative: *links
|
||||
probability: 0.5
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &oost
|
||||
canonical: oost
|
||||
abbreviated: o
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: o
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
# Variant of the "oost" entry: the merge key (<<) copies every key from the
# *oost mapping, and the explicit canonical below overrides the merged one.
# NOTE(review): merge keys are a YAML 1.1 extension and merge shallowly only;
# confirm the loader in use supports them.
oosten: &oosten
|
||||
<<: *oost
|
||||
canonical: oosten
|
||||
|
||||
oostelijke: &oostelijke
|
||||
canonical: oostelijke
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
west: &west
|
||||
canonical: west
|
||||
abbreviated: w
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: w
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
westen: &westen
|
||||
<<: *west
|
||||
canonical: westen
|
||||
|
||||
westelijke: &westelijke
|
||||
canonical: westelijke
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
north: &noord
|
||||
canonical: noord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
noorden: &noorden
|
||||
<<: *noord
|
||||
canonical: noorden
|
||||
|
||||
noordelijke: &noordelijke
|
||||
canonical: noordelijke
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
south: &zuid
|
||||
canonical: zuid
|
||||
abbreviated: z
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
zuiden: &zuiden
|
||||
<<: *zuid
|
||||
canonical: zuiden
|
||||
|
||||
zuidelijke: &zuidelijke
|
||||
canonical: zuidelijke
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
alternatives:
|
||||
- alternative: *noord
|
||||
probability: 0.25
|
||||
- alternative: *oost
|
||||
probability: 0.25
|
||||
- alternative: *zuid
|
||||
probability: 0.25
|
||||
- alternative: *west
|
||||
probability: 0.25
|
||||
|
||||
|
||||
staircases:
|
||||
stiege: &stiege
|
||||
canonical: stiege
|
||||
abbreviated: stg
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
trap: &trap
|
||||
canonical: trap
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *trap
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *stiege
|
||||
probability: 0.4
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
units:
|
||||
appartement: &appartement
|
||||
canonical: appartement
|
||||
abbreviated: apt
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
kamer: &kamer
|
||||
canonical: kamer
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *appartement
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *kamer
|
||||
probability: 0.4
|
||||
numeric_probability: 0.9 # e.g. Apt 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. Apt A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2R, 2L, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Apt Rechts
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.1
|
||||
|
||||
|
||||
countries:
|
||||
be:
|
||||
components:
|
||||
unit:
|
||||
null_probability: 0.65
|
||||
alphanumeric_probability: 0.35
|
||||
|
||||
levels:
|
||||
verdieping: &verdieping_flemish
|
||||
canonical: verdieping
|
||||
abbreviated: verdiep
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.7
|
||||
ordinal_probability: 0.3
|
||||
|
||||
aliases:
|
||||
"0":
|
||||
default: *het_gelijkvloers
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *gelijkvloers
|
||||
probability: 0.5
|
||||
alphanumeric:
|
||||
default: *verdieping_flemish
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *etage
|
||||
probability: 0.1
|
||||
|
||||
units:
|
||||
bus: &bus
|
||||
canonical: bus
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric:
|
||||
default: *appartement
|
||||
probability: 0.1
|
||||
alternatives:
|
||||
- alternative: *bus
|
||||
probability: 0.7
|
||||
- alternative: *kamer
|
||||
probability: 0.2
|
||||
509
resources/addresses/pl.yaml
Normal file
509
resources/addresses/pl.yaml
Normal file
@@ -0,0 +1,509 @@
|
||||
# pl.yaml
|
||||
# -------
|
||||
# Polish language specification.
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.04
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.9
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
- separator: " - "
|
||||
probability: 0.05
|
||||
probability: 0.01
|
||||
|
||||
numbers:
|
||||
default: &numer
|
||||
canonical: numer
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#"
|
||||
direction: left
|
||||
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
house_numbers:
|
||||
dom: &dom
|
||||
canonical: dom
|
||||
abbreviated: d
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric:
|
||||
default: *numer
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *dom
|
||||
probability: 0.4
|
||||
|
||||
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
and:
|
||||
default: &i
|
||||
canonical: i
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
cross_streets:
|
||||
and: *i
|
||||
at: &w
|
||||
canonical: w
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner_of: &rogu
|
||||
canonical: rogu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &na_rogu
|
||||
canonical: na rogu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *i
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *w
|
||||
probability: 0.1
|
||||
- alternative: *rogu
|
||||
probability: 0.1
|
||||
- alternative: *na_rogu
|
||||
probability: 0.1
|
||||
|
||||
between:
|
||||
canonical: pomiędzy
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &pietro
|
||||
canonical: piętro
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
ordinal_suffix_probability: 0.6
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
suterena: &suterena
|
||||
canonical: suterena
|
||||
# e.g. suterena 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. s1
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: left
|
||||
# e.g. 1. suterena
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *suterena
|
||||
"-1":
|
||||
default: *suterena
|
||||
"0":
|
||||
default: *parter
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *pietro
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *pietro
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: w pobliżu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: blisko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: koło
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: niedaleko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: obok
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: przy
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
nearby:
|
||||
default:
|
||||
canonical: w pobliżu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: w pobliżu tutaj
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: wokół tutaj
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: blisko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: w pobliżu mnie
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: w
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: we
|
||||
probability: 0.3
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
directions:
|
||||
right: &prawo
|
||||
canonical: prawo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &lewo
|
||||
canonical: lewo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *prawo
|
||||
probability: 0.5
|
||||
- alternative: *lewo
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &wschod
|
||||
canonical: wschód
|
||||
abbreviated: w
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: w
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zachod
|
||||
canonical: zachód
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &polnoc
|
||||
canonical: północ
|
||||
abbreviated: pn
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: pn
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &poludnie
|
||||
canonical: południe
|
||||
abbreviated: pd
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: pd
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *polnoc
|
||||
probability: 0.25
|
||||
- alternative: *wschod
|
||||
probability: 0.25
|
||||
- alternative: *poludnie
|
||||
probability: 0.25
|
||||
- alternative: *zachod
|
||||
probability: 0.25
|
||||
|
||||
|
||||
entrances:
|
||||
wejscie: &wejscie
|
||||
canonical: wejście
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Wejście 1, Wejście A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *wejscie
|
||||
numeric_probability: 0.1 # e.g. Wejście 1
|
||||
alpha_probability: 0.85 # e.g. Wejście A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
schody: &schody
|
||||
canonical: schody
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *schody
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *polnoc
|
||||
- alternative: *poludnie
|
||||
- alternative: *wschod
|
||||
- alternative: *zachod
|
||||
|
||||
|
||||
po_boxes:
|
||||
skrytka_pocztowa: &skrytka_pocztowa
|
||||
canonical: skrytka pocztowa
|
||||
abbreviated: skr poczt
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Skr Poczt 1234
|
||||
alphanumeric:
|
||||
default: *skrytka_pocztowa
|
||||
numeric_probability: 0.9 # Skr Poczt 123
|
||||
alpha_probability: 0.05 # Skr Poczt A
|
||||
numeric_plus_alpha_probability: 0.04 # Skr Poczt 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Skr Poczt A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
mieszkanie: &mieszkanie
|
||||
canonical: mieszkanie
|
||||
abbreviated: m
|
||||
sample: true
|
||||
canonical_probability: 0.05
|
||||
abbreviated_probability: 0.9
|
||||
sample_probability: 0.05
|
||||
numeric:
|
||||
direction: left
|
||||
pokoj: &pokoj
|
||||
canonical: pokój
|
||||
abbreviated: pok
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *mieszkanie
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *pokoj
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. m. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. m. A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.01
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *pokoj
|
||||
numeric_probability: 0.95 # e.g. pokój 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. pokój 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. pokój A1
|
||||
alpha_probability: 0.03 # e.g. pokój A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university: *commercial_unit_types
|
||||
1054
resources/addresses/pt.yaml
Normal file
1054
resources/addresses/pt.yaml
Normal file
File diff suppressed because it is too large
Load Diff
504
resources/addresses/ro.yaml
Normal file
504
resources/addresses/ro.yaml
Normal file
@@ -0,0 +1,504 @@
|
||||
# ro.yaml
|
||||
# -------
|
||||
# Romanian language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
# If no floor number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.35
|
||||
standalone_probability: 0.05
|
||||
|
||||
staircase:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.05
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.3
|
||||
alphanumeric_probability: 0.65
|
||||
standalone_probability: 0.05
|
||||
|
||||
numbers:
|
||||
default: &numar
|
||||
canonical: număr
|
||||
abbreviated: nr
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
sample_exclude:
|
||||
- "#"
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "#" # e.g. #3, #2F, etc.
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
direction: left # affix goes on the number's left
|
||||
|
||||
# Probabilities for numbers
|
||||
numeric_probability: 0.9
|
||||
numeric_affix_probability: 0.1
|
||||
|
||||
and:
|
||||
default: &si
|
||||
canonical: și
|
||||
abbreviated: "&"
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.1
|
||||
|
||||
cross_streets:
|
||||
and: *si
|
||||
corner_of: &colt
|
||||
canonical: colț
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
at_the_corner_of: &la_coltul_de_pe
|
||||
canonical: la colțul de pe
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
intersection:
|
||||
default: *si
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *colt
|
||||
probability: 0.2
|
||||
- alternative: *la_coltul_de_pe
|
||||
probability: 0.1
|
||||
|
||||
between:
|
||||
canonical: între
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
parentheses_probability: 0.5
|
||||
|
||||
|
||||
house_numbers:
|
||||
# fara numar (FN) addresses
|
||||
no_number:
|
||||
default:
|
||||
canonical: fără număr
|
||||
abbreviated: fn
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.7
|
||||
sample_probability: 0.2
|
||||
alphanumeric:
|
||||
default: *numar
|
||||
|
||||
alphanumeric_phrase_probability: 0.7
|
||||
no_number_probability: 0.1 # With this probability, use fara numar if no house_number is specified
|
||||
|
||||
|
||||
|
||||
levels:
|
||||
floor: &etaj
|
||||
canonical: etaj
|
||||
abbreviated: et
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true # Occasionally add variation of "number", e.g. et. nr 2
|
||||
add_number_phrase_probability: 0.05
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
roman_numeral_probability: 0.2
|
||||
# Ground floor
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
aliases:
|
||||
"0":
|
||||
default: *parter
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *etaj
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *etaj
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
numeric_probability: 0.99
|
||||
alpha_probability: 0.01
|
||||
|
||||
blocks:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: bloc
|
||||
abbreviated: bl
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: in apropiere de
|
||||
|
||||
nearby:
|
||||
default:
|
||||
canonical: în apropiere
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: in apropiere
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: aproape de aici
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: aici
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: în jurul aici
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: in jurul aici
|
||||
probability: 0.05
|
||||
near_me:
|
||||
default:
|
||||
canonical: lângă mine
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: langa mine
|
||||
probability: 0.3
|
||||
in:
|
||||
default:
|
||||
canonical: din
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &dreapta
|
||||
canonical: dreapta
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
left: &stanga
|
||||
canonical: stânga
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
alternatives:
|
||||
- alternative: *dreapta
|
||||
probability: 0.5
|
||||
- alternative: *stanga
|
||||
probability: 0.5
|
||||
|
||||
|
||||
cardinal_directions:
|
||||
east: &est
|
||||
canonical: est
|
||||
abbreviated: e
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: e
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &vest
|
||||
canonical: vest
|
||||
abbreviated: v
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &nord
|
||||
canonical: nord
|
||||
abbreviated: n
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &sud
|
||||
canonical: sud
|
||||
abbreviated: s
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.6
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
probability: 0.25
|
||||
- alternative: *est
|
||||
probability: 0.25
|
||||
- alternative: *sud
|
||||
probability: 0.25
|
||||
- alternative: *vest
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
entrada: &intrare
|
||||
canonical: intrare
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Intrare 1, Intrare A, etc.
|
||||
alphanumeric:
|
||||
default: *intrare
|
||||
numeric_probability: 0.1 # e.g. Intrare 1
|
||||
alpha_probability: 0.85 # e.g. Intrare A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *vest
|
||||
- alternative: *dreapta
|
||||
- alternative: *stanga
|
||||
|
||||
staircases:
|
||||
scara: &scara
|
||||
canonical: scara
|
||||
abbreviated: sc
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric:
|
||||
# For alphanumerics, Scara A, Scara 1, etc.
|
||||
default: *scara
|
||||
numeric_probability: 0.35 # e.g. Scara 1
|
||||
alpha_probability: 0.6 # e.g. Scara A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right # e.g. Scara Nord
|
||||
direction_probability: 0.8
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *nord
|
||||
- alternative: *sud
|
||||
- alternative: *est
|
||||
- alternative: *vest
|
||||
- alternative: *dreapta
|
||||
- alternative: *stanga
|
||||
|
||||
po_boxes:
|
||||
casuta_postala: &casuta_postala
|
||||
canonical: căsuță poștală
|
||||
abbreviated: cp
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.4 # CP Nr 1234
|
||||
numeric_probability: 1.0
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *casuta_postala
|
||||
numeric_probability: 0.9 # CP 123
|
||||
alpha_probability: 0.05 # CP A
|
||||
numeric_plus_alpha_probability: 0.04 # CP 123G
|
||||
alpha_plus_numeric_probability: 0.01 # CP A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
apartament: &apartament
|
||||
canonical: apartament
|
||||
abbreviated: ap
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
sala: &sala
|
||||
canonical: sală
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
birou: &birou
|
||||
canonical: birou
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
lotul: &lotul
|
||||
canonical: lotul
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *apartament
|
||||
probability: 0.9
|
||||
sample: true
|
||||
alternatives:
|
||||
- alternative: *sala
|
||||
probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2D, 2S, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.1
|
||||
add_direction_numeric: true # Only for numbers
|
||||
|
||||
numeric_probability: 0.9 # e.g. ap 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. ap 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. ap A1
|
||||
alpha_probability: 0.08 # e.g. ap A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
|
||||
zones:
|
||||
residential: *unit_alphanumeric
|
||||
commercial:
|
||||
default: *birou
|
||||
numeric_probability: 0.9 # e.g. Birou 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Birou 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Birou A1
|
||||
alpha_probability: 0.08 # e.g. Birou A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
industrial:
|
||||
default: *lotul
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *birou
|
||||
probability: 0.3
|
||||
- alternative: *sala
|
||||
probability: 0.2
|
||||
|
||||
numeric_probability: 0.9 # e.g. Lotul 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Lotul 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Lotul A1
|
||||
alpha_probability: 0.08 # e.g. Lotul A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
university:
|
||||
default: *sala
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *birou
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. Sala 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. Sala 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. Sala A1
|
||||
alpha_probability: 0.08 # e.g. Sala A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
1171
resources/addresses/ru.yaml
Normal file
1171
resources/addresses/ru.yaml
Normal file
File diff suppressed because it is too large
Load Diff
603
resources/addresses/sk.yaml
Normal file
603
resources/addresses/sk.yaml
Normal file
@@ -0,0 +1,603 @@
|
||||
# sk.yaml
|
||||
# -------
|
||||
# Slovak language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.95
|
||||
alphanumeric_probability: 0.04
|
||||
standalone_probability: 0.01
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
# Note: no combinations because of the house numbering scheme
|
||||
|
||||
|
||||
numbers:
|
||||
default: &cislo
|
||||
canonical: číslo
|
||||
abbreviated: č
|
||||
sample: true
|
||||
# Probabilities
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "č."
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
and:
|
||||
default: &a
|
||||
canonical: a
|
||||
abbreviated: "&"
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.75
|
||||
sample: true
|
||||
sample_probability: 0.05
|
||||
|
||||
conscription_numbers:
|
||||
alphanumeric:
|
||||
default:
|
||||
canonical: súpisné číslo
|
||||
abbreviated: s.č.
|
||||
canonical_probability: 0.05
|
||||
abbreviated_probability: 0.85
|
||||
sample: true
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
cross_streets:
|
||||
and: *a
|
||||
at: &na
|
||||
canonical: na
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner_of: &rohu
|
||||
canonical: rohu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner: &roh
|
||||
canonical: roh
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
at_the_corner_of: &na_rohu
|
||||
canonical: na rohu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *a
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *na
|
||||
probability: 0.1
|
||||
- alternative: *roh
|
||||
probability: 0.1
|
||||
- alternative: *rohu
|
||||
probability: 0.1
|
||||
- alternative: *na_rohu
|
||||
probability: 0.1
|
||||
|
||||
between:
|
||||
canonical: medzi
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
|
||||
levels:
|
||||
floor: &poschodie
|
||||
canonical: poschodie
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
podlazie: &podlazie
|
||||
canonical: podlažie
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
nadzemne_podlazie: &nadzemne_podlazie
|
||||
canonical: nadzemné podlažie
|
||||
abbreviated: np
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
abbreviated_probability: 0.8
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
etaz: &etaz
|
||||
canonical: etáž
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
prizemie: &prizemie
|
||||
canonical: prízemie
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
podzemne_podlazie: &podzemne_podlazie
|
||||
canonical: podzemné podlažie
|
||||
abbreviated: pp
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.3
|
||||
# e.g. podzemné podlažie 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. pp1
|
||||
numeric_affix:
|
||||
affix: pp
|
||||
direction: left
|
||||
# e.g. 1. podzemné podlažie
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.985
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *podzemne_podlazie
|
||||
"-1":
|
||||
default: *podzemne_podlazie
|
||||
"0":
|
||||
default: *prizemie
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *poschodie
|
||||
probability: 0.05
|
||||
- alternative: *podlazie
|
||||
probability: 0.05
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *poschodie
|
||||
probability: 0.45
|
||||
alternatives:
|
||||
- alternative: *podlazie
|
||||
probability: 0.35
|
||||
- alternative: *nadzemne_podlazie
|
||||
probability: 0.19
|
||||
- alternative: *etaz
|
||||
probability: 0.01
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: v blízkosti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: u
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: v okolí
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: okolo
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
probability: 0.05
|
||||
nearby:
|
||||
default:
|
||||
canonical: blízkosti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.4
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: blízko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: v blízkosti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tady blízkosti
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tady
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: tu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: v blízkosti tu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
- alternative:
|
||||
canonical: v okolí
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.05
|
||||
near_me:
|
||||
default:
|
||||
canonical: v blízkosti mne
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: v
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: vo
|
||||
probability: 0.3
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &prava
|
||||
canonical: pravá
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &lava
|
||||
canonical: ľavá
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *prava
|
||||
probability: 0.5
|
||||
- alternative: *lava
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &vychod
|
||||
canonical: východ
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zapad
|
||||
canonical: západ
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &sever
|
||||
canonical: sever
|
||||
abbreviated: s
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &juh
|
||||
canonical: juh
|
||||
abbreviated: j
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: j
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *sever
|
||||
probability: 0.25
|
||||
- alternative: *vychod
|
||||
probability: 0.25
|
||||
- alternative: *juh
|
||||
probability: 0.25
|
||||
- alternative: *zapad
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
vchod: &vchod
|
||||
canonical: vchod
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Vchod 1, Vchod A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *vchod
|
||||
numeric_probability: 0.1 # e.g. Vchod 1
|
||||
alpha_probability: 0.85 # e.g. Vchod A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
schodisko: &schodisko
|
||||
canonical: schodisko
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *schodisko
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *sever
|
||||
- alternative: *juh
|
||||
- alternative: *vychod
|
||||
- alternative: *zapad
|
||||
|
||||
po_boxes:
|
||||
postova_priehradka: &postova_priehradka
|
||||
canonical: poštová priehradka
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # poštová priehradka 1234
|
||||
alphanumeric:
|
||||
default: *postova_priehradka
|
||||
numeric_probability: 0.9 # poštová priehradka 123
|
||||
alpha_probability: 0.05 # poštová priehradka A
|
||||
numeric_plus_alpha_probability: 0.04 # poštová priehradka 123G
|
||||
alpha_plus_numeric_probability: 0.01 # poštová priehradka A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
apartaman: &apartaman
|
||||
canonical: apartmán
|
||||
abbreviated: apt
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
izba: &izba
|
||||
canonical: izba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
kancelaria: &kancelaria
|
||||
canonical: kancelária
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *apartaman
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *izba
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. apt. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. apt. A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.01
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *izba
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *kancelaria
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. izba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. izba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. izba A1
|
||||
alpha_probability: 0.03 # e.g. izba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *izba
|
||||
numeric_probability: 0.95 # e.g. izba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. izba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. izba A1
|
||||
alpha_probability: 0.03 # e.g. izba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
539
resources/addresses/sl.yaml
Normal file
539
resources/addresses/sl.yaml
Normal file
@@ -0,0 +1,539 @@
|
||||
# sl.yaml
|
||||
# -------
|
||||
# Slovenian language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.7
|
||||
alphanumeric_probability: 0.3
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- staircase
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
# For unit types like 2/34
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
|
||||
|
||||
numbers:
|
||||
no_number:
|
||||
default:
|
||||
canonical: brez številke
|
||||
abbreviated: brez št
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
|
||||
default: &stevilke
|
||||
canonical: številke
|
||||
abbreviated: št
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "št."
|
||||
whitespace_probability: 0.6
|
||||
direction: left
|
||||
numeric_probability: 0.6
|
||||
numeric_affix_probability: 0.4
|
||||
|
||||
alphanumeric_phrase_probability: 0.05
|
||||
no_number_probability: 0.05
|
||||
|
||||
|
||||
and:
|
||||
default: &in
|
||||
canonical: in
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
|
||||
cross_streets:
|
||||
i: *in
|
||||
at: &na
|
||||
canonical: na
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
corner: &vogalu
|
||||
canonical: vogalu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
na_vogalu: &na_vogalu
|
||||
canonical: na vogalu
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *in
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *na
|
||||
probability: 0.1
|
||||
- alternative: *vogalu
|
||||
probability: 0.15
|
||||
- alternative: *na_vogalu
|
||||
probability: 0.05
|
||||
|
||||
med: &med
|
||||
canonical: med
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
between:
|
||||
default: *med
|
||||
|
||||
levels:
|
||||
nadstropje: &nadstropje
|
||||
canonical: nadstropje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
pritlicje: &pritlicje
|
||||
canonical: pritličje
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
parter: &parter
|
||||
canonical: parter
|
||||
sample: true
|
||||
canonical_probability: 0.9
|
||||
sample_probability: 0.1
|
||||
kleti: &kleti
|
||||
canonical: kleti
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
# e.g. kleti 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. 1. kleti
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kleti
|
||||
"-1":
|
||||
default: *kleti
|
||||
"0":
|
||||
default: *pritlicje
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative: *parter
|
||||
probability: 0.4
|
||||
- alternative: *nadstropje
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *nadstropje
|
||||
numeric_probability: 0.69 # With this probability, pick an integer
|
||||
roman_numeral_probability: 0.3 # Pick a Roman numeral for the actual value
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: v bližini
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: pri
|
||||
probability: 0.4
|
||||
|
||||
nearby:
|
||||
default:
|
||||
canonical: v bližini
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.5
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: v bližini tukaj
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.3
|
||||
- alternative:
|
||||
canonical: okoli tukaj
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: tukaj
|
||||
probability: 0.1
|
||||
|
||||
near_me:
|
||||
default:
|
||||
canonical: blizu mene
|
||||
|
||||
# Don't worry about agreement
|
||||
in:
|
||||
default:
|
||||
canonical: v
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
directions:
|
||||
right: &prav
|
||||
canonical: prav
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &levo
|
||||
canonical: levo
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *prav
|
||||
probability: 0.5
|
||||
- alternative: *levo
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &vzhod
|
||||
canonical: vzhod
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &zahod
|
||||
canonical: zahod
|
||||
abbreviated: z
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: z
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &sever
|
||||
canonical: sever
|
||||
abbreviated: s
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &jug
|
||||
canonical: jug
|
||||
abbreviated: j
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: j
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *sever
|
||||
probability: 0.25
|
||||
- alternative: *vzhod
|
||||
probability: 0.25
|
||||
- alternative: *jug
|
||||
probability: 0.25
|
||||
- alternative: *zahod
|
||||
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
vhod: &vhod
|
||||
canonical: vhod
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Vhod 1, Vhod A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *vhod
|
||||
numeric_probability: 0.1 # e.g. Vhod 1
|
||||
alpha_probability: 0.85 # e.g. Vhod A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
|
||||
staircases:
|
||||
stopnisce: &stopnisce
|
||||
canonical: stopnišče
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *stopnisce
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *prav
|
||||
probability: 0.2
|
||||
- alternative: *levo
|
||||
probability: 0.2
|
||||
- alternative: *sever
|
||||
probability: 0.15
|
||||
- alternative: *jug
|
||||
probability: 0.15
|
||||
- alternative: *vzhod
|
||||
probability: 0.15
|
||||
- alternative: *zahod
|
||||
probability: 0.15
|
||||
|
||||
po_boxes:
|
||||
postni_predal: &postni_predal
|
||||
canonical: poštni predal
|
||||
abbreviated: p.p
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *postni_predal
|
||||
numeric_probability: 0.9 # pp 123
|
||||
alpha_probability: 0.05 # p.p A
|
||||
numeric_plus_alpha_probability: 0.04 # pp 123G
|
||||
alpha_plus_numeric_probability: 0.01 # pp A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
units:
|
||||
stanovanje: &stanovanje
|
||||
canonical: stanovanje
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
soba: &soba
|
||||
canonical: soba
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
urad: &urad
|
||||
canonical: urad
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *stanovanje
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *soba
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. stanovanje 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. stanovanje A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *soba
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *urad
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *soba
|
||||
numeric_probability: 0.95 # e.g. soba 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. soba 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. soba A1
|
||||
alpha_probability: 0.03 # e.g. soba A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
953
resources/addresses/sr.yaml
Normal file
953
resources/addresses/sr.yaml
Normal file
@@ -0,0 +1,953 @@
|
||||
# sr.yaml
|
||||
# -------
|
||||
# Serbian language specification
|
||||
|
||||
# The 30 letters of the Serbian Cyrillic alphabet, in alphabetical order.
alphabet: абвгдђежзијклљмнњопрстћуфхцчџш
|
||||
# NOTE(review): file-level alphanumeric probability; the consumer of this key
# is not visible here — confirm its exact meaning against the reader code.
alphanumeric_probability: 0.7
|
||||
|
||||
# Per-component probabilities plus the ways several components can be fused
# into a single house_number-labelled token (e.g. "2/34").
# NOTE(review): the source indentation was lost; `combinations` is assumed to
# be nested under `components` — confirm against the original file.
components:
  level:
    null_probability: 0.8
    alphanumeric_probability: 0.2

  staircase:
    null_probability: 0.99
    alphanumeric_probability: 0.01

  entrance:
    null_probability: 0.999
    alphanumeric_probability: 0.001

  unit:
    null_probability: 0.7
    alphanumeric_probability: 0.3

  combinations:
    - components:
        - house_number
        - staircase
        - level
        - unit
      label: house_number
      separators:
        - separator: "/"
          probability: 0.95
        - separator: "-"
          probability: 0.05
      probability: 0.005
    - components:
        - house_number
        - level
        - unit
      label: house_number
      separators:
        - separator: "/"
          probability: 0.95
        - separator: "-"
          probability: 0.05
      probability: 0.005
    - components:
        - house_number
        - level
      label: house_number
      separators:
        - separator: "/"
          probability: 0.95
        - separator: "-"
          probability: 0.05
      probability: 0.1
    # For unit types like 2/34
    - components:
        - house_number
        - unit
      label: house_number
      separators:
        - separator: "/"
          probability: 0.95
        - separator: "-"
          probability: 0.05
      probability: 0.005
|
||||
|
||||
|
||||
|
||||
# Number phrase ("број" / "бр") with a Latin-script variant ("broj" / "br").
# Both variants share the same probabilities; only the script differs.
# NOTE(review): unlike every other default/alternatives group in this file,
# neither `default` nor the alternative carries a `probability` weight here —
# confirm how the consumer weights them.
numbers:
  default: &broj
    canonical: број
    abbreviated: бр
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.6
    sample_probability: 0.1
    numeric:
      direction: left
    numeric_affix:
      affix: "бр."
      direction: left
    numeric_probability: 0.4
    numeric_affix_probability: 0.6
  alternatives:
    - alternative: &broj_latin
        canonical: broj
        abbreviated: br
        sample: true
        canonical_probability: 0.3
        abbreviated_probability: 0.6
        sample_probability: 0.1
        numeric:
          direction: left
        numeric_affix:
          affix: "br."
          direction: left
        numeric_probability: 0.4
        numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
# Conjunction phrase: Cyrillic "и" (weight 0.9) or Latin "i" (weight 0.1).
and:
  default: &i
    canonical: и
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  probability: 0.9
  alternatives:
    - alternative: &i_latin
        canonical: i
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
      probability: 0.1
|
||||
|
||||
|
||||
# Phrases used to join two street names. Each phrase is defined once in
# Cyrillic and once in Latin script; `intersection` and `between` then pick
# among them. The weights in each group sum to 1.0.
cross_streets:
  i: *i
  i_latin: *i_latin
  at: &na
    canonical: на
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_latin: &na_latin
    canonical: na
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  corner: &ugao
    canonical: угао
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  ugao_latin: &ugao_latin
    canonical: ugao
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_uglu: &na_uglu
    canonical: на углу
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_uglu_latin: &na_uglu_latin
    canonical: na uglu
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  intersection:
    default: *i
    probability: 0.65
    alternatives:
      - alternative: *i_latin
        probability: 0.05
      - alternative: *na
        probability: 0.075
      - alternative: *na_latin
        probability: 0.025
      - alternative: *ugao
        probability: 0.1
      - alternative: *ugao_latin
        probability: 0.05
      - alternative: *na_uglu
        probability: 0.025
      - alternative: *na_uglu_latin
        probability: 0.025
  izmedu: &izmedu
    canonical: између
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5
  izmedu_latin: &izmedu_latin
    canonical: između
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5
  between:
    default: *izmedu
    probability: 0.9
    alternatives:
      - alternative: *izmedu_latin
        probability: 0.1
|
||||
|
||||
# Floor phrases. Every phrase exists in a Cyrillic and a Latin-script variant
# with identical probabilities. `direction: left` puts the phrase before the
# number and `direction: right` after it (see the podrum examples below).
levels:
  sprat: &sprat
    canonical: спрат
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  sprat_latin: &sprat_latin
    canonical: sprat
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  kat: &kat
    canonical: кат
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  kat_latin: &kat_latin
    canonical: kat
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  etaza: &etaza
    canonical: етажа
    abbreviated: ет
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  etaza_latin: &etaza_latin
    canonical: etaža
    abbreviated: et
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6
  prizemlje: &prizemlje
    canonical: приземље
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  prizemlje_latin: &prizemlje_latin
    canonical: prizemlje
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  parter: &parter
    canonical: партер
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  parter_latin: &parter_latin
    canonical: parter
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1

  podrum: &podrum
    canonical: подрум
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    # e.g. подрум 1
    numeric:
      direction: left
      direction_probability: 0.8
    # e.g. 1. подрум
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
    standalone_probability: 0.99
    number_abs_value: true
    number_min_abs_value: 1
    numeric_probability: 0.005
    ordinal_probability: 0.005
  podrum_latin: &podrum_latin
    canonical: podrum
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    # e.g. podrum 1
    numeric:
      direction: left
      direction_probability: 0.8
    # e.g. 1. podrum
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
    standalone_probability: 0.99
    number_abs_value: true
    number_min_abs_value: 1
    numeric_probability: 0.005
    ordinal_probability: 0.005

  # Phrase choices for specific floor numbers, keyed by the number
  # ("<-1" presumably covers every floor below -1 — confirm with the reader).
  aliases:
    "<-1":
      default: *podrum
      probability: 0.8
      alternatives:
        - alternative: *podrum_latin
          probability: 0.2
    "-1":
      default: *podrum
      probability: 0.8
      alternatives:
        - alternative: *podrum_latin
          probability: 0.2
    "0":
      default: *prizemlje
      probability: 0.45
      alternatives:
        - alternative: *prizemlje_latin
          probability: 0.05
        - alternative: *parter
          probability: 0.35
        - alternative: *parter_latin
          probability: 0.05
        - alternative: *sprat
          probability: 0.04
        - alternative: *sprat_latin
          probability: 0.01
        - alternative: *kat
          probability: 0.04
        - alternative: *kat_latin
          probability: 0.01

  numbering_starts_at: 0

  alphanumeric:
    default: *sprat
    probability: 0.65
    alternatives:
      - alternative: *sprat_latin
        probability: 0.1
      - alternative: *kat
        probability: 0.15
      - alternative: *kat_latin
        probability: 0.05
      - alternative: *etaza
        probability: 0.04
      - alternative: *etaza_latin
        probability: 0.01
    numeric_probability: 0.69  # With this probability, pick an integer
    roman_numeral_probability: 0.3  # Pick a Roman numeral for the actual value
    alpha_probability: 0.0098  # With this probability, pick a letter e.g. A
    numeric_plus_alpha_probability: 0.0001  # e.g. 2A
    alpha_plus_numeric_probability: 0.0001  # e.g. A2
|
||||
|
||||
# Relative directions (right / left) in Cyrillic and Latin script. All four
# phrases are placed to the right of the number. Weights sum to 1.0.
directions:
  right: &desno
    canonical: десно
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  desno_latin: &desno_latin
    canonical: desno
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  left: &levo
    canonical: лево
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  levo_latin: &levo_latin
    canonical: levo
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  alternatives:
    - alternative: *desno
      probability: 0.45
    - alternative: *desno_latin
      probability: 0.05
    - alternative: *levo
      probability: 0.45
    - alternative: *levo_latin
      probability: 0.05
|
||||
|
||||
# Cardinal directions in Cyrillic and Latin script. Each has a one-letter
# abbreviation that doubles as a numeric affix placed to the right of the
# number. Only the south entries enable sampling. Weights sum to 1.0.
cardinal_directions:
  east: &istok
    canonical: исток
    abbreviated: и
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: и
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  istok_latin: &istok_latin
    canonical: istok
    abbreviated: i
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: i
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  west: &zapad
    canonical: запад
    abbreviated: з
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: з
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  zapad_latin: &zapad_latin
    canonical: zapad
    abbreviated: z
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: z
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  north: &sever
    canonical: север
    abbreviated: с
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: с
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  sever_latin: &sever_latin
    canonical: sever
    abbreviated: s
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: s
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  south: &jug
    canonical: југ
    abbreviated: ј
    sample: true
    canonical_probability: 0.75
    abbreviated_probability: 0.1
    sample_probability: 0.15
    numeric:
      direction: right
    numeric_affix:
      affix: ј
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  jug_latin: &jug_latin
    canonical: jug
    abbreviated: j
    sample: true
    canonical_probability: 0.75
    abbreviated_probability: 0.1
    sample_probability: 0.15
    numeric:
      direction: right
    numeric_affix:
      affix: j
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  alternatives:
    - alternative: *sever
      probability: 0.23
    - alternative: *sever_latin
      probability: 0.02
    - alternative: *istok
      probability: 0.23
    - alternative: *istok_latin
      probability: 0.02
    - alternative: *jug
      probability: 0.23
    - alternative: *jug_latin
      probability: 0.02
    - alternative: *zapad
      probability: 0.23
    - alternative: *zapad_latin
      probability: 0.02
|
||||
|
||||
# Entrance phrase ("улаз" / "ulaz"), placed before its identifier.
entrances:
  ulaz: &ulaz
    canonical: улаз
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  ulaz_latin: &ulaz_latin
    canonical: ulaz
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  # Ulaz 1, Ulaz A, etc.
  alphanumeric: &entrance_alphanumeric
    default: *ulaz
    probability: 0.8
    alternatives:
      - alternative: *ulaz_latin
        probability: 0.2
    numeric_probability: 0.1  # e.g. Ulaz 1
    alpha_probability: 0.85  # e.g. Ulaz A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1

    alpha_plus_numeric:
      whitespace_probability: 0.1

    numeric_plus_alpha:
      whitespace_probability: 0.1
|
||||
|
||||
|
||||
|
||||
staircases:
|
||||
# Staircase phrase in Cyrillic script, placed before the number.
stepeniste: &stepeniste
  canonical: степениште
  sample: true
  canonical_probability: 0.8
  sample_probability: 0.2
  numeric:
    direction: left
|
||||
|
||||
# Staircase phrase in Latin script, placed before the number.
stepeniste_latin: &stepeniste_latin
  canonical: stepenište
  sample: true
  canonical_probability: 0.8
  sample_probability: 0.2
  numeric:
    direction: left
|
||||
|
||||
|
||||
# Staircase phrase choice (Cyrillic 0.8 / Latin 0.2) and identifier shape.
# The four identifier-shape weights sum to 1.0.
alphanumeric: &staircase_alphanumeric
  default: *stepeniste
  probability: 0.8
  alternatives:
    - alternative: *stepeniste_latin
      probability: 0.2
  numeric_probability: 0.75
  alpha_probability: 0.2
  numeric_plus_alpha_probability: 0.025
  alpha_plus_numeric_probability: 0.025

  alpha_plus_numeric:
    whitespace_probability: 0.1

  numeric_plus_alpha:
    whitespace_probability: 0.1
|
||||
|
||||
# Directional modifier: one of the relative or cardinal direction phrases,
# weighted so that the twelve entries sum to 1.0.
# NOTE(review): the source indentation was lost, so the parent key of this
# mapping cannot be determined here — confirm against the original file.
directional:
  direction: right
  direction_probability: 0.85
  modifier:
    alternatives:
      - alternative: *desno
        probability: 0.19
      - alternative: *desno_latin
        probability: 0.01
      - alternative: *levo
        probability: 0.19
      - alternative: *levo_latin
        probability: 0.01
      - alternative: *sever
        probability: 0.14
      - alternative: *sever_latin
        probability: 0.01
      - alternative: *jug
        probability: 0.14
      - alternative: *jug_latin
        probability: 0.01
      - alternative: *istok
        probability: 0.14
      - alternative: *istok_latin
        probability: 0.01
      - alternative: *zapad
        probability: 0.14
      - alternative: *zapad_latin
        probability: 0.01
|
||||
|
||||
# Post-office-box phrases in Cyrillic and Latin script, the distribution over
# them, and the distribution over the number of digits in a box number.
# For every phrase the canonical/abbreviated/sample probabilities sum to 1.0.
po_boxes:
  postanski_fah: &postanski_fah
    canonical: поштански фах
    abbreviated: пф
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2  # poštanski fah br. 1234
  postanski_fah_latin: &postanski_fah_latin
    canonical: poštanski fah
    abbreviated: pf
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2  # poštanski fah br. 1234
  postanski_pretinac: &postanski_pretinac
    canonical: поштански претинац
    sample: true
    canonical_probability: 0.6
    # Was 0.5, which made canonical + sample sum to 1.1; now matches the
    # Latin-script twin below.
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2
  postanski_pretinac_latin: &postanski_pretinac_latin
    canonical: poštanski pretinac
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2
  postanski_pregradak: &postanski_pregradak
    canonical: поштански преградак
    sample: true
    canonical_probability: 0.6
    # Was 0.5, which made canonical + sample sum to 1.1; now matches the
    # Latin-script twin below.
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2
  postanski_pregradak_latin: &postanski_pregradak_latin
    canonical: poštanski pregradak
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2

  alphanumeric:
    default: *postanski_fah
    probability: 0.7
    alternatives:
      - alternative: *postanski_fah_latin
        probability: 0.05
      - alternative: *postanski_pretinac
        probability: 0.1
      - alternative: *postanski_pretinac_latin
        probability: 0.05
      - alternative: *postanski_pregradak
        probability: 0.075
      - alternative: *postanski_pregradak_latin
        probability: 0.025
    numeric_probability: 0.9  # pf 123
    alpha_probability: 0.05  # pf A
    numeric_plus_alpha_probability: 0.04  # pf 123G
    alpha_plus_numeric_probability: 0.01  # pf A123
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1

  # Distribution over the number of digits in a generated box number.
  digits:
    - length: 1
      probability: 0.05
    - length: 2
      probability: 0.1
    - length: 3
      probability: 0.2
    - length: 4
      probability: 0.5
    - length: 5
      probability: 0.1
    - length: 6
      probability: 0.05
|
||||
|
||||
# Unit phrases (stan, apartman, soba, kancelarija), each in Cyrillic and
# Latin script, followed by the default distribution and per-zone overrides.
# Every phrase is placed before the number.
units:
  stan: &stan
    canonical: стан
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  stan_latin: &stan_latin
    canonical: stan
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  apartman: &apartman
    canonical: апартман
    abbreviated: ап
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.2
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  apartman_latin: &apartman_latin
    canonical: apartman
    abbreviated: ap
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.2
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  soba: &soba
    canonical: соба
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  soba_latin: &soba_latin
    canonical: soba
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  kancelarija: &kancelarija
    canonical: канцеларија
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  kancelarija_latin: &kancelarija_latin
    canonical: kancelarija
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  # Default phrase distribution (weights sum to 1.0) and identifier shape.
  alphanumeric: &unit_alphanumeric
    default: *stan
    probability: 0.5
    alternatives:
      - alternative: *stan_latin
        probability: 0.1
      - alternative: *apartman
        probability: 0.2
      - alternative: *apartman_latin
        probability: 0.05
      - alternative: *soba
        probability: 0.1
      - alternative: *soba_latin
        probability: 0.05
    numeric_probability: 0.9  # e.g. stan 1
    numeric_plus_alpha_probability: 0.03  # e.g. 1A
    alpha_plus_numeric_probability: 0.03  # e.g. A1
    alpha_probability: 0.04  # e.g. stan A

    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1

    # If there are 10 floors, create unit numbers like #301 or #1032
    use_floor_probability: 0.01

  zones:
    commercial: &commercial_unit_types
      default: *soba
      probability: 0.55
      alternatives:
        - alternative: *soba_latin
          probability: 0.05
        - alternative: *kancelarija
          probability: 0.35
        - alternative: *kancelarija_latin
          probability: 0.05
      numeric_probability: 0.95  # e.g. soba 1
      numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
      alpha_plus_numeric_probability: 0.01  # e.g. soba A1
      alpha_probability: 0.03  # e.g. soba A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1
    university:
      default: *soba
      probability: 0.9
      alternatives:
        - alternative: *soba_latin
          probability: 0.1
      numeric_probability: 0.95  # e.g. soba 1
      numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
      alpha_plus_numeric_probability: 0.01  # e.g. soba A1
      alpha_probability: 0.03  # e.g. soba A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1
|
||||
795
resources/addresses/sv.yaml
Normal file
795
resources/addresses/sv.yaml
Normal file
@@ -0,0 +1,795 @@
|
||||
# sv.yaml
|
||||
# -------
|
||||
# Swedish language specification.
|
||||
|
||||
# Per-component probabilities; each component's weights sum to 1.0.
components:
  level:
    null_probability: 0.85
    alphanumeric_probability: 0.1
    standalone_probability: 0.05

  staircase:
    null_probability: 0.99
    alphanumeric_probability: 0.01

  entrance:
    null_probability: 0.999
    alphanumeric_probability: 0.001

  unit:
    null_probability: 0.75
    alphanumeric_probability: 0.25
|
||||
|
||||
# Number phrase ("nummer" / "nr"), or "#" as an affix. Both forms are placed
# to the left of the number. "#" is excluded from sampling.
numbers:
  default: &nummer
    canonical: nummer
    abbreviated: nr
    sample: true
    # Probabilities
    canonical_probability: 0.3
    abbreviated_probability: 0.5
    sample_probability: 0.2
    sample_exclude:
      - "#"
    numeric:
      direction: left
    numeric_affix:
      affix: "#"
      direction: left

    numeric_probability: 0.4
    numeric_affix_probability: 0.6
|
||||
|
||||
|
||||
# House-number phrases reuse the generic number phrase via the *nummer alias.
house_numbers:
|
||||
alphanumeric:
|
||||
default: *nummer
|
||||
|
||||
# NOTE(review): source indentation was lost; whether this key is a sibling of
# `alphanumeric` or nested inside it must be confirmed against the original.
alphanumeric_phrase_probability: 0.0001
|
||||
|
||||
|
||||
# Conjunction phrase: "och", abbreviated "&". The three weights sum to 1.0.
and:
  default: &och
    canonical: och
    abbreviated: "&"
    canonical_probability: 0.2
    abbreviated_probability: 0.75
    sample: true
    sample_probability: 0.05
|
||||
|
||||
# Phrases used to join two street names: "och", "hörnet av", "i hörnet av",
# and the "mellan" (between) phrase. Intersection weights sum to 1.0.
cross_streets:
  and: *och
  corner_of: &hornet_av
    canonical: hörnet av
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  at_the_corner_of: &i_hornet_av
    canonical: i hörnet av
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  intersection:
    default: *och
    probability: 0.7
    alternatives:
      - alternative: *hornet_av
        probability: 0.15
      - alternative: *i_hornet_av
        probability: 0.15

  between:
    canonical: mellan
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5
|
||||
|
||||
|
||||
levels:
|
||||
vaningen: &vaningen
|
||||
canonical: våningen
|
||||
abbreviated: vån
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
ordinal_probability: 1.0
|
||||
vaning: &vaning
|
||||
canonical: våning
|
||||
abbreviated: vån
|
||||
sample: true
|
||||
canonical_probability: 0.5
|
||||
abbreviated_probability: 0.3
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
ordinal:
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
numeric_probability: 0.8
|
||||
ordinal_probability: 0.2
|
||||
plan: &plan
|
||||
canonical: plan
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
entreplan: &entreplan
|
||||
canonical: entréplan
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
trappa_upp: &trappa_upp
|
||||
canonical: trappa upp
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
number_min_abs_value: 2
|
||||
number_max_abs_value: 2
|
||||
number_subtract_abs_value: 1
|
||||
numeric_probability: 0.8
|
||||
ordinal_probability: 0.2
|
||||
trappor_upp: &trappor_upp
|
||||
canonical: trappor upp
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
number_min_abs_value: 3
|
||||
number_subtract_abs_value: 1
|
||||
numeric_probability: 0.8
|
||||
ordinal_probability: 0.2
|
||||
trappa: &trappa
|
||||
canonical: trappa
|
||||
abbreviated: tr
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
number_min_abs_value: 2
|
||||
number_max_abs_value: 2
|
||||
number_subtract_abs_value: 1
|
||||
numeric_probability: 0.8
|
||||
ordinal_probability: 0.2
|
||||
trappor: &trappor
|
||||
canonical: trappor
|
||||
abbreviated: tr
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.8
|
||||
spellout_probability: 0.2
|
||||
number_min_abs_value: 3
|
||||
number_subtract_abs_value: 1
|
||||
numeric_probability: 0.8
|
||||
ordinal_probability: 0.2
|
||||
bottenvaning: &bottenvaning
|
||||
canonical: bottenvåning
|
||||
abbreviated: bv
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
vindsvaningen: &vindsvaningen
|
||||
canonical: vindsvåningen
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
standalone_probability: 1.0
|
||||
vinds: &vinds
|
||||
canonical: vinds
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
standalone_probability: 1.0
|
||||
kallare: &kallare
|
||||
canonical: källare
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
# e.g. 1 källare
|
||||
numeric:
|
||||
direction: right
|
||||
direction_probability: 0.8
|
||||
# e.g. k1
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: left
|
||||
# e.g. 1:a k
|
||||
ordinal:
|
||||
direction: right
|
||||
standalone_probability: 0.9
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
numeric_affix_probability: 0.09
|
||||
ordinal_probability: 0.005
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *kallare
|
||||
probability: 0.95
|
||||
alternatives:
|
||||
- alternative: *vaning
|
||||
probability: 0.025
|
||||
- alternative: *vaningen
|
||||
probability: 0.025
|
||||
"-1":
|
||||
default: *kallare
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *vaning
|
||||
probability: 0.05
|
||||
- alternative: *vaningen
|
||||
probability: 0.05
|
||||
"0":
|
||||
default: *bottenvaning
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *entreplan
|
||||
probability: 0.2
|
||||
- alternative: *vaningen
|
||||
probability: 0.1
|
||||
- alternative: *vaning
|
||||
probability: 0.1
|
||||
"1":
|
||||
default: *bottenvaning
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *entreplan
|
||||
probability: 0.2
|
||||
- alternative: *vaningen
|
||||
probability: 0.1
|
||||
- alternative: *vaning
|
||||
probability: 0.1
|
||||
"top":
|
||||
default: *vaningen
|
||||
probability: 0.35
|
||||
alternatives:
|
||||
- alternative: *vaning
|
||||
probability: 0.35
|
||||
- alternative: *trappor_upp
|
||||
probability: 0.1
|
||||
- alternative: *trappor
|
||||
probability: 0.1
|
||||
- alternative: *vindsvaningen
|
||||
probability: 0.05
|
||||
- alternative: *vinds
|
||||
probability: 0.05
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *vaningen
|
||||
probability: 0.25
|
||||
alternatives:
|
||||
- alternative: *vaning
|
||||
probability: 0.2
|
||||
- alternative: *plan
|
||||
probability: 0.05
|
||||
- alternative: *trappa_upp
|
||||
probability: 0.125
|
||||
- alternative: *trappa
|
||||
probability: 0.125
|
||||
- alternative: *trappor_upp
|
||||
probability: 0.125
|
||||
- alternative: *trappor
|
||||
probability: 0.125
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
categories:
|
||||
near:
|
||||
default:
|
||||
canonical: i närheten av
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: nära
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
nearby:
|
||||
default:
|
||||
canonical: i närheten
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.4
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: runt här
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
- alternative:
|
||||
canonical: nära här
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nära här
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: nära
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
- alternative:
|
||||
canonical: omkring här
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.1
|
||||
near_me:
|
||||
default:
|
||||
canonical: nära mig
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: i närheten av mig
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
probability: 0.2
|
||||
|
||||
in:
|
||||
default:
|
||||
canonical: i
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative:
|
||||
canonical: på
|
||||
probability: 0.2
|
||||
|
||||
|
||||
# Probabilities of each phrase
|
||||
near_probability: 0.35
|
||||
nearby_probability: 0.2
|
||||
near_me_probability: 0.1
|
||||
in_probability: 0.35
|
||||
|
||||
|
||||
directions:
|
||||
right: &hoger
|
||||
canonical: höger
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
sample_probability: 0.9
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: h
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
left: &vanster
|
||||
canonical: vänster
|
||||
sample: true
|
||||
canonical_probability: 0.1
|
||||
sample_probability: 0.9
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
whitespace_probability: 0.1
|
||||
numeric_probability: 0.8
|
||||
numeric_affix_probability: 0.2
|
||||
alternatives:
|
||||
- alternative: *hoger
|
||||
probability: 0.5
|
||||
- alternative: *vanster
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &ost
|
||||
canonical: öst
|
||||
abbreviated: ö
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: ö
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
eastern: &ostra
|
||||
canonical: östra
|
||||
abbreviated: ö:a
|
||||
canonical_probability: 0.9
|
||||
abbreviated_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
west: &vast
|
||||
canonical: väst
|
||||
abbreviated: v
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: v
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
western: &vastra
|
||||
canonical: västra
|
||||
abbreviated: v:a
|
||||
canonical_probability: 0.9
|
||||
abbreviated_probability: 0.1
|
||||
numeric:
|
||||
direction: right
|
||||
|
||||
north: &norr
|
||||
canonical: norr
|
||||
abbreviated: n
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: n
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
northern: &norra
|
||||
canonical: norra
|
||||
abbreviated: n:a
|
||||
canonical_probability: 0.9
|
||||
abbreviated_probability: 0.1
|
||||
|
||||
south: &sod
|
||||
canonical: söd
|
||||
abbreviated: s
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: s
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
southern: &sodra
|
||||
canonical: södra
|
||||
abbreviated: s:a
|
||||
canonical_probability: 0.9
|
||||
abbreviated_probability: 0.1
|
||||
|
||||
alternatives:
|
||||
- alternative: *norr
|
||||
probability: 0.25
|
||||
- alternative: *ost
|
||||
probability: 0.25
|
||||
- alternative: *sod
|
||||
probability: 0.25
|
||||
- alternative: *vast
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
ingang: &ingang
|
||||
canonical: ingång
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
entre: &entre
|
||||
canonical: entré
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# Ingång 1, Ingång A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *ingang
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *entre
|
||||
probability: 0.4
|
||||
numeric_probability: 0.1 # e.g. Ingång 1
|
||||
alpha_probability: 0.85 # e.g. Ingång A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
staircases:
|
||||
uppgang: &uppgang
|
||||
canonical: uppgång
|
||||
abbreviated: u
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
uppgang_hoger: &uppgang_hoger
|
||||
canonical: uppgång höger
|
||||
abbreviated: uh
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
uppgang_vanster: &uppgang_vanster
|
||||
canonical: uppgång vänster
|
||||
abbreviated: uv
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *uppgang
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *uppgang_hoger
|
||||
probability: 0.2
|
||||
- alternative: *uppgang_vanster
|
||||
probability: 0.2
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: left
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *norr
|
||||
- alternative: *sod
|
||||
- alternative: *ost
|
||||
- alternative: *vast
|
||||
|
||||
po_boxes:
|
||||
box: &box
|
||||
canonical: box
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Box No 1234
|
||||
postlada: &postlada
|
||||
canonical: postlåda
|
||||
abbreviated: pl
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2 # Pl No 1234
|
||||
|
||||
alphanumeric:
|
||||
sample: false
|
||||
default: *box
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *postlada
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # Box 123
|
||||
alpha_probability: 0.05 # Box A
|
||||
numeric_plus_alpha_probability: 0.04 # Box 123G
|
||||
alpha_plus_numeric_probability: 0.01 # Box A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.1
|
||||
- length: 5
|
||||
probability: 0.5
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
|
||||
units:
|
||||
lagenhet: &lagenhet
|
||||
canonical: lägenhet
|
||||
abbreviated: lgh
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
null_phrase_probability: 0.1
|
||||
# Lägenhet nummer 4
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
bostad: &bostad
|
||||
canonical: bostad
|
||||
abbreviated: bst
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.5
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.05
|
||||
lagenhetsnummer: &lagenhetsnummer
|
||||
canonical: lägenhetsnummer
|
||||
abbreviated: lgh nr
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
hus: &hus
|
||||
canonical: hus
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
rum: &rum
|
||||
canonical: rum
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
numeric:
|
||||
direction: left
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *lagenhet
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *lagenhetsnummer
|
||||
probability: 0.05
|
||||
- alternative: *hus
|
||||
probability: 0.1
|
||||
- alternative: *rum
|
||||
probability: 0.1
|
||||
numeric_probability: 0.95 # e.g. Lägenhet 1
|
||||
alpha_probability: 0.05 # e.g. Lgh A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# Separate random probability for adding directions like 2H, 2V, etc.
|
||||
add_direction: true
|
||||
add_direction_probability: 0.005
|
||||
|
||||
# Add directions for plain numbers
|
||||
add_direction_numeric: true
|
||||
# Add direction only e.g. Lägenhet Höger
|
||||
add_direction_standalone: true
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.2
|
||||
|
||||
# Use the actual floor phrase as long as the whole phrase is numeric
|
||||
# Has the effect of creating lägenhetsnummer-style units
|
||||
use_floor_affix_unit_num_digits: 2
|
||||
|
||||
# In Swedish addresses, the ground level is 10, floors are 11, 12, ... basements are 9, 8, ...
|
||||
use_floor_ground_starts_at: 10
|
||||
# For single digit floors, use 09, 08, etc.
|
||||
use_floor_floor_num_digits: 2
|
||||
|
||||
|
||||
countries:
|
||||
# Swedish addresses in Finland
|
||||
fi:
|
||||
units:
|
||||
alphanumeric:
|
||||
default: *bostad
|
||||
probability: 1.0
|
||||
alternatives: []
|
||||
|
||||
add_direction: false
|
||||
add_direction_numeric: false
|
||||
add_direction_standalone: false
|
||||
|
||||
use_floor_probability: 0.1
|
||||
|
||||
use_floor_affix_unit_num_digits: 0
|
||||
|
||||
use_floor_ground_starts_at: 1
|
||||
use_floor_floor_num_digits: 2
|
||||
503
resources/addresses/tr.yaml
Normal file
503
resources/addresses/tr.yaml
Normal file
@@ -0,0 +1,503 @@
|
||||
# tr.yaml
|
||||
# -------
|
||||
# Turkish language specification
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.9
|
||||
alphanumeric_probability: 0.1
|
||||
|
||||
staircase:
|
||||
null_probability: 0.99
|
||||
alphanumeric_probability: 0.01
|
||||
|
||||
entrance:
|
||||
null_probability: 0.999
|
||||
alphanumeric_probability: 0.001
|
||||
|
||||
unit:
|
||||
null_probability: 0.7
|
||||
alphanumeric_probability: 0.3
|
||||
|
||||
combinations:
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- staircase
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- level
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.1
|
||||
# For unit types like 2/34
|
||||
-
|
||||
components:
|
||||
- house_number
|
||||
- unit
|
||||
label: house_number
|
||||
separators:
|
||||
- separator: "/"
|
||||
probability: 0.95
|
||||
- separator: "-"
|
||||
probability: 0.05
|
||||
probability: 0.005
|
||||
|
||||
|
||||
numbers:
|
||||
|
||||
default: &numara
|
||||
canonical: numara
|
||||
abbreviated: "no:"
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.6
|
||||
sample_probability: 0.1
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_affix:
|
||||
affix: "no:"
|
||||
whitespace_probability: 0.6
|
||||
direction: left
|
||||
numeric_probability: 0.4
|
||||
numeric_affix_probability: 0.6
|
||||
|
||||
alphanumeric_phrase_probability: 0.05
|
||||
no_number_probability: 0.05
|
||||
|
||||
|
||||
and:
|
||||
default: &ve
|
||||
canonical: ve
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
|
||||
|
||||
cross_streets:
|
||||
ve: *ve
|
||||
corner_of: &kose
|
||||
canonical: köşe
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
kosesinde: &kosesinde
|
||||
canonical: köşesinde
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
intersection:
|
||||
default: *ve
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *kose
|
||||
probability: 0.1
|
||||
- alternative: *kosesinde
|
||||
probability: 0.1
|
||||
|
||||
arasinda: &arasinda
|
||||
canonical: arasında
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
parentheses_probability: 0.5
|
||||
between:
|
||||
default: *arasinda
|
||||
|
||||
levels:
|
||||
kat: &kat
|
||||
canonical: kat
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.9
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
roman_numeral_probability: 0.7
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
numeric_probability: 0.4
|
||||
ordinal_probability: 0.6
|
||||
|
||||
zemin_kat: &zemin_kat
|
||||
canonical: zemin kat
|
||||
abbreviated: zk
|
||||
sample: true
|
||||
canonical_probability: 0.3
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.3
|
||||
asma_kat: &asma_kat
|
||||
canonical: asma kat
|
||||
half_floors: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
sample: true
|
||||
# e.g. asma kat 2
|
||||
numeric:
|
||||
direction: left
|
||||
# e.g. 2. asma kat
|
||||
ordinal:
|
||||
direction: right
|
||||
numeric_probability: 0.1
|
||||
ordinal_probability: 0.2
|
||||
standalone_probability: 0.7
|
||||
bodrum: &bodrum
|
||||
canonical: bodrum
|
||||
sample: true
|
||||
canonical_probability: 0.7
|
||||
sample_probability: 0.3
|
||||
# e.g. bodrum 1
|
||||
numeric:
|
||||
direction: left
|
||||
direction_probability: 0.8
|
||||
# e.g. 1. bodrum
|
||||
ordinal:
|
||||
direction: right
|
||||
digits:
|
||||
ascii_probability: 0.7
|
||||
roman_numeral_probability: 0.3
|
||||
standalone_probability: 0.99
|
||||
number_abs_value: true
|
||||
number_min_abs_value: 1
|
||||
numeric_probability: 0.005
|
||||
ordinal_probability: 0.005
|
||||
|
||||
aliases:
|
||||
"<-1":
|
||||
default: *bodrum
|
||||
"-1":
|
||||
default: *bodrum
|
||||
# Special token for half-floors
|
||||
half_floors:
|
||||
default: *asma_kat
|
||||
"0":
|
||||
default: *zemin_kat
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *kat
|
||||
probability: 0.1
|
||||
|
||||
numbering_starts_at: 0
|
||||
|
||||
alphanumeric:
|
||||
default: *kat
|
||||
numeric_probability: 0.99 # With this probability, pick an integer
|
||||
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
|
||||
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
|
||||
alpha_plus_numeric_probability: 0.0001 # e.g. A2
|
||||
|
||||
|
||||
directions:
|
||||
right: &sag
|
||||
canonical: sağ
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
left: &sol
|
||||
canonical: sol
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: right
|
||||
alternatives:
|
||||
- alternative: *sag
|
||||
probability: 0.5
|
||||
- alternative: *sol
|
||||
probability: 0.5
|
||||
|
||||
cardinal_directions:
|
||||
east: &dogu
|
||||
canonical: doğu
|
||||
abbreviated: d
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: d
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
west: &bati
|
||||
canonical: batı
|
||||
abbreviated: b
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: b
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
north: &kuzey
|
||||
canonical: kuzey
|
||||
abbreviated: k
|
||||
canonical_probability: 0.95
|
||||
abbreviated_probability: 0.05
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: k
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
south: &guney
|
||||
canonical: güney
|
||||
abbreviated: g
|
||||
sample: true
|
||||
canonical_probability: 0.75
|
||||
abbreviated_probability: 0.1
|
||||
sample_probability: 0.15
|
||||
numeric:
|
||||
direction: right
|
||||
numeric_affix:
|
||||
affix: g
|
||||
direction: right
|
||||
numeric_probability: 0.5
|
||||
numeric_affix_probability: 0.5
|
||||
|
||||
alternatives:
|
||||
- alternative: *kuzey
|
||||
probability: 0.25
|
||||
- alternative: *dogu
|
||||
probability: 0.25
|
||||
- alternative: *guney
|
||||
probability: 0.25
|
||||
- alternative: *bati
|
||||
probability: 0.25
|
||||
|
||||
entrances:
|
||||
giris: &giris
|
||||
canonical: giriş
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
# giriş 1, giriş A, etc.
|
||||
alphanumeric: &entrance_alphanumeric
|
||||
default: *giris
|
||||
numeric_probability: 0.1 # e.g. giriş 1
|
||||
alpha_probability: 0.85 # e.g. giriş A
|
||||
numeric_plus_alpha_probability: 0.025 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.025 # e.g. A1
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
|
||||
staircases:
|
||||
merdiven: &merdiven
|
||||
canonical: merdiven
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
|
||||
|
||||
alphanumeric: &staircase_alphanumeric
|
||||
default: *merdiven
|
||||
numeric_probability: 0.75
|
||||
alpha_probability: 0.2
|
||||
numeric_plus_alpha_probability: 0.025
|
||||
alpha_plus_numeric_probability: 0.025
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
directional:
|
||||
direction: right
|
||||
direction_probability: 0.85
|
||||
modifier:
|
||||
alternatives:
|
||||
- alternative: *sag
|
||||
probability: 0.2
|
||||
- alternative: *sol
|
||||
probability: 0.2
|
||||
- alternative: *kuzey
|
||||
probability: 0.15
|
||||
- alternative: *guney
|
||||
probability: 0.15
|
||||
- alternative: *dogu
|
||||
probability: 0.15
|
||||
- alternative: *bati
|
||||
probability: 0.15
|
||||
|
||||
po_boxes:
|
||||
posta_kutusu: &posta_kutusu
|
||||
canonical: posta kutusu
|
||||
abbreviated: pk
|
||||
sample: true
|
||||
canonical_probability: 0.2
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.2
|
||||
|
||||
alphanumeric:
|
||||
default: *posta_kutusu
|
||||
numeric_probability: 0.9 # pk 123
|
||||
alpha_probability: 0.05 # pk A
|
||||
numeric_plus_alpha_probability: 0.04 # pk 123G
|
||||
alpha_plus_numeric_probability: 0.01 # pk A123
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
|
||||
units:
|
||||
daire: &daire
|
||||
canonical: daire
|
||||
abbreviated: d
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.4
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
apartman: &apartman
|
||||
canonical: apartman
|
||||
abbreviated: apt
|
||||
sample: true
|
||||
canonical_probability: 0.4
|
||||
abbreviated_probability: 0.2
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
oda: &oda
|
||||
canonical: oda
|
||||
sample: true
|
||||
canonical_probability: 0.8
|
||||
sample_probability: 0.2
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
ofis: &ofis
|
||||
canonical: ofis
|
||||
sample: true
|
||||
canonical_probability: 0.6
|
||||
sample_probability: 0.4
|
||||
numeric:
|
||||
direction: left
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.1
|
||||
|
||||
alphanumeric: &unit_alphanumeric
|
||||
default: *daire
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *apartman
|
||||
probability: 0.3
|
||||
- alternative: *oda
|
||||
probability: 0.1
|
||||
numeric_probability: 0.9 # e.g. d. 1
|
||||
numeric_plus_alpha_probability: 0.03 # e.g. 1A
|
||||
alpha_plus_numeric_probability: 0.03 # e.g. A1
|
||||
alpha_probability: 0.04 # e.g. daire A
|
||||
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
|
||||
# If there are 10 floors, create unit numbers like #301 or #1032
|
||||
use_floor_probability: 0.05
|
||||
|
||||
zones:
|
||||
commercial: &commercial_unit_types
|
||||
default: *oda
|
||||
probability: 0.6
|
||||
alternatives:
|
||||
- alternative: *ofis
|
||||
probability: 0.4
|
||||
numeric_probability: 0.95 # e.g. oda 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. oda 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. oda A1
|
||||
alpha_probability: 0.03 # e.g. oda A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
university:
|
||||
default: *oda
|
||||
numeric_probability: 0.95 # e.g. oda 1
|
||||
numeric_plus_alpha_probability: 0.01 # e.g. oda 1A
|
||||
alpha_plus_numeric_probability: 0.01 # e.g. oda A1
|
||||
alpha_probability: 0.03 # e.g. oda A
|
||||
alpha_plus_numeric:
|
||||
whitespace_probability: 0.1
|
||||
numeric_plus_alpha:
|
||||
whitespace_probability: 0.1
|
||||
1001
resources/addresses/uk.yaml
Normal file
1001
resources/addresses/uk.yaml
Normal file
File diff suppressed because it is too large
Load Diff
292
resources/addresses/zh.yaml
Normal file
292
resources/addresses/zh.yaml
Normal file
@@ -0,0 +1,292 @@
|
||||
# zh.yaml
|
||||
# -------
|
||||
# Chinese language specification (default is mainland China; Hong Kong, Macau and Taiwan overrides below)
|
||||
|
||||
whitespace: false
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.15
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
numbers:
|
||||
default: &hao
|
||||
canonical: 号
|
||||
numeric_affix:
|
||||
affix: 号
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: &hao_traditional
|
||||
canonical: 號
|
||||
numeric_affix:
|
||||
affix: 號
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
probability: 0.2
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *hao
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *hao_traditional
|
||||
probability: 0.2
|
||||
alphanumeric_phrase_probability: 0.6
|
||||
|
||||
levels:
|
||||
lou: &lou
|
||||
canonical: 楼
|
||||
numeric_affix:
|
||||
affix: 楼
|
||||
direction: right
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.5
|
||||
digits:
|
||||
ascii_probability: 0.6
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.3
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
lou_traditional: &lou_traditional
|
||||
canonical: 樓
|
||||
numeric_affix:
|
||||
affix: 樓
|
||||
direction: right
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.5
|
||||
digits:
|
||||
ascii_probability: 0.6
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.3
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
ceng: &ceng
|
||||
canonical: 层
|
||||
numeric_affix:
|
||||
affix: 层
|
||||
direction: right
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.5
|
||||
digits:
|
||||
ascii_probability: 0.6
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.3
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
ceng_traditional: &ceng_traditional
|
||||
canonical: 層
|
||||
numeric_affix:
|
||||
affix: 層
|
||||
direction: right
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.5
|
||||
digits:
|
||||
ascii_probability: 0.6
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.3
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *lou
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *lou_traditional
|
||||
probability: 0.05
|
||||
- alternative: *ceng
|
||||
probability: 0.08
|
||||
- alternative: *ceng_traditional
|
||||
probability: 0.02
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
youzheng_xinxiang: &youzheng_xinxiang
|
||||
canonical: 邮政信箱
|
||||
numeric_affix:
|
||||
affix: 邮政信箱
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
use_number_phrase: true
|
||||
use_number_phrase_probability: 0.8
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
youzheng_xinxiang_traditional: &youzheng_xinxiang_traditional
|
||||
canonical: 郵政信箱
|
||||
numeric_affix:
|
||||
affix: 郵政信箱
|
||||
direction: left
|
||||
digits:
|
||||
ascii_probability: 0.3
|
||||
unicode_full_width_probability: 0.5
|
||||
spellout_probability: 0.2
|
||||
use_number_phrase: true
|
||||
use_number_phrase_probability: 0.8
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
|
||||
alphanumeric:
|
||||
default: *youzheng_xinxiang
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: *youzheng_xinxiang_traditional
|
||||
probability: 0.1
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default: &youbian
|
||||
canonical: 邮编
|
||||
numeric_affix:
|
||||
affix: 邮编
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.9
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 0.1
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: &youbian_traditional
|
||||
canonical: 郵編
|
||||
numeric_affix:
|
||||
affix: 郵編
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.9
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 0.1
|
||||
probability: 0.1
|
||||
|
||||
units:
|
||||
shi: &shi
|
||||
canonical: 室
|
||||
numeric_affix:
|
||||
affix: 室
|
||||
direction: right
|
||||
add_number_phrase: true
|
||||
add_number_phrase_probability: 0.5
|
||||
digits:
|
||||
ascii_probability: 0.6
|
||||
unicode_full_width_probability: 0.1
|
||||
spellout_probability: 0.3
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *shi
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
|
||||
|
||||
countries:
|
||||
# Hong Kong
|
||||
hk:
|
||||
components:
|
||||
# Floor number a little more common in Hong Kong than mainland China
|
||||
level:
|
||||
null_probability: 0.75
|
||||
alphanumeric_probability: 0.25
|
||||
|
||||
numbers: &numbers_prefer_traditional
|
||||
default: *hao_traditional
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *hao
|
||||
probability: 0.3
|
||||
|
||||
house_numbers: &house_number_prefer_traditional
|
||||
alphanumeric:
|
||||
default: *hao_traditional
|
||||
probability: 0.7
|
||||
alternatives:
|
||||
- alternative: *hao
|
||||
probability: 0.3
|
||||
alphanumeric_phrase_probability: 0.6
|
||||
|
||||
levels: &levels_prefer_traditional
|
||||
alphanumeric:
|
||||
default: *lou_traditional
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *lou
|
||||
probability: 0.15
|
||||
- alternative: *ceng_traditional
|
||||
probability: 0.06
|
||||
- alternative: *ceng
|
||||
probability: 0.04
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes: &po_boxes_prefer_traditional
|
||||
alphanumeric:
|
||||
default: *youzheng_xinxiang_traditional
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *youzheng_xinxiang
|
||||
probability: 0.25
|
||||
numeric_probability: 1.0
|
||||
|
||||
|
||||
postcodes: &postcodes_prefer_traditional
|
||||
alphanumeric:
|
||||
default: *youbian_traditional
|
||||
probability: 0.75
|
||||
alternatives:
|
||||
- alternative: *youbian
|
||||
probability: 0.25
|
||||
|
||||
# Macau
|
||||
mo:
|
||||
numbers: *numbers_prefer_traditional
|
||||
house_numbers: *house_number_prefer_traditional
|
||||
levels: *levels_prefer_traditional
|
||||
po_boxes: *po_boxes_prefer_traditional
|
||||
postcodes: *postcodes_prefer_traditional
|
||||
|
||||
units:
|
||||
alphanumeric_probability:
|
||||
numeric_probability: 0.9
|
||||
alpha_probability: 0.1
|
||||
|
||||
|
||||
# Taiwan
|
||||
tw:
|
||||
numbers: *numbers_prefer_traditional
|
||||
house_numbers: *house_number_prefer_traditional
|
||||
levels: *levels_prefer_traditional
|
||||
po_boxes: *po_boxes_prefer_traditional
|
||||
postcodes: *postcodes_prefer_traditional
|
||||
|
||||
units:
|
||||
alphanumeric_probability:
|
||||
numeric_probability: 0.9
|
||||
alpha_probability: 0.1
|
||||
153
resources/addresses/zh_pinyin.yaml
Normal file
153
resources/addresses/zh_pinyin.yaml
Normal file
@@ -0,0 +1,153 @@
|
||||
# zh_pinyin.yaml
|
||||
# --------------
|
||||
# Chinese (Pinyin)
|
||||
|
||||
whitespace: false
|
||||
|
||||
components:
|
||||
level:
|
||||
null_probability: 0.85 # Probability of doing nothing if no floor number is specified
|
||||
alphanumeric_probability: 0.15
|
||||
|
||||
unit:
|
||||
# If no unit number is specified
|
||||
null_probability: 0.6
|
||||
alphanumeric_probability: 0.4
|
||||
|
||||
numbers:
|
||||
default: &hao
|
||||
canonical: hao
|
||||
numeric_affix:
|
||||
affix: -hao
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
house_numbers:
|
||||
alphanumeric:
|
||||
default: *hao
|
||||
alphanumeric_phrase_probability: 0.6
|
||||
|
||||
levels:
|
||||
lou: &lou
|
||||
canonical: lóu
|
||||
numeric_affix:
|
||||
affix: -lóu
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
lou_no_accent: &lou_no_accent
|
||||
canonical: lou
|
||||
numeric_affix:
|
||||
affix: -lou
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
ceng: &ceng
|
||||
canonical: céng
|
||||
numeric_affix:
|
||||
affix: -céng
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
ceng_no_accent: &ceng_no_accent
|
||||
canonical: ceng
|
||||
numeric_affix:
|
||||
affix: -ceng
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
numbering_starts_at: 1
|
||||
|
||||
alphanumeric:
|
||||
default: *lou
|
||||
probability: 0.85
|
||||
alternatives:
|
||||
- alternative: *lou_no_accent
|
||||
probability: 0.05
|
||||
- alternative: *ceng
|
||||
probability: 0.08
|
||||
- alternative: *ceng_no_accent
|
||||
probability: 0.02
|
||||
numeric_probability: 1.0
|
||||
|
||||
po_boxes:
|
||||
youzheng_xinxiang: &youzheng_xinxiang
|
||||
canonical: youzheng xinxiang
|
||||
numeric:
|
||||
direction: left
|
||||
numeric_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *youzheng_xinxiang
|
||||
numeric_probability: 1.0
|
||||
|
||||
digits:
|
||||
- length: 1
|
||||
probability: 0.05
|
||||
- length: 2
|
||||
probability: 0.1
|
||||
- length: 3
|
||||
probability: 0.2
|
||||
- length: 4
|
||||
probability: 0.5
|
||||
- length: 5
|
||||
probability: 0.1
|
||||
- length: 6
|
||||
probability: 0.05
|
||||
|
||||
postcodes:
|
||||
alphanumeric:
|
||||
default: &youbian
|
||||
canonical: yóubiān
|
||||
numeric:
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.9
|
||||
numeric_probability: 0.1
|
||||
probability: 0.9
|
||||
alternatives:
|
||||
- alternative: &youbian_no_accent
|
||||
canonical: youbian
|
||||
numeric:
|
||||
direction: left
|
||||
# null_probability means the chance of doing nothing e.g. just the postal code
|
||||
null_probability: 0.9
|
||||
numeric_probability: 0.1
|
||||
probability: 0.1
|
||||
|
||||
units:
|
||||
shi: &shi
|
||||
canonical: shì
|
||||
numeric_affix:
|
||||
affix: -shì
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
shi_no_accent: &shi_no_accent
|
||||
canonical: shi
|
||||
numeric_affix:
|
||||
affix: -shi
|
||||
upper_case: false
|
||||
direction: right
|
||||
numeric_probability: 0.0
|
||||
numeric_affix_probability: 1.0
|
||||
|
||||
alphanumeric:
|
||||
default: *shi
|
||||
probability: 0.8
|
||||
alternatives:
|
||||
- alternative: *shi_no_accent
|
||||
probability: 0.2
|
||||
numeric_probability: 1.0
|
||||
use_positive_numbers_probability: 1.0
|
||||
# If we have a floor number (from building:levels), use it
|
||||
use_floor_probability: 0.8
|
||||
2
resources/boundaries/geonames/ad.yaml
Normal file
2
resources/boundaries/geonames/ad.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
3
resources/boundaries/geonames/ar.yaml
Normal file
3
resources/boundaries/geonames/ar.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/at.yaml
Normal file
3
resources/boundaries/geonames/at.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
3
resources/boundaries/geonames/au.yaml
Normal file
3
resources/boundaries/geonames/au.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
3
resources/boundaries/geonames/ax.yaml
Normal file
3
resources/boundaries/geonames/ax.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/bd.yaml
Normal file
3
resources/boundaries/geonames/bd.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# unclear what admin2 is, maybe city
|
||||
3
resources/boundaries/geonames/be.yaml
Normal file
3
resources/boundaries/geonames/be.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/bg.yaml
Normal file
3
resources/boundaries/geonames/bg.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/br.yaml
Normal file
3
resources/boundaries/geonames/br.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/ca.yaml
Normal file
3
resources/boundaries/geonames/ca.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/ch.yaml
Normal file
3
resources/boundaries/geonames/ch.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
4
resources/boundaries/geonames/cz.yaml
Normal file
4
resources/boundaries/geonames/cz.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
admin_codes:
|
||||
# The GeoNames admin1 boundaries are admin_level=5 or 6 in OSM
|
||||
# However, they do appear to be states, might need to update Czech OSM config
|
||||
admin1: state_district
|
||||
3
resources/boundaries/geonames/de.yaml
Normal file
3
resources/boundaries/geonames/de.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/dk.yaml
Normal file
3
resources/boundaries/geonames/dk.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of city and island, need to list specifically
|
||||
3
resources/boundaries/geonames/do.yaml
Normal file
3
resources/boundaries/geonames/do.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/dz.yaml
Normal file
3
resources/boundaries/geonames/dz.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/es.yaml
Normal file
3
resources/boundaries/geonames/es.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
5
resources/boundaries/geonames/fi.yaml
Normal file
5
resources/boundaries/geonames/fi.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
admin_codes:
|
||||
# The GeoNames admin1 boundaries are admin_level=6 in OSM
|
||||
# However, they do appear to be states, might need to update Finnish OSM config
|
||||
admin1: state_district
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/fo.yaml
Normal file
3
resources/boundaries/geonames/fo.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/fr.yaml
Normal file
3
resources/boundaries/geonames/fr.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/gb.yaml
Normal file
3
resources/boundaries/geonames/gb.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/gt.yaml
Normal file
3
resources/boundaries/geonames/gt.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
2
resources/boundaries/geonames/gu.yaml
Normal file
2
resources/boundaries/geonames/gu.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
3
resources/boundaries/geonames/hr.yaml
Normal file
3
resources/boundaries/geonames/hr.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of city and city_district, need to list specifically
|
||||
4
resources/boundaries/geonames/hu.yaml
Normal file
4
resources/boundaries/geonames/hu.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
admin_codes:
|
||||
# The GeoNames admin1 boundaries are admin_level=6 in OSM
|
||||
# However, they do appear to be states, might need to update Hungary OSM config
|
||||
admin1: state_district
|
||||
3
resources/boundaries/geonames/ie.yaml
Normal file
3
resources/boundaries/geonames/ie.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
2
resources/boundaries/geonames/im.yaml
Normal file
2
resources/boundaries/geonames/im.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
3
resources/boundaries/geonames/in.yaml
Normal file
3
resources/boundaries/geonames/in.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/is.yaml
Normal file
3
resources/boundaries/geonames/is.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/it.yaml
Normal file
3
resources/boundaries/geonames/it.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
2
resources/boundaries/geonames/je.yaml
Normal file
2
resources/boundaries/geonames/je.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
3
resources/boundaries/geonames/jp.yaml
Normal file
3
resources/boundaries/geonames/jp.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
2
resources/boundaries/geonames/li.yaml
Normal file
2
resources/boundaries/geonames/li.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
3
resources/boundaries/geonames/lk.yaml
Normal file
3
resources/boundaries/geonames/lk.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/lt.yaml
Normal file
3
resources/boundaries/geonames/lt.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
4
resources/boundaries/geonames/lu.yaml
Normal file
4
resources/boundaries/geonames/lu.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
admin_codes:
|
||||
# The admin1 names don't appear to exist in OSM, but would be states otherwise
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
2
resources/boundaries/geonames/md.yaml
Normal file
2
resources/boundaries/geonames/md.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: state_district
|
||||
2
resources/boundaries/geonames/mp.yaml
Normal file
2
resources/boundaries/geonames/mp.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: state_district
|
||||
2
resources/boundaries/geonames/mt.yaml
Normal file
2
resources/boundaries/geonames/mt.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
3
resources/boundaries/geonames/mx.yaml
Normal file
3
resources/boundaries/geonames/mx.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/my.yaml
Normal file
3
resources/boundaries/geonames/my.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/nl.yaml
Normal file
3
resources/boundaries/geonames/nl.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/no.yaml
Normal file
3
resources/boundaries/geonames/no.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/nz.yaml
Normal file
3
resources/boundaries/geonames/nz.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/ph.yaml
Normal file
3
resources/boundaries/geonames/ph.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: country_region
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
3
resources/boundaries/geonames/pk.yaml
Normal file
3
resources/boundaries/geonames/pk.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/pl.yaml
Normal file
3
resources/boundaries/geonames/pl.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
6
resources/boundaries/geonames/pr.yaml
Normal file
6
resources/boundaries/geonames/pr.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
admin_codes:
|
||||
admin1: state_district
|
||||
# The notion of a "barrio" in the official sense in PR is not quite a
|
||||
# municipality, and has no current official purpose, but might be useful
|
||||
# to have the name + "barrio" version available in libpostal
|
||||
admin2: city
|
||||
8
resources/boundaries/geonames/pt.yaml
Normal file
8
resources/boundaries/geonames/pt.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
admin_codes:
|
||||
admin1: state_district
|
||||
admin2: city
|
||||
|
||||
overrides:
|
||||
id:
|
||||
"2593105": "state" # Madeira
|
||||
"3411865": "state" # Azores
|
||||
4
resources/boundaries/geonames/ro.yaml
Normal file
4
resources/boundaries/geonames/ro.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# These are mostly admin_level=6, which maps to city in OSM
|
||||
admin2: city
|
||||
3
resources/boundaries/geonames/ru.yaml
Normal file
3
resources/boundaries/geonames/ru.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: state_district
|
||||
3
resources/boundaries/geonames/se.yaml
Normal file
3
resources/boundaries/geonames/se.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
admin2: city
|
||||
22
resources/boundaries/geonames/si.yaml
Normal file
22
resources/boundaries/geonames/si.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
|
||||
overrides:
|
||||
id:
|
||||
# Districts of Ljubljana (suburbs in OSM)
|
||||
"3196350": "suburb" # Opština Ljubljana-Vič-Rudnik
|
||||
"3196352": "suburb" # Opština [historical] Ljubljana-Šiška
|
||||
"3196355": "suburb" # Opština Ljubljana-Moste-Polje
|
||||
"3196356": "suburb" # Opština Ljubljana-Center
|
||||
"3196357": "suburb" # Opčina Ljubljana-Bežigrad
|
||||
"9794374": "suburb" # Črnuče District
|
||||
"9794375": "suburb" # Dravlje District
|
||||
"9794376": "suburb" # Golovec District
|
||||
"9794377": "suburb" # Jarše District
|
||||
"9794378": "suburb" # Posavje District
|
||||
"9794379": "suburb" # Rožnik District
|
||||
"9794380": "suburb" # Sostro District
|
||||
"9794381": "suburb" # Šentvid District
|
||||
"9794382": "suburb" # Šmarna Gora District
|
||||
"9794384": "suburb" # Trnovo District
|
||||
"9794386": "suburb" # Vič District
|
||||
17
resources/boundaries/geonames/sk.yaml
Normal file
17
resources/boundaries/geonames/sk.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
# admin2 is a mix of state_district and city, need to list specifically
|
||||
admin2: state_district
|
||||
overrides:
|
||||
id:
|
||||
# Districts of Bratislava
|
||||
"8986283": "city_district" # Okres Bratislava I
|
||||
"8986339": "city_district" # Okres Bratislava II
|
||||
"8986340": "city_district" # Okres Bratislava III
|
||||
"8986341": "city_district" # Okres Bratislava IV
|
||||
"8986342": "city_district" # Okres Bratislava V
|
||||
# Districts of Košice
|
||||
"8986335": "city_district" # Košice I
|
||||
"8986336": "city_district" # Košice II
|
||||
"8986337": "city_district" # Košice III
|
||||
"8986338": "city_district" # Košice IV
|
||||
2
resources/boundaries/geonames/sm.yaml
Normal file
2
resources/boundaries/geonames/sm.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
admin_codes:
|
||||
admin1: city
|
||||
8
resources/boundaries/geonames/th.yaml
Normal file
8
resources/boundaries/geonames/th.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
admin_codes:
|
||||
admin1: state
|
||||
|
||||
overrides:
|
||||
id:
|
||||
# Bangkok the state is treated as a city
|
||||
# Note: we do this in OSM to get the boundary, so duplicate in GeoNames
|
||||
"1609348": "city"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user