[osm] Including toponyms in the training data for countries where the unqualified place names can be assumed to be examples of a given language

This commit is contained in:
Al
2015-09-04 14:13:26 -04:00
parent 17fcfa8b59
commit df20e2cbc0
2 changed files with 22 additions and 9 deletions

View File

@@ -17,7 +17,21 @@ from geodata.i18n.unicode_properties import get_chars_by_script, get_script_lang
from address_normalizer.text.normalize import PhraseFilter
from address_normalizer.text.tokenize import *
# NOTE(review): the name is assigned twice in a row; the second assignment
# rebinds it, so only the set that includes 'pt' takes effect. This looks like
# the old/new pair of a diff hunk -- confirm before dropping the first line.
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es'])
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
# For toponyms, we want to limit the countries we consider to those where
# the place names can themselves be considered training examples of the language
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
    # Maps each language code to the set of lowercase country codes
    # associated with it for toponym purposes.
    'en': {'gb', 'us', 'ca', 'au', 'nz', 'ie'},
    'fr': {'fr'},
    'it': {'it'},
    'de': {'de', 'at'},
    'nl': {'nl'},
    'es': {
        'es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
        've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
        'ni', 'hn',
    },
    'pt': {'pt', 'br'},
}
# Path of the 'dictionaries' subdirectory under DATA_DIR
DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')