[languages] non-default languages can still be labeled from abbreviations longer than one character if there's no evidence of other languages in the string. Also adds a Python version of get_string_script from the C library.

This commit is contained in:
Al
2015-08-23 02:24:32 -04:00
parent a419dad630
commit 122a81b610
4 changed files with 48 additions and 14 deletions

View File

@@ -9,7 +9,5 @@ gaten
lia lia
park park
plassen|pl. plassen|pl.
vegen
veinen
stredet stredet
svingen svingen

View File

@@ -1,4 +1,6 @@
allè|alle allè|alle
allèen|alleen allèen|alleen
veg|v. veg|v.
vegen|v.|vn.
vei|v. vei|v.
veien|v.|vn.

View File

@@ -332,11 +332,15 @@ def extract_language_scripts(xml):
return language_scripts return language_scripts
def get_script_languages(script_codes): def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single language scripts to inform # to identify the language. We keep track of those single language scripts to inform
# the language classifier # the language classifier
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA) cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data) cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml) language_scripts = extract_language_scripts(cldr_xml)
@@ -364,6 +368,9 @@ def get_script_languages(script_codes):
langs = script_code_languages.get(script_code, []) langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs) script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages return script_languages
@@ -383,11 +390,7 @@ def main(out_dir):
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA): if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr() download_cldr()
chars = get_chars_by_script() script_languages = get_script_languages()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages(script_codes)
max_langs = 0 max_langs = 0
@@ -396,9 +399,6 @@ def main(out_dir):
if num_langs > max_langs: if num_langs > max_langs:
max_langs = num_langs max_langs = num_langs
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
# Generate C header and constants # Generate C header and constants
script_enum = u''' script_enum = u'''

View File

@@ -13,6 +13,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, '
from geodata.encoding import safe_decode from geodata.encoding import safe_decode
from geodata.i18n.unicode_paths import DATA_DIR from geodata.i18n.unicode_paths import DATA_DIR
from geodata.i18n.normalize import strip_accents from geodata.i18n.normalize import strip_accents
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
from address_normalizer.text.normalize import PhraseFilter from address_normalizer.text.normalize import PhraseFilter
from address_normalizer.text.tokenize import * from address_normalizer.text.tokenize import *
@@ -103,7 +104,7 @@ class DictionaryPhraseFilter(PhraseFilter):
token_len = len(token) token_len = len(token)
suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1]) suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):]): if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):].rstrip('.')):
yield (token_types.PHRASE, [(c,) + t], suffix_search) yield (token_types.PHRASE, [(c,) + t], suffix_search)
continue continue
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token) prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
@@ -119,31 +120,62 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
'concatenated_prefixes_separable.txt', 'concatenated_prefixes_separable.txt',
'stopwords.txt',) 'stopwords.txt',)
# Per-codepoint script lookup table and script -> languages map, built from
# Unicode/CLDR data by project helpers (exact shapes assumed from usage in
# get_string_script below -- char_scripts is indexed by codepoint).
char_scripts = get_chars_by_script()
script_languages = get_script_languages()

# Sentinel script names as they appear in the Unicode script property data.
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
# Highest ASCII codepoint; used to track whether a scanned prefix is pure ASCII.
MAX_ASCII = 127
def get_string_script(s):
    """Return ``(script_name, script_len, is_ascii)`` for the longest prefix
    of *s* written in a single script.

    Python port of ``get_string_script`` from the C library. Scans characters
    until the script changes, then reports:
      - the script of that prefix ('Unknown' if none could be determined),
      - the number of characters the prefix spans,
      - whether every character in the prefix is ASCII.

    Common-script characters (punctuation, digits, spaces) inherit the
    previously seen script rather than terminating the prefix.
    """
    s = safe_decode(s)
    script = last_script = UNKNOWN_SCRIPT
    is_ascii = True
    script_len = 0
    for c in s:
        # char_scripts is indexed by codepoint; assumes the table covers
        # ord(c) -- TODO confirm coverage for astral-plane codepoints.
        script = char_scripts[ord(c)]
        if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
            # Shared characters (punctuation etc.) take on the surrounding script.
            script = last_script
        if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
            # Script changed: stop *before* counting this character, so
            # script_len covers only the single-script prefix.
            break
        is_ascii = is_ascii and ord(c) <= MAX_ASCII
        script_len += 1
        if script != UNKNOWN_SCRIPT:
            # Only a determinate script updates last_script; Unknown never
            # becomes the reported script unless nothing better was seen.
            last_script = script
    return (last_script, script_len, is_ascii)
UNKNOWN_LANGUAGE = 'unk' UNKNOWN_LANGUAGE = 'unk'
AMBIGUOUS_LANGUAGE = 'xxx' AMBIGUOUS_LANGUAGE = 'xxx'
def disambiguate_language(text, languages): def disambiguate_language(text, languages):
num_defaults = sum((1 for lang, default in languages if default))
valid_languages = OrderedDict(languages) valid_languages = OrderedDict(languages)
tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower()) tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower())
current_lang = None current_lang = None
possible_lang = None
seen_languages = set() seen_languages = set()
for c, t, data in street_types_gazetteer.filter(tokens): for c, t, data in street_types_gazetteer.filter(tokens):
if c == token_types.PHRASE: if c == token_types.PHRASE:
valid = [] valid = []
for d in data: for d in data:
lang, canonical = d.split('|') lang, canonical = d.split('|')
canonical = int(canonical) canonical = int(canonical)
if lang not in valid_languages: if lang not in valid_languages:
continue continue
is_default = valid_languages[lang]
if canonical or not seen_languages: if canonical or is_default:
valid.append(lang) valid.append(lang)
elif not seen_languages and len(t[0][1]) > 1:
possible_lang = lang if possible_lang is None or possible_lang == lang else None
if seen_languages and valid and not any((l in seen_languages for l in valid)): if seen_languages and valid and not any((l in seen_languages for l in valid)):
return AMBIGUOUS_LANGUAGE return AMBIGUOUS_LANGUAGE
@@ -160,4 +192,6 @@ def disambiguate_language(text, languages):
if current_lang is not None: if current_lang is not None:
return current_lang return current_lang
elif possible_lang is not None:
return possible_lang
return UNKNOWN_LANGUAGE return UNKNOWN_LANGUAGE