[languages] Non-default languages can still be labeled from >1-char abbreviations if there is no evidence of other languages in the string. Also adds a Python version of get_string_script from the C lib.
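In effect, the change lets a phrase that is only a non-default language's abbreviation decide the label, provided it is longer than one character and no other language's phrases appear in the string. A minimal self-contained sketch of that fallback rule (the helper and its inputs are illustrative, not the gazetteer's actual API):

    # Hypothetical sketch of the fallback rule added in this commit.
    # matches: (lang, is_canonical, surface_form) tuples for gazetteer hits.
    def pick_language(matches, default_langs):
        seen = set()
        possible_lang = None
        for lang, is_canonical, surface in matches:
            if is_canonical or lang in default_langs:
                seen.add(lang)
            elif not seen and len(surface) > 1:
                # all fallback candidates must agree on a single language
                possible_lang = lang if possible_lang in (None, lang) else None
        if len(seen) == 1:
            return seen.pop()
        return possible_lang or 'unk'

    # u'vn.' is a >1-char Norwegian abbreviation; with no competing evidence
    # the string can still be labeled 'nb' even though 'nb' is not the default.
    print(pick_language([(u'nb', False, u'vn.')], default_langs={u'en'}))  # -> nb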
@@ -9,7 +9,5 @@ gaten
 lia
 park
 plassen|pl.
-vegen
-veinen
 stredet
 svingen
@@ -1,4 +1,6 @@
 allè|alle
 allèen|alleen
 veg|v.
+vegen|v.|vn.
 vei|v.
+veien|v.|vn.
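The dictionary entries above are pipe-delimited: the canonical street-type phrase comes first, followed by its accepted abbreviations (this format is inferred from the entries themselves, not from separate documentation):

    # Parsing one of the entries added above; format inferred from the data.
    line = u'vegen|v.|vn.'
    parts = line.split(u'|')
    canonical, abbreviations = parts[0], parts[1:]
    assert canonical == u'vegen'              # "the road" in Norwegian
    assert abbreviations == [u'v.', u'vn.']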
@@ -332,11 +332,15 @@ def extract_language_scripts(xml):
     return language_scripts
 
 
-def get_script_languages(script_codes):
+def get_script_languages():
     # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
     # to identify the language. We keep track of those single language scripts to inform
     # the language classifier
 
+    chars = get_chars_by_script()
+    all_scripts = build_master_scripts_list(chars)
+    script_codes = get_script_codes(all_scripts)
+
     cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
     cldr_xml = etree.parse(cldr_supplemental_data)
     language_scripts = extract_language_scripts(cldr_xml)
@@ -364,6 +368,9 @@ def get_script_languages(script_codes):
         langs = script_code_languages.get(script_code, [])
         script_languages[script_name].extend(langs)
 
+    for name in all_scripts.iterkeys():
+        script_languages.setdefault(name, [])
+
     return script_languages
 
 
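For reference, a sketch of the kind of mapping get_script_languages() returns, from script name to candidate language codes; the concrete values here are assumptions for illustration:

    # Assumed shape of the result: script name -> candidate languages.
    script_languages = {
        'Greek': ['el'],              # single-language script: decisive on its own
        'Thai': ['th'],
        'Latin': ['en', 'fr', 'de'],  # shared script (list truncated): not decisive
    }

    # A script with exactly one candidate language identifies the language outright.
    unambiguous = set(s for s, langs in script_languages.items() if len(langs) == 1)
    assert unambiguous == set(['Greek', 'Thai'])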
@@ -383,11 +390,7 @@ def main(out_dir):
     if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
         download_cldr()
 
-    chars = get_chars_by_script()
-    all_scripts = build_master_scripts_list(chars)
-    script_codes = get_script_codes(all_scripts)
-
-    script_languages = get_script_languages(script_codes)
+    script_languages = get_script_languages()
 
     max_langs = 0
 
@@ -396,9 +399,6 @@ def main(out_dir):
         if num_langs > max_langs:
             max_langs = num_langs
 
-    for name in all_scripts.iterkeys():
-        script_languages.setdefault(name, [])
-
     # Generate C header and constants
 
     script_enum = u'''
@@ -13,6 +13,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, '
 from geodata.encoding import safe_decode
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.i18n.normalize import strip_accents
+from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
 from address_normalizer.text.normalize import PhraseFilter
 from address_normalizer.text.tokenize import *
 
@@ -103,7 +104,7 @@ class DictionaryPhraseFilter(PhraseFilter):
             token_len = len(token)
             suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
 
-            if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):]):
+            if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):].rstrip('.')):
                 yield (token_types.PHRASE, [(c,) + t], suffix_search)
                 continue
             prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
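The .rstrip('.') change lets abbreviation suffixes that end in a period match trie entries stored without one. A toy reproduction with a plain dict standing in for the trie (the key storage convention is an assumption):

    # Toy reproduction of the fix; a dict stands in for self.trie, and keys
    # are assumed to be stored without trailing periods.
    trie = {u'vn': u'veien'}

    token = u'karlsvn.'
    suffix = token[len(token) - len(u'vn.'):]          # u'vn.'
    assert trie.get(suffix) is None                    # before: trailing '.' misses
    assert trie.get(suffix.rstrip(u'.')) == u'veien'   # after: lookup succeeds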
@@ -119,31 +120,62 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
                                                 'concatenated_prefixes_separable.txt',
                                                 'stopwords.txt',)
 
+char_scripts = get_chars_by_script()
+script_languages = get_script_languages()
+
+UNKNOWN_SCRIPT = 'Unknown'
+COMMON_SCRIPT = 'Common'
+MAX_ASCII = 127
+
+
+def get_string_script(s):
+    s = safe_decode(s)
+    script = last_script = UNKNOWN_SCRIPT
+    is_ascii = True
+    script_len = 0
+    for c in s:
+        script = char_scripts[ord(c)]
+        if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
+            script = last_script
+        if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
+            break
+        is_ascii = is_ascii and ord(c) <= MAX_ASCII
+        script_len += 1
+        if script != UNKNOWN_SCRIPT:
+            last_script = script
+    return (last_script, script_len, is_ascii)
+
+
 UNKNOWN_LANGUAGE = 'unk'
 AMBIGUOUS_LANGUAGE = 'xxx'
 
 
 def disambiguate_language(text, languages):
+    num_defaults = sum((1 for lang, default in languages if default))
     valid_languages = OrderedDict(languages)
     tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower())
 
     current_lang = None
+    possible_lang = None
 
     seen_languages = set()
 
     for c, t, data in street_types_gazetteer.filter(tokens):
 
         if c == token_types.PHRASE:
             valid = []
 
             for d in data:
                 lang, canonical = d.split('|')
                 canonical = int(canonical)
                 if lang not in valid_languages:
                     continue
+                is_default = valid_languages[lang]
 
-                if canonical or not seen_languages:
+                if canonical or is_default:
                     valid.append(lang)
+                elif not seen_languages and len(t[0][1]) > 1:
+                    possible_lang = lang if possible_lang is None or possible_lang == lang else None
 
             if seen_languages and valid and not any((l in seen_languages for l in valid)):
                 return AMBIGUOUS_LANGUAGE
 
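get_string_script scans the string until the script changes, returning the script of the initial run, the run's length, and whether it was pure ASCII. A self-contained toy version with a tiny handmade script table (the real char_scripts covers all of Unicode):

    # Toy version of the same loop; toy_char_script is a stand-in covering
    # only basic Latin, Greek letters, and spaces/digits (treated as Common).
    UNKNOWN_SCRIPT, COMMON_SCRIPT, MAX_ASCII = 'Unknown', 'Common', 127

    def toy_char_script(c):
        if c.isspace() or c.isdigit():
            return COMMON_SCRIPT
        if u'a' <= c.lower() <= u'z':
            return 'Latin'
        if u'\u0391' <= c <= u'\u03ce':   # Greek letters, incl. accented vowels
            return 'Greek'
        return UNKNOWN_SCRIPT

    def get_string_script(s):
        script = last_script = UNKNOWN_SCRIPT
        is_ascii = True
        script_len = 0
        for c in s:
            script = toy_char_script(c)
            if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
                script = last_script
            if last_script != script and last_script not in (UNKNOWN_SCRIPT, COMMON_SCRIPT):
                break
            is_ascii = is_ascii and ord(c) <= MAX_ASCII
            script_len += 1
            if script != UNKNOWN_SCRIPT:
                last_script = script
        return (last_script, script_len, is_ascii)

    print(get_string_script(u'Main St 42'))  # ('Latin', 10, True)
    print(get_string_script(u'Οδός Athens'))  # ('Greek', 5, False): stops where Latin begins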
@@ -160,4 +192,6 @@ def disambiguate_language(text, languages):
 
     if current_lang is not None:
         return current_lang
+    elif possible_lang is not None:
+        return possible_lang
     return UNKNOWN_LANGUAGE