From 122a81b61085b100f01e794d170bd3a0408d9e91 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 23 Aug 2015 02:24:32 -0400 Subject: [PATCH] [languages] non-default languages can still be labeled from > 1 char abbreviations if there's no evidence of other languages in the string. Adding Python version of get_string_script from the C lib --- .../nb/concatenated_suffixes_inseparable.txt | 2 - .../nb/concatenated_suffixes_separable.txt | 2 + scripts/geodata/i18n/unicode_properties.py | 18 ++++----- scripts/geodata/language_id/disambiguation.py | 40 +++++++++++++++++-- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/resources/dictionaries/nb/concatenated_suffixes_inseparable.txt b/resources/dictionaries/nb/concatenated_suffixes_inseparable.txt index 2d410c3e..4ea032c7 100644 --- a/resources/dictionaries/nb/concatenated_suffixes_inseparable.txt +++ b/resources/dictionaries/nb/concatenated_suffixes_inseparable.txt @@ -9,7 +9,5 @@ gaten lia park plassen|pl. -vegen -veinen stredet svingen \ No newline at end of file diff --git a/resources/dictionaries/nb/concatenated_suffixes_separable.txt b/resources/dictionaries/nb/concatenated_suffixes_separable.txt index 68cae08f..a67d5c55 100644 --- a/resources/dictionaries/nb/concatenated_suffixes_separable.txt +++ b/resources/dictionaries/nb/concatenated_suffixes_separable.txt @@ -1,4 +1,6 @@ allè|alle allèen|alleen veg|v. +vegen|v.|vn. vei|v. +veien|v.|vn. \ No newline at end of file diff --git a/scripts/geodata/i18n/unicode_properties.py b/scripts/geodata/i18n/unicode_properties.py index 796a2983..565421af 100644 --- a/scripts/geodata/i18n/unicode_properties.py +++ b/scripts/geodata/i18n/unicode_properties.py @@ -332,11 +332,15 @@ def extract_language_scripts(xml): return language_scripts -def get_script_languages(script_codes): +def get_script_languages(): # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient # to identify the language. We keep track of those single language scripts to inform # the language classifier + chars = get_chars_by_script() + all_scripts = build_master_scripts_list(chars) + script_codes = get_script_codes(all_scripts) + cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA) cldr_xml = etree.parse(cldr_supplemental_data) language_scripts = extract_language_scripts(cldr_xml) @@ -364,6 +368,9 @@ def get_script_languages(script_codes): langs = script_code_languages.get(script_code, []) script_languages[script_name].extend(langs) + for name in all_scripts.iterkeys(): + script_languages.setdefault(name, []) + return script_languages @@ -383,11 +390,7 @@ def main(out_dir): if not os.path.exists(CLDR_SUPPLEMENTAL_DATA): download_cldr() - chars = get_chars_by_script() - all_scripts = build_master_scripts_list(chars) - script_codes = get_script_codes(all_scripts) - - script_languages = get_script_languages(script_codes) + script_languages = get_script_languages() max_langs = 0 @@ -396,9 +399,6 @@ def main(out_dir): if num_langs > max_langs: max_langs = num_langs - for name in all_scripts.iterkeys(): - script_languages.setdefault(name, []) - # Generate C header and constants script_enum = u''' diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index b9eea796..fd6dfff7 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -13,6 +13,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, ' from geodata.encoding import safe_decode from geodata.i18n.unicode_paths import DATA_DIR from geodata.i18n.normalize import strip_accents +from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages from address_normalizer.text.normalize import PhraseFilter from address_normalizer.text.tokenize import * @@ -103,7 +104,7 @@ class DictionaryPhraseFilter(PhraseFilter): token_len = len(token) suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1]) - if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):]): + if suffix_search and self.trie.get(token[token_len - (suffix_len - len(SUFFIX_KEY)):].rstrip('.')): yield (token_types.PHRASE, [(c,) + t], suffix_search) continue prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token) @@ -119,31 +120,62 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt', 'concatenated_prefixes_separable.txt', 'stopwords.txt',) +char_scripts = get_chars_by_script() +script_languages = get_script_languages() + +UNKNOWN_SCRIPT = 'Unknown' +COMMON_SCRIPT = 'Common' +MAX_ASCII = 127 + + +def get_string_script(s): + s = safe_decode(s) + script = last_script = UNKNOWN_SCRIPT + is_ascii = True + script_len = 0 + for c in s: + script = char_scripts[ord(c)] + if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT: + script = last_script + if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT: + break + is_ascii = is_ascii and ord(c) <= MAX_ASCII + script_len += 1 + if script != UNKNOWN_SCRIPT: + last_script = script + return (last_script, script_len, is_ascii) + UNKNOWN_LANGUAGE = 'unk' AMBIGUOUS_LANGUAGE = 'xxx' def disambiguate_language(text, languages): + num_defaults = sum((1 for lang, default in languages if default)) valid_languages = OrderedDict(languages) tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower()) current_lang = None + possible_lang = None seen_languages = set() for c, t, data in street_types_gazetteer.filter(tokens): - if c == token_types.PHRASE: valid = [] + for d in data: lang, canonical = d.split('|') canonical = int(canonical) if lang not in valid_languages: continue + is_default = valid_languages[lang] - if canonical or not seen_languages: + if canonical or is_default: valid.append(lang) + elif not seen_languages and len(t[0][1]) > 1: + possible_lang = lang if possible_lang is None or possible_lang == lang else None + if seen_languages and valid and not any((l in seen_languages for l in valid)): return AMBIGUOUS_LANGUAGE @@ -160,4 +192,6 @@ def disambiguate_language(text, languages): if current_lang is not None: return current_lang + elif possible_lang is not None: + return possible_lang return UNKNOWN_LANGUAGE