[categories] Keeping keys sorted in generated YAML files, ignoring Interlingua queries

This commit is contained in:
Al
2016-03-28 16:38:13 -04:00
parent e65711f6fa
commit 5d19aacb25

View File

@@ -41,6 +41,12 @@ NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I) phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])') wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
# Language codes whose Nominatim special-phrase pages the scraper skips.
IGNORE_LANGUAGES = {
    'ia',  # Interlingua
}
IGNORE_PLURAL_LANGUAGES = { IGNORE_PLURAL_LANGUAGES = {
# For Japanese, seems to just put an s on the end, which doesn't seem right # For Japanese, seems to just put an s on the end, which doesn't seem right
# Need input from a native speaker on that one # Need input from a native speaker on that one
@@ -82,6 +88,8 @@ def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
continue continue
lang = entity.rstrip('/').rsplit('/')[-1].lower() lang = entity.rstrip('/').rsplit('/')[-1].lower()
if lang in IGNORE_LANGUAGES:
continue
link = WIKI_BASE_URL + entity.replace(' ', '_') link = WIKI_BASE_URL + entity.replace(' ', '_')
@@ -106,15 +114,20 @@ def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
with open(filename, 'w') as f: with open(filename, 'w') as f:
phrase_data = [ phrase_data = [
{ {
'phrase': phrase, # For sorting purposes, we'll remove later
'key': key, '0phrase': safe_decode(phrase),
'value': value, '1key': safe_decode(key),
'is_plural': is_plural '2value': safe_decode(value),
'3is_plural': is_plural
} }
for phrase, key, value, is_plural in phrases for phrase, key, value, is_plural in phrases
] ]
yaml.dump(phrase_data, f, allow_unicode=True, default_flow_style=False) yaml_data = yaml.safe_dump(phrase_data, allow_unicode=True, default_flow_style=False)
yaml_data = yaml_data.replace('0phrase:', 'phrase:').replace('1key:', 'key:').replace('2value:', 'value:').replace('3is_plural:', 'is_plural:')
f.write(yaml_data)
print('Done') print('Done')