[categories] Keeping keys sorted in generated YAML files, ignoring Interlingua queries
This commit is contained in:
@@ -41,6 +41,12 @@ NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX
|
|||||||
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
|
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
|
||||||
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
|
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
|
||||||
|
|
||||||
|
IGNORE_LANGUAGES = {
|
||||||
|
# Interlingua
|
||||||
|
'ia'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
IGNORE_PLURAL_LANGUAGES = {
|
IGNORE_PLURAL_LANGUAGES = {
|
||||||
# For Japanese, seems to just put an s on the end, which doesn't seem right
|
# For Japanese, seems to just put an s on the end, which doesn't seem right
|
||||||
# Need input from a native speaker on that one
|
# Need input from a native speaker on that one
|
||||||
@@ -82,6 +88,8 @@ def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
lang = entity.rstrip('/').rsplit('/')[-1].lower()
|
lang = entity.rstrip('/').rsplit('/')[-1].lower()
|
||||||
|
if lang in IGNORE_LANGUAGES:
|
||||||
|
continue
|
||||||
|
|
||||||
link = WIKI_BASE_URL + entity.replace(' ', '_')
|
link = WIKI_BASE_URL + entity.replace(' ', '_')
|
||||||
|
|
||||||
@@ -106,15 +114,20 @@ def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
|
|||||||
with open(filename, 'w') as f:
|
with open(filename, 'w') as f:
|
||||||
phrase_data = [
|
phrase_data = [
|
||||||
{
|
{
|
||||||
'phrase': phrase,
|
# Digit prefixes (0-3) fix the key order when the YAML dump sorts keys; they are stripped from the dumped text below
|
||||||
'key': key,
|
'0phrase': safe_decode(phrase),
|
||||||
'value': value,
|
'1key': safe_decode(key),
|
||||||
'is_plural': is_plural
|
'2value': safe_decode(value),
|
||||||
|
'3is_plural': is_plural
|
||||||
}
|
}
|
||||||
for phrase, key, value, is_plural in phrases
|
for phrase, key, value, is_plural in phrases
|
||||||
]
|
]
|
||||||
|
|
||||||
yaml.dump(phrase_data, f, allow_unicode=True, default_flow_style=False)
|
yaml_data = yaml.safe_dump(phrase_data, allow_unicode=True, default_flow_style=False)
|
||||||
|
|
||||||
|
yaml_data = yaml_data.replace('0phrase:', 'phrase:').replace('1key:', 'key:').replace('2value:', 'value:').replace('3is_plural:', 'is_plural:')
|
||||||
|
|
||||||
|
f.write(yaml_data)
|
||||||
|
|
||||||
print('Done')
|
print('Done')
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user