From b2fe9d4db02ee9ee34b52621ecbdeee5f3a8a0a8 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 3 Jun 2015 22:55:45 -0400 Subject: [PATCH] [transliteration] Adding uppercase umlauts and Scandinativan a-ring --- scripts/geodata/i18n/transliteration_rules.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index a8eae57a..85048ced 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -1120,6 +1120,9 @@ html_escapes = {'&{};'.format(name): escape_string(safe_encode(unichr(value))) for name, value in htmlentitydefs.name2codepoint.iteritems() } +# [[:Latin] & [:Ll:]] +latin_lower_set = 'abcdefghijklmnopqrstuvwxyz\\xc2\\xaa\\xc2\\xba\\xc3\\x9f\\xc3\\xa0\\xc3\\xa1\\xc3\\xa2\\xc3\\xa3\\xc3\\xa4\\xc3\\xa5\\xc3\\xa6\\xc3\\xa7\\xc3\\xa8\\xc3\\xa9\\xc3\\xaa\\xc3\\xab\\xc3\\xac\\xc3\\xad\\xc3\\xae\\xc3\\xaf\\xc3\\xb0\\xc3\\xb1\\xc3\\xb2\\xc3\\xb3\\xc3\\xb4\\xc3\\xb5\\xc3\\xb6\\xc3\\xb8\\xc3\\xb9\\xc3\\xba\\xc3\\xbb\\xc3\\xbc\\xc3\\xbd\\xc3\\xbe\\xc3\\xbf\\xc4\\x81\\xc4\\x83\\xc4\\x85\\xc4\\x87\\xc4\\x89\\xc4\\x8b\\xc4\\x8d\\xc4\\x8f\\xc4\\x91\\xc4\\x93\\xc4\\x95\\xc4\\x97\\xc4\\x99\\xc4\\x9b\\xc4\\x9d\\xc4\\x9f\\xc4\\xa1\\xc4\\xa3\\xc4\\xa5\\xc4\\xa7\\xc4\\xa9\\xc4\\xab\\xc4\\xad\\xc4\\xaf\\xc4\\xb1\\xc4\\xb3\\xc4\\xb5\\xc4\\xb7\\xc4\\xb8\\xc4\\xba\\xc4\\xbc\\xc4\\xbe\\xc5\\x80\\xc5\\x82\\xc5\\x84\\xc5\\x86\\xc5\\x88\\xc5\\x89\\xc5\\x8b\\xc5\\x8d\\xc5\\x8f\\xc5\\x91\\xc5\\x93\\xc5\\x95\\xc5\\x97\\xc5\\x99\\xc5\\x9b\\xc5\\x9d\\xc5\\x9f\\xc5\\xa1\\xc5\\xa3\\xc5\\xa5\\xc5\\xa7\\xc5\\xa9\\xc5\\xab\\xc5\\xad\\xc5\\xaf\\xc5\\xb1\\xc5\\xb3\\xc5\\xb5\\xc5\\xb7\\xc5\\xba\\xc5\\xbc\\xc5\\xbe\\xc5\\xbf\\xc6\\x80\\xc6\\x83\\xc6\\x85\\xc6\\x88\\xc6\\x8c\\xc6\\x8d\\xc6\\x92\\xc6\\x95\\xc6\\x99\\xc6\\x9a\\xc6\\x9b\\xc6\\x9e\\xc6\\xa1\\xc6\\xa3\\xc6\\xa5\\xc6\\xa8\\xc6\\xaa\\xc6\\xab\\xc6\\xad\\xc6\\xb0\\xc6\\xb4\\xc6\\xb6\\xc
6\\xb9\\xc6\\xba\\xc6\\xbd\\xc6\\xbe\\xc6\\xbf\\xc7\\x86\\xc7\\x89\\xc7\\x8c\\xc7\\x8e\\xc7\\x90\\xc7\\x92\\xc7\\x94\\xc7\\x96\\xc7\\x98\\xc7\\x9a\\xc7\\x9c\\xc7\\x9d\\xc7\\x9f\\xc7\\xa1\\xc7\\xa3\\xc7\\xa5\\xc7\\xa7\\xc7\\xa9\\xc7\\xab\\xc7\\xad\\xc7\\xaf\\xc7\\xb0\\xc7\\xb3\\xc7\\xb5\\xc7\\xb9\\xc7\\xbb\\xc7\\xbd\\xc7\\xbf\\xc8\\x81\\xc8\\x83\\xc8\\x85\\xc8\\x87\\xc8\\x89\\xc8\\x8b\\xc8\\x8d\\xc8\\x8f\\xc8\\x91\\xc8\\x93\\xc8\\x95\\xc8\\x97\\xc8\\x99\\xc8\\x9b\\xc8\\x9d\\xc8\\x9f\\xc8\\xa1\\xc8\\xa3\\xc8\\xa5\\xc8\\xa7\\xc8\\xa9\\xc8\\xab\\xc8\\xad\\xc8\\xaf\\xc8\\xb1\\xc8\\xb3\\xc8\\xb4\\xc8\\xb5\\xc8\\xb6\\xc8\\xb7\\xc8\\xb8\\xc8\\xb9\\xc8\\xbc\\xc8\\xbf\\xc9\\x80\\xc9\\x82\\xc9\\x87\\xc9\\x89\\xc9\\x8b\\xc9\\x8d\\xc9\\x8f\\xc9\\x90\\xc9\\x91\\xc9\\x92\\xc9\\x93\\xc9\\x94\\xc9\\x95\\xc9\\x96\\xc9\\x97\\xc9\\x98\\xc9\\x99\\xc9\\x9a\\xc9\\x9b\\xc9\\x9c\\xc9\\x9d\\xc9\\x9e\\xc9\\x9f\\xc9\\xa0\\xc9\\xa1\\xc9\\xa2\\xc9\\xa3\\xc9\\xa4\\xc9\\xa5\\xc9\\xa6\\xc9\\xa7\\xc9\\xa8\\xc9\\xa9\\xc9\\xaa\\xc9\\xab\\xc9\\xac\\xc9\\xad\\xc9\\xae\\xc9\\xaf\\xc9\\xb0\\xc9\\xb1\\xc9\\xb2\\xc9\\xb3\\xc9\\xb4\\xc9\\xb5\\xc9\\xb6\\xc9\\xb7\\xc9\\xb8\\xc9\\xb9\\xc9\\xba\\xc9\\xbb\\xc9\\xbc\\xc9\\xbd\\xc9\\xbe\\xc9\\xbf\\xca\\x80\\xca\\x81\\xca\\x82\\xca\\x83\\xca\\x84\\xca\\x85\\xca\\x86\\xca\\x87\\xca\\x88\\xca\\x89\\xca\\x8a\\xca\\x8b\\xca\\x8c\\xca\\x8d\\xca\\x8e\\xca\\x8f\\xca\\x90\\xca\\x91\\xca\\x92\\xca\\x93\\xca\\x95\\xca\\x96\\xca\\x97\\xca\\x98\\xca\\x99\\xca\\x9a\\xca\\x9b\\xca\\x9c\\xca\\x9d\\xca\\x9e\\xca\\x9f\\xca\\xa0\\xca\\xa1\\xca\\xa2\\xca\\xa3\\xca\\xa4\\xca\\xa5\\xca\\xa6\\xca\\xa7\\xca\\xa8\\xca\\xa9\\xca\\xaa\\xca\\xab\\xca\\xac\\xca\\xad\\xca\\xae\\xca\\xaf\\xe1\\xb4\\x80\\xe1\\xb4\\x81\\xe1\\xb4\\x82\\xe1\\xb4\\x83\\xe1\\xb4\\x84\\xe1\\xb4\\x85\\xe1\\xb4\\x86\\xe1\\xb4\\x87\\xe1\\xb4\\x88\\xe1\\xb4\\x89\\xe1\\xb4\\x8a\\xe1\\xb4\\x8b\\xe1\\xb4\\x8c\\xe1\\xb4\\x8d\\xe1\\xb4\\x8e\\xe1\\xb4\\x8f\\xe1\\xb4\\x90\\xe1\\xb4\\x91\\xe1\\xb4\\x92\\xe1\\xb4\\x93\\xe1\\xb4\\x9
4\\xe1\\xb4\\x95\\xe1\\xb4\\x96\\xe1\\xb4\\x97\\xe1\\xb4\\x98\\xe1\\xb4\\x99\\xe1\\xb4\\x9a\\xe1\\xb4\\x9b\\xe1\\xb4\\x9c\\xe1\\xb4\\x9d\\xe1\\xb4\\x9e\\xe1\\xb4\\x9f\\xe1\\xb4\\xa0\\xe1\\xb4\\xa1\\xe1\\xb4\\xa2\\xe1\\xb4\\xa3\\xe1\\xb4\\xa4\\xe1\\xb4\\xa5\\xe1\\xb5\\xa2\\xe1\\xb5\\xa3\\xe1\\xb5\\xa4\\xe1\\xb5\\xa5\\xe1\\xb5\\xab\\xe1\\xb5\\xac\\xe1\\xb5\\xad\\xe1\\xb5\\xae\\xe1\\xb5\\xaf\\xe1\\xb5\\xb0\\xe1\\xb5\\xb1\\xe1\\xb5\\xb2\\xe1\\xb5\\xb3\\xe1\\xb5\\xb4\\xe1\\xb5\\xb5\\xe1\\xb5\\xb6\\xe1\\xb5\\xb7\\xe1\\xb5\\xb9\\xe1\\xb5\\xba\\xe1\\xb5\\xbb\\xe1\\xb5\\xbc\\xe1\\xb5\\xbd\\xe1\\xb5\\xbe\\xe1\\xb5\\xbf\\xe1\\xb6\\x80\\xe1\\xb6\\x81\\xe1\\xb6\\x82\\xe1\\xb6\\x83\\xe1\\xb6\\x84\\xe1\\xb6\\x85\\xe1\\xb6\\x86\\xe1\\xb6\\x87\\xe1\\xb6\\x88\\xe1\\xb6\\x89\\xe1\\xb6\\x8a\\xe1\\xb6\\x8b\\xe1\\xb6\\x8c\\xe1\\xb6\\x8d\\xe1\\xb6\\x8e\\xe1\\xb6\\x8f\\xe1\\xb6\\x90\\xe1\\xb6\\x91\\xe1\\xb6\\x92\\xe1\\xb6\\x93\\xe1\\xb6\\x94\\xe1\\xb6\\x95\\xe1\\xb6\\x96\\xe1\\xb6\\x97\\xe1\\xb6\\x98\\xe1\\xb6\\x99\\xe1\\xb6\\x9a\\xe1\\xb8\\x81\\xe1\\xb8\\x83\\xe1\\xb8\\x85\\xe1\\xb8\\x87\\xe1\\xb8\\x89\\xe1\\xb8\\x8b\\xe1\\xb8\\x8d\\xe1\\xb8\\x8f\\xe1\\xb8\\x91\\xe1\\xb8\\x93\\xe1\\xb8\\x95\\xe1\\xb8\\x97\\xe1\\xb8\\x99\\xe1\\xb8\\x9b\\xe1\\xb8\\x9d\\xe1\\xb8\\x9f\\xe1\\xb8\\xa1\\xe1\\xb8\\xa3\\xe1\\xb8\\xa5\\xe1\\xb8\\xa7\\xe1\\xb8\\xa9\\xe1\\xb8\\xab\\xe1\\xb8\\xad\\xe1\\xb8\\xaf\\xe1\\xb8\\xb1\\xe1\\xb8\\xb3\\xe1\\xb8\\xb5\\xe1\\xb8\\xb7\\xe1\\xb8\\xb9\\xe1\\xb8\\xbb\\xe1\\xb8\\xbd\\xe1\\xb8\\xbf\\xe1\\xb9\\x81\\xe1\\xb9\\x83\\xe1\\xb9\\x85\\xe1\\xb9\\x87\\xe1\\xb9\\x89\\xe1\\xb9\\x8b\\xe1\\xb9\\x8d\\xe1\\xb9\\x8f\\xe1\\xb9\\x91\\xe1\\xb9\\x93\\xe1\\xb9\\x95\\xe1\\xb9\\x97\\xe1\\xb9\\x99\\xe1\\xb9\\x9b\\xe1\\xb9\\x9d\\xe1\\xb9\\x9f\\xe1\\xb9\\xa1\\xe1\\xb9\\xa3\\xe1\\xb9\\xa5\\xe1\\xb9\\xa7\\xe1\\xb9\\xa9\\xe1\\xb9\\xab\\xe1\\xb9\\xad\\xe1\\xb9\\xaf\\xe1\\xb9\\xb1\\xe1\\xb9\\xb3\\xe1\\xb9\\xb5\\xe1\\xb9\\xb7\\xe1\\xb9\\xb9\\xe1\\xb9\\xbb\\xe1\\xb9\\xbd\\xe1\\xb9\\xbf\\xe1\\xba\\x81\\xe
1\\xba\\x83\\xe1\\xba\\x85\\xe1\\xba\\x87\\xe1\\xba\\x89\\xe1\\xba\\x8b\\xe1\\xba\\x8d\\xe1\\xba\\x8f\\xe1\\xba\\x91\\xe1\\xba\\x93\\xe1\\xba\\x95\\xe1\\xba\\x96\\xe1\\xba\\x97\\xe1\\xba\\x98\\xe1\\xba\\x99\\xe1\\xba\\x9a\\xe1\\xba\\x9b\\xe1\\xba\\x9c\\xe1\\xba\\x9d\\xe1\\xba\\x9f\\xe1\\xba\\xa1\\xe1\\xba\\xa3\\xe1\\xba\\xa5\\xe1\\xba\\xa7\\xe1\\xba\\xa9\\xe1\\xba\\xab\\xe1\\xba\\xad\\xe1\\xba\\xaf\\xe1\\xba\\xb1\\xe1\\xba\\xb3\\xe1\\xba\\xb5\\xe1\\xba\\xb7\\xe1\\xba\\xb9\\xe1\\xba\\xbb\\xe1\\xba\\xbd\\xe1\\xba\\xbf\\xe1\\xbb\\x81\\xe1\\xbb\\x83\\xe1\\xbb\\x85\\xe1\\xbb\\x87\\xe1\\xbb\\x89\\xe1\\xbb\\x8b\\xe1\\xbb\\x8d\\xe1\\xbb\\x8f\\xe1\\xbb\\x91\\xe1\\xbb\\x93\\xe1\\xbb\\x95\\xe1\\xbb\\x97\\xe1\\xbb\\x99\\xe1\\xbb\\x9b\\xe1\\xbb\\x9d\\xe1\\xbb\\x9f\\xe1\\xbb\\xa1\\xe1\\xbb\\xa3\\xe1\\xbb\\xa5\\xe1\\xbb\\xa7\\xe1\\xbb\\xa9\\xe1\\xbb\\xab\\xe1\\xbb\\xad\\xe1\\xbb\\xaf\\xe1\\xbb\\xb1\\xe1\\xbb\\xb3\\xe1\\xbb\\xb5\\xe1\\xbb\\xb7\\xe1\\xbb\\xb9\\xe1\\xbb\\xbb\\xe1\\xbb\\xbd\\xe1\\xbb\\xbf\\xe2\\x85\\x8e\\xe2\\x86\\x84\\xe2\\xb1\\xa1\\xe2\\xb1\\xa5\\xe2\\xb1\\xa6\\xe2\\xb1\\xa8\\xe2\\xb1\\xaa\\xe2\\xb1\\xac\\xe2\\xb1\\xb1\\xe2\\xb1\\xb3\\xe2\\xb1\\xb4\\xe2\\xb1\\xb6\\xe2\\xb1\\xb7\\xe2\\xb1\\xb8\\xe2\\xb1\\xb9\\xe2\\xb1\\xba\\xe2\\xb1\\xbb\\xe2\\xb1\\xbc\\xea\\x9c\\xa3\\xea\\x9c\\xa5\\xea\\x9c\\xa7\\xea\\x9c\\xa9\\xea\\x9c\\xab\\xea\\x9c\\xad\\xea\\x9c\\xaf\\xea\\x9c\\xb0\\xea\\x9c\\xb1\\xea\\x9c\\xb3\\xea\\x9c\\xb5\\xea\\x9c\\xb7\\xea\\x9c\\xb9\\xea\\x9c\\xbb\\xea\\x9c\\xbd\\xea\\x9c\\xbf\\xea\\x9d\\x81\\xea\\x9d\\x83\\xea\\x9d\\x85\\xea\\x9d\\x87\\xea\\x9d\\x89\\xea\\x9d\\x8b\\xea\\x9d\\x8d\\xea\\x9d\\x8f\\xea\\x9d\\x91\\xea\\x9d\\x93\\xea\\x9d\\x95\\xea\\x9d\\x97\\xea\\x9d\\x99\\xea\\x9d\\x9b\\xea\\x9d\\x9d\\xea\\x9d\\x9f\\xea\\x9d\\xa1\\xea\\x9d\\xa3\\xea\\x9d\\xa5\\xea\\x9d\\xa7\\xea\\x9d\\xa9\\xea\\x9d\\xab\\xea\\x9d\\xad\\xea\\x9d\\xaf\\xea\\x9d\\xb1\\xea\\x9d\\xb2\\xea\\x9d\\xb3\\xea\\x9d\\xb4\\xea\\x9d\\xb5\\xea\\x9d\\xb6\\xea\\x9d\\xb7\\xea\\x9d\\xb8\\xea\\x9
d\\xba\\xea\\x9d\\xbc\\xea\\x9d\\xbf\\xea\\x9e\\x81\\xea\\x9e\\x83\\xea\\x9e\\x85\\xea\\x9e\\x87\\xea\\x9e\\x8c\\xef\\xac\\x80\\xef\\xac\\x81\\xef\\xac\\x82\\xef\\xac\\x83\\xef\\xac\\x84\\xef\\xac\\x85\\xef\\xac\\x86\\xef\\xbd\\x81\\xef\\xbd\\x82\\xef\\xbd\\x83\\xef\\xbd\\x84\\xef\\xbd\\x85\\xef\\xbd\\x86\\xef\\xbd\\x87\\xef\\xbd\\x88\\xef\\xbd\\x89\\xef\\xbd\\x8a\\xef\\xbd\\x8b\\xef\\xbd\\x8c\\xef\\xbd\\x8d\\xef\\xbd\\x8e\\xef\\xbd\\x8f\\xef\\xbd\\x90\\xef\\xbd\\x91\\xef\\xbd\\x92\\xef\\xbd\\x93\\xef\\xbd\\x94\\xef\\xbd\\x95\\xef\\xbd\\x96\\xef\\xbd\\x97\\xef\\xbd\\x98\\xef\\xbd\\x99\\xef\\xbd\\x9a' + # Extra rules defined here supplemental_transliterations = { 'latin-ascii': [ @@ -1128,10 +1131,38 @@ supplemental_transliterations = { # German transliterations not handled by standard NFD normalization # ä => ae (u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', 'NULL', '0', 'NULL', '0'), + # Ä => Ae if followed by lower case Latin letter + (u'"\\xc3\\x84"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Ae"', '2', 'NULL', '0', 'NULL', '0'), + # Ä => AE otherwise + (u'"\\xc3\\x84"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AE"', '2', 'NULL', '0', 'NULL', '0'), # ö => oe (u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'), + # Ö => Oe if followed by lower case Latin letter + (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Oe"', '2', 'NULL', '0', 'NULL', '0'), + # Ö => OE otherwise + (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"OE"', '2', 'NULL', '0', 'NULL', '0'), + + # Note this is the German form. 
In Swedish ü => y, + # might make sense to split these rules into + # language-specific transliterators + # ü => ue (u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'), + # Ü => Ue if followed by lower case Latin letter + (u'"\\xc3\\x9c"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Ue"', '2', 'NULL', '0', 'NULL', '0'), + # Ü => UE otherwise + (u'"\\xc3\\x9c"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"UE"', '2', 'NULL', '0', 'NULL', '0'), + + # Swedish transliterations not handled by standard NFD normalization + (u'"\\xc3\\xa5"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"aa"', '2', 'NULL', '0', 'NULL', '0'), + + # Å => Aa if followed by lower case Latin letter + (u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Aa"', '2', 'NULL', '0', 'NULL', '0'), + + # Å => AA otherwise + (u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AA"', '2', 'NULL', '0', 'NULL', '0'), + + ]), (PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0') for name, value in html_escapes.iteritems()