From 2e15db06dde23d17f8e20152be9364e5a504c648 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 21 Jan 2016 02:07:46 -0500
Subject: [PATCH] [text] making normalize_string directly callable from Python geodata

---
 scripts/geodata/text/normalize.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py
index 15c4e067..253425a8 100644
--- a/scripts/geodata/text/normalize.py
+++ b/scripts/geodata/text/normalize.py
@@ -53,6 +53,16 @@ def remove_parens(tokens):
     return new_tokens
 
 
+def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
+    s = safe_decode(s)
+    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
+        normalized = _normalize.normalize_string_latin(s, string_options)
+    else:
+        normalized = _normalize.normalize_string_utf8(s, string_options)
+
+    return normalized
+
+
 def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                       token_options=DEFAULT_TOKEN_OPTIONS,
                       strip_parentheticals=True):
@@ -67,11 +77,7 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
     Usage:
         normalized_tokens(u'St.-Barthélemy')
     '''
-    s = safe_decode(s)
-    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
-        normalized = _normalize.normalize_string_latin(s, string_options)
-    else:
-        normalized = _normalize.normalize_string_utf8(s, string_options)
+    normalized = normalize_string(s, string_options=string_options)
 
     # Tuples of (offset, len, type)
     raw_tokens = tokenize_raw(normalized)