[python/normalize] Adding an option to remove parenthesized text in the Python normalize module (doing it from C would require compiling with the scanner, but we could switch later)

2015-10-30 01:27:13 -04:00
parent 5076c0409b
commit e7f783477f
1 changed files with 24 additions and 3 deletions
--- a/python/postal/text/normalize.py
+++ b/python/postal/text/normalize.py
@@ -20,8 +20,23 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
    _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS


+def remove_parens(tokens):  # filter (token, type) pairs, dropping PUNCT_OPEN ... PUNCT_CLOSE spans
+    new_tokens = []  # pairs kept because they were seen outside any open span
+    open_parens = 0  # number of PUNCT_OPEN tokens not yet closed (nesting depth)
+    for t, c in tokens:
+        if c == token_types.PUNCT_OPEN:
+            open_parens += 1  # the opening token itself is never kept
+        elif c == token_types.PUNCT_CLOSE:
+            if open_parens > 0:  # an unmatched closer must not push the depth below zero
+                open_parens -= 1  # the closing token itself is never kept either
+        elif open_parens <= 0:
+            new_tokens.append((t, c))  # only depth-zero, non-punctuation-pair tokens survive
+    return new_tokens  # NOTE(review): an unclosed opener drops every later token - confirm intended
+
+
 def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
-                      token_options=DEFAULT_TOKEN_OPTIONS):
+                      token_options=DEFAULT_TOKEN_OPTIONS,
+                      strip_parentheticals=True):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.
@@ -40,5 +55,11 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
        normalized = _normalize.normalize_string_utf8(s, string_options)

    # Tuples of (offset, len, type)
-    tokens = tokenize_raw(normalized)
-    return [(_normalize.normalize_token(normalized, t, token_options), token_types.from_id(t[-1])) for t in tokens]
+    raw_tokens = tokenize_raw(normalized)
+    tokens = [(_normalize.normalize_token(normalized, t, token_options),
+               token_types.from_id(t[-1])) for t in raw_tokens]
+
+    if strip_parentheticals:
+        return remove_parens(tokens)
+    else:
+        return tokens