From 40918812e2049bc6e2e65af679c65e2c50f2b1ff Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 27 Oct 2015 13:32:47 -0400
Subject: [PATCH] [normalize] Adding hyphen elimination as a string option
 (changes tokenization)

---
 python/postal/text/normalize.py  | 14 ++++++++++++++
 python/postal/text/pynormalize.c |  3 +--
 src/normalize.c                  | 13 +++++++++++--
 src/normalize.h                  |  1 +
 src/string_utils.c               |  7 +++++++
 src/string_utils.h               |  2 ++
 6 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/python/postal/text/normalize.py b/python/postal/text/normalize.py
index c2fed1a1..b9700433 100644
--- a/python/postal/text/normalize.py
+++ b/python/postal/text/normalize.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from postal.text import _normalize
 from postal.text import _tokenize
 
@@ -6,6 +7,8 @@ from postal.text.encoding import safe_decode
 DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII |  \
     _normalize.NORMALIZE_STRING_DECOMPOSE | \
     _normalize.NORMALIZE_STRING_TRIM | \
+    _normalize.NORMALIZE_STRING_REPLACE_HYPHENS | \
+    _normalize.NORMALIZE_STRING_STRIP_ACCENTS | \
     _normalize.NORMALIZE_STRING_LOWERCASE
 
 DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
@@ -18,6 +21,17 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
 
 def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                       token_options=DEFAULT_TOKEN_OPTIONS):
+    '''
+    Normalizes a string, tokenizes, and normalizes each token
+    with string and token-level options.
+
+    This version only uses libpostal's deterministic normalizations
+    i.e. methods with a single output. The string tree version will
+    return multiple normalized strings, each with tokens.
+
+    Usage:
+        normalized_tokens(u'St.-Barthélemy')
+    '''
     s = safe_decode(s)
     if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
         normalized = _normalize.normalize_string_latin(s, string_options)
diff --git a/python/postal/text/pynormalize.c b/python/postal/text/pynormalize.c
index e2e5c277..9df1ae13 100644
--- a/python/postal/text/pynormalize.c
+++ b/python/postal/text/pynormalize.c
@@ -1,7 +1,5 @@
 #include <Python.h>
 
-#include "pyencoding.h"
-
 #include "src/normalize.h"
 #include "src/transliterate.h"
 
@@ -307,6 +305,7 @@ init_normalize(void) {
     PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
     PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
     PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
+    PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS));
 
 
     PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
diff --git a/src/normalize.c b/src/normalize.c
index 6717aca1..dd2e74a0 100644
--- a/src/normalize.c
+++ b/src/normalize.c
@@ -29,12 +29,21 @@ char *normalize_string_utf8(char *str, uint64_t options) {
         utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
     }
 
+    char *normalized = NULL;
+
     if (have_utf8proc_options) {
         utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
-        return (char *)utf8proc_normalized;
+        normalized = (char *)utf8proc_normalized;
+        str = normalized; /* presumably NULL when utf8proc_map fails -- checked below */
     }
 
-    return NULL;
+    /* Replace hyphens only in a buffer owned here, never in the caller's string. */
+    if (str != NULL && (options & NORMALIZE_STRING_REPLACE_HYPHENS)) {
+        if (normalized == NULL) normalized = strdup(str); /* NOTE(review): needs <string.h> */
+        if (normalized != NULL) string_replace(normalized, '-', ' ');
+    }
+
+    return normalized;
 }
 
 
diff --git a/src/normalize.h b/src/normalize.h
index 427924ac..d7ea2a0b 100644
--- a/src/normalize.h
+++ b/src/normalize.h
@@ -44,6 +44,7 @@ As well as normalizations for individual string tokens:
 #define NORMALIZE_STRING_DECOMPOSE 1 << 3
 #define NORMALIZE_STRING_LOWERCASE 1 << 4
 #define NORMALIZE_STRING_TRIM 1 << 5
+#define NORMALIZE_STRING_REPLACE_HYPHENS (1 << 6) /* parenthesized so ~FLAG and arithmetic apply to the whole value */
 
 #define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
 #define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
diff --git a/src/string_utils.c b/src/string_utils.c
index a7ee82fc..a77f4d5c 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -75,6 +75,12 @@ inline void string_upper(char *s) {
     for (; *s; ++s) *s = toupper(*s);
 }
 
+/* Overwrite each byte of s equal to c1 with c2, in place; a NULL s is a no-op. */
+inline void string_replace(char *s, char c1, char c2) {
+    for (; s && *s; ++s)
+        if (*s == c1) *s = c2;
+}
+
 inline bool string_is_upper(char *s) {
     for (; *s; ++s) {
         if (*s != toupper(*s)) return false;
@@ -93,6 +99,7 @@ inline bool string_is_lower(char *s) {
     return true;
 }
 
+
 uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
     uint32_t num_replacements = 0;
     
diff --git a/src/string_utils.h b/src/string_utils.h
index 7c1b7bcf..397912ca 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -55,6 +55,8 @@ void string_lower(char *s);
 bool string_is_upper(char *s);
 void string_upper(char *s);
 
+void string_replace(char *s, char c1, char c2);
+
 bool string_starts_with(const char *str, const char *start);
 bool string_ends_with(const char *str, const char *ending);