From 40918812e2049bc6e2e65af679c65e2c50f2b1ff Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 27 Oct 2015 13:32:47 -0400 Subject: [PATCH] [normalize] Adding hyphen elimination as a string option (changes tokenization) --- python/postal/text/normalize.py | 14 ++++++++++++++ python/postal/text/pynormalize.c | 3 +-- src/normalize.c | 13 +++++++++++-- src/normalize.h | 1 + src/string_utils.c | 7 +++++++ src/string_utils.h | 2 ++ 6 files changed, 36 insertions(+), 4 deletions(-) diff --git a/python/postal/text/normalize.py b/python/postal/text/normalize.py index c2fed1a1..b9700433 100644 --- a/python/postal/text/normalize.py +++ b/python/postal/text/normalize.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from postal.text import _normalize from postal.text import _tokenize @@ -6,6 +7,8 @@ from postal.text.encoding import safe_decode DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \ _normalize.NORMALIZE_STRING_DECOMPOSE | \ _normalize.NORMALIZE_STRING_TRIM | \ + _normalize.NORMALIZE_STRING_REPLACE_HYPHENS | \ + _normalize.NORMALIZE_STRING_STRIP_ACCENTS | \ _normalize.NORMALIZE_STRING_LOWERCASE DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \ @@ -18,6 +21,17 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, token_options=DEFAULT_TOKEN_OPTIONS): + ''' + Normalizes a string, tokenizes, and normalizes each token + with string and token-level options. + + This version only uses libpostal's deterministic normalizations + i.e. methods with a single output. The string tree version will + return multiple normalized strings, each with tokens. + + Usage: + normalized_tokens(u'St.-Barthélemy') + ''' s = safe_decode(s) if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII: normalized = _normalize.normalize_string_latin(s, string_options) diff --git a/python/postal/text/pynormalize.c b/python/postal/text/pynormalize.c index e2e5c277..9df1ae13 100644 --- a/python/postal/text/pynormalize.c +++ b/python/postal/text/pynormalize.c @@ -1,7 +1,5 @@ #include -#include "pyencoding.h" - #include "src/normalize.h" #include "src/transliterate.h" @@ -307,6 +305,7 @@ init_normalize(void) { PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE)); PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE)); PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM)); + PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS)); PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS)); diff --git a/src/normalize.c b/src/normalize.c index 6717aca1..dd2e74a0 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -29,12 +29,21 @@ char *normalize_string_utf8(char *str, uint64_t options) { utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE; } + char *normalized = NULL; + if (have_utf8proc_options) { utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options); - return (char *)utf8proc_normalized; + + normalized = (char *)utf8proc_normalized; + str = normalized; } - return NULL; + if (options & NORMALIZE_STRING_REPLACE_HYPHENS) { + string_replace(str, '-', ' '); + normalized = str; + } + + return normalized; } diff --git a/src/normalize.h b/src/normalize.h index 427924ac..d7ea2a0b 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -44,6 +44,7 @@ As well as normalizations for individual string tokens: #define NORMALIZE_STRING_DECOMPOSE 1 << 3 #define NORMALIZE_STRING_LOWERCASE 1 << 4 #define NORMALIZE_STRING_TRIM 1 << 5 +#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 #define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 #define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 diff --git a/src/string_utils.c b/src/string_utils.c index a7ee82fc..a77f4d5c 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -75,6 +75,12 @@ inline void string_upper(char *s) { for (; *s; ++s) *s = toupper(*s); } +inline void string_replace(char *s, char c1, char c2) { + for (; *s; ++s) { + if (*s == c1) *s = c2; + } +} + inline bool string_is_upper(char *s) { for (; *s; ++s) { if (*s != toupper(*s)) return false; @@ -93,6 +99,7 @@ inline bool string_is_lower(char *s) { return true; } + uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) { uint32_t num_replacements = 0; diff --git a/src/string_utils.h b/src/string_utils.h index 7c1b7bcf..397912ca 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -55,6 +55,8 @@ void string_lower(char *s); bool string_is_upper(char *s); void string_upper(char *s); +void string_replace(char *s, char c1, char c2); + bool string_starts_with(const char *str, const char *start); bool string_ends_with(const char *str, const char *ending);