[normalize] Adding hyphen elimination as a string option (changes tokenization)
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
from postal.text import _normalize
|
from postal.text import _normalize
|
||||||
from postal.text import _tokenize
|
from postal.text import _tokenize
|
||||||
|
|
||||||
@@ -6,6 +7,8 @@ from postal.text.encoding import safe_decode
|
|||||||
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
|
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
|
||||||
_normalize.NORMALIZE_STRING_DECOMPOSE | \
|
_normalize.NORMALIZE_STRING_DECOMPOSE | \
|
||||||
_normalize.NORMALIZE_STRING_TRIM | \
|
_normalize.NORMALIZE_STRING_TRIM | \
|
||||||
|
_normalize.NORMALIZE_STRING_REPLACE_HYPHENS | \
|
||||||
|
_normalize.NORMALIZE_STRING_STRIP_ACCENTS | \
|
||||||
_normalize.NORMALIZE_STRING_LOWERCASE
|
_normalize.NORMALIZE_STRING_LOWERCASE
|
||||||
|
|
||||||
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
||||||
@@ -18,6 +21,17 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
|||||||
|
|
||||||
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
||||||
token_options=DEFAULT_TOKEN_OPTIONS):
|
token_options=DEFAULT_TOKEN_OPTIONS):
|
||||||
|
'''
|
||||||
|
Normalizes a string, tokenizes, and normalizes each token
|
||||||
|
with string and token-level options.
|
||||||
|
|
||||||
|
This version only uses libpostal's deterministic normalizations
|
||||||
|
i.e. methods with a single output. The string tree version will
|
||||||
|
return multiple normalized strings, each with tokens.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
normalized_tokens(u'St.-Barthélemy')
|
||||||
|
'''
|
||||||
s = safe_decode(s)
|
s = safe_decode(s)
|
||||||
if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
|
if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
|
||||||
normalized = _normalize.normalize_string_latin(s, string_options)
|
normalized = _normalize.normalize_string_latin(s, string_options)
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
#include <Python.h>
|
#include <Python.h>
|
||||||
|
|
||||||
#include "pyencoding.h"
|
|
||||||
|
|
||||||
#include "src/normalize.h"
|
#include "src/normalize.h"
|
||||||
#include "src/transliterate.h"
|
#include "src/transliterate.h"
|
||||||
|
|
||||||
@@ -307,6 +305,7 @@ init_normalize(void) {
|
|||||||
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
|
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
|
||||||
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
|
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
|
||||||
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
|
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
|
||||||
|
PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS));
|
||||||
|
|
||||||
|
|
||||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
|
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
|
||||||
|
|||||||
@@ -29,12 +29,21 @@ char *normalize_string_utf8(char *str, uint64_t options) {
|
|||||||
utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
|
utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char *normalized = NULL;
|
||||||
|
|
||||||
if (have_utf8proc_options) {
|
if (have_utf8proc_options) {
|
||||||
utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
|
utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
|
||||||
return (char *)utf8proc_normalized;
|
|
||||||
|
normalized = (char *)utf8proc_normalized;
|
||||||
|
str = normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
if (options & NORMALIZE_STRING_REPLACE_HYPHENS) {
|
||||||
|
string_replace(str, '-', ' ');
|
||||||
|
normalized = str;
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ As well as normalizations for individual string tokens:
|
|||||||
#define NORMALIZE_STRING_DECOMPOSE 1 << 3
|
#define NORMALIZE_STRING_DECOMPOSE 1 << 3
|
||||||
#define NORMALIZE_STRING_LOWERCASE 1 << 4
|
#define NORMALIZE_STRING_LOWERCASE 1 << 4
|
||||||
#define NORMALIZE_STRING_TRIM 1 << 5
|
#define NORMALIZE_STRING_TRIM 1 << 5
|
||||||
|
/* String-level option bit: when set, normalize_string_utf8 replaces every
 * '-' in the string with a space. The value is parenthesized so the shift
 * stays a single operand when the macro is used inside a larger expression. */
#define NORMALIZE_STRING_REPLACE_HYPHENS (1 << 6)
|
||||||
|
|
||||||
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
||||||
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
||||||
|
|||||||
@@ -75,6 +75,12 @@ inline void string_upper(char *s) {
|
|||||||
for (; *s; ++s) *s = toupper(*s);
|
for (; *s; ++s) *s = toupper(*s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Overwrite, in place, every byte of the NUL-terminated string `s` that
 * equals `c1` with `c2`. The scan stops at the first NUL encountered at the
 * current position, so the string's storage is never grown or moved.
 */
inline void string_replace(char *s, char c1, char c2) {
    char *cursor = s;

    while (*cursor != '\0') {
        if (*cursor == c1) {
            *cursor = c2;
        }
        cursor++;
    }
}
|
||||||
|
|
||||||
inline bool string_is_upper(char *s) {
|
inline bool string_is_upper(char *s) {
|
||||||
for (; *s; ++s) {
|
for (; *s; ++s) {
|
||||||
if (*s != toupper(*s)) return false;
|
if (*s != toupper(*s)) return false;
|
||||||
@@ -93,6 +99,7 @@ inline bool string_is_lower(char *s) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
|
uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
|
||||||
uint32_t num_replacements = 0;
|
uint32_t num_replacements = 0;
|
||||||
|
|
||||||
|
|||||||
@@ -55,6 +55,8 @@ void string_lower(char *s);
|
|||||||
bool string_is_upper(char *s);
|
bool string_is_upper(char *s);
|
||||||
void string_upper(char *s);
|
void string_upper(char *s);
|
||||||
|
|
||||||
|
void string_replace(char *s, char c1, char c2);
|
||||||
|
|
||||||
bool string_starts_with(const char *str, const char *start);
|
bool string_starts_with(const char *str, const char *start);
|
||||||
bool string_ends_with(const char *str, const char *ending);
|
bool string_ends_with(const char *str, const char *ending);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user