[normalize] Adding hyphen elimination as a string option (changes tokenization)
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from postal.text import _normalize
|
||||
from postal.text import _tokenize
|
||||
|
||||
@@ -6,6 +7,8 @@ from postal.text.encoding import safe_decode
|
||||
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
|
||||
_normalize.NORMALIZE_STRING_DECOMPOSE | \
|
||||
_normalize.NORMALIZE_STRING_TRIM | \
|
||||
_normalize.NORMALIZE_STRING_REPLACE_HYPHENS | \
|
||||
_normalize.NORMALIZE_STRING_STRIP_ACCENTS | \
|
||||
_normalize.NORMALIZE_STRING_LOWERCASE
|
||||
|
||||
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
||||
@@ -18,6 +21,17 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
||||
|
||||
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
||||
token_options=DEFAULT_TOKEN_OPTIONS):
|
||||
'''
|
||||
Normalizes a string, tokenizes, and normalizes each token
|
||||
with string and token-level options.
|
||||
|
||||
This version only uses libpostal's deterministic normalizations
|
||||
i.e. methods with a single output. The string tree version will
|
||||
return multiple normalized strings, each with tokens.
|
||||
|
||||
Usage:
|
||||
normalized_tokens(u'St.-Barthélemy')
|
||||
'''
|
||||
s = safe_decode(s)
|
||||
if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
|
||||
normalized = _normalize.normalize_string_latin(s, string_options)
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#include <Python.h>
|
||||
|
||||
#include "pyencoding.h"
|
||||
|
||||
#include "src/normalize.h"
|
||||
#include "src/transliterate.h"
|
||||
|
||||
@@ -307,6 +305,7 @@ init_normalize(void) {
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS));
|
||||
|
||||
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
|
||||
|
||||
Reference in New Issue
Block a user