[merge] merging commit from v1.1
This commit is contained in:
@@ -30,6 +30,7 @@ As well as normalizations for individual string tokens:
|
||||
|
||||
#include "constants.h"
|
||||
#include "klib/khash.h"
|
||||
#include "libpostal.h"
|
||||
#include "string_utils.h"
|
||||
#include "utf8proc/utf8proc.h"
|
||||
#include "unicode_scripts.h"
|
||||
@@ -39,25 +40,26 @@ As well as normalizations for individual string tokens:
|
||||
#include "tokens.h"
|
||||
#include "vector.h"
|
||||
|
||||
#define NORMALIZE_STRING_LATIN_ASCII 1 << 0
|
||||
#define NORMALIZE_STRING_TRANSLITERATE 1 << 1
|
||||
#define NORMALIZE_STRING_STRIP_ACCENTS 1 << 2
|
||||
#define NORMALIZE_STRING_DECOMPOSE 1 << 3
|
||||
#define NORMALIZE_STRING_LOWERCASE 1 << 4
|
||||
#define NORMALIZE_STRING_TRIM 1 << 5
|
||||
#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6
|
||||
#define NORMALIZE_STRING_COMPOSE 1 << 7
|
||||
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8
|
||||
#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9
|
||||
#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII
|
||||
#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE
|
||||
#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS
|
||||
#define NORMALIZE_STRING_DECOMPOSE LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE
|
||||
#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE
|
||||
#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM
|
||||
#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS
|
||||
#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE
|
||||
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII
|
||||
#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX
|
||||
|
||||
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
||||
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
||||
#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2
|
||||
#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3
|
||||
#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4
|
||||
#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5
|
||||
#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6
|
||||
#define NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7
|
||||
#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS
|
||||
#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS
|
||||
#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
|
||||
#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
|
||||
#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
|
||||
#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
|
||||
#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
|
||||
#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS
|
||||
#define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS
|
||||
|
||||
// Replace each digit with a capital D, e.g. 10013 => DDDDD. Intended for use with lowercased strings.
|
||||
#define DIGIT_CHAR "D"
|
||||
|
||||
Reference in New Issue
Block a user