[merge] merging commit from v1.1

This commit is contained in:
Al
2017-08-14 04:04:58 -06:00
parent bb277fb326
commit 448ca6a61a
10 changed files with 374 additions and 294 deletions

View File

@@ -30,6 +30,7 @@ As well as normalizations for individual string tokens:
#include "constants.h"
#include "klib/khash.h"
#include "libpostal.h"
#include "string_utils.h"
#include "utf8proc/utf8proc.h"
#include "unicode_scripts.h"
@@ -39,25 +40,26 @@ As well as normalizations for individual string tokens:
#include "tokens.h"
#include "vector.h"
// String-level normalization option flags. Each name is given a distinct
// single-bit value (bits 0 through 9), presumably so that several options can
// be combined into one bitmask. The expansions are bare shift expressions with
// no surrounding parentheses.
#define NORMALIZE_STRING_LATIN_ASCII 1 << 0
#define NORMALIZE_STRING_TRANSLITERATE 1 << 1
#define NORMALIZE_STRING_STRIP_ACCENTS 1 << 2
#define NORMALIZE_STRING_DECOMPOSE 1 << 3
#define NORMALIZE_STRING_LOWERCASE 1 << 4
#define NORMALIZE_STRING_TRIM 1 << 5
#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6
#define NORMALIZE_STRING_COMPOSE 1 << 7
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8
#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9
// The same ten names, each defined as an alias of the identically named macro
// carrying a LIBPOSTAL_ prefix (presumably supplied by libpostal.h, which is
// included earlier; confirm).
// NOTE(review): every name in this group is already defined just above with a
// literal bit shift. This looks like the removed and added sides of a diff
// shown without +/- markers; confirm which set is current before relying on
// either.
#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII
#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE
#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS
#define NORMALIZE_STRING_DECOMPOSE LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE
#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE
#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM
#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS
#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII
#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX
// Token-level normalization option flags. Each name is given a distinct
// single-bit value (bits 0 through 7), presumably so that several options can
// be combined into one bitmask. The expansions are bare shift expressions with
// no surrounding parentheses.
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2
#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3
#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4
#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5
#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6
#define NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7
// The same eight names, each defined as an alias of the identically named
// macro carrying a LIBPOSTAL_ prefix (presumably supplied by libpostal.h, which
// is included earlier; confirm).
// NOTE(review): every name in this group except the last is already defined
// just above with a literal bit shift. This looks like the removed and added
// sides of a diff shown without +/- markers; confirm which set is current
// before relying on either.
#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS
#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS
#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS
// Alias-only flag: this name has no literal bit-shift definition in the group
// above, so its value is not visible here.
#define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS
// Placeholder written in place of each digit when digits are replaced,
// e.g. 10013 => DDDDD. A capital D is used; intended for use with lowercased
// strings.
#define DIGIT_CHAR "D"