From c383f8af886627436eacfa6e10d34e97f4894626 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 29 May 2016 01:37:38 -0400 Subject: [PATCH] [parser] Use NFC normalization in the parser as well; the @ sign is no longer defined as a separator, since it may also be used in intersections --- src/address_parser.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/address_parser.h b/src/address_parser.h index c929323c..bd70d1c8 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -58,14 +58,14 @@ with the general error-driven averaged perceptron. #define NULL_PHRASE_MEMBERSHIP -1 -#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII +#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII #define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS #define ADDRESS_SEPARATOR_NONE 0 #define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0 #define ADDRESS_SEPARATOR_FIELD 1 << 1 -#define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH|| (token_type) == SEMICOLON || (token_type) == PUNCT_OPEN || (token_type) == PUNCT_CLOSE || (token_type) == AT_SIGN ) +#define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH|| (token_type) == SEMICOLON || (token_type) == PUNCT_OPEN || (token_type) == PUNCT_CLOSE ) #define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token.type) == INVALID_CHAR || (token.type) == PERIOD || (token_type) == COLON ) #define SEPARATOR_LABEL "sep"