[parser] As part of reading/tokenizing the address parser data set,

several copies of the same training example will be generated.

1. with only lowercasing
2. with simple Latin-ASCII normalization (no umlauts, only things that
are common to all languages)
3. basic UTF-8 normalizations (accent stripping)
4. language-specific Latin-ASCII transliteration (e.g. ü => ue in German)

This applies both during the initial passes that build the phrase
gazetteers and during each iteration of training. As a result, only the
most basic normalizations, such as lowercasing, need to be done at runtime,
and the parser can be used on input that has received only that minimal normalization.

This may have a small effect on randomization, since the copies of each
example are created in a deterministic order. However, it should not lead
to cycles, because the base examples are shuffled, which still satisfies
the random permutation requirement of an online/stochastic learning algorithm.
This commit is contained in:
Al
2016-12-02 13:09:03 -05:00
parent adab232674
commit 22c4e99ea0
2 changed files with 127 additions and 51 deletions

View File

@@ -58,9 +58,13 @@ with the general error-driven averaged perceptron.
/*
 * Phrase-membership constant, string/token normalization option sets, and
 * separator flags for the address parser.
 *
 * NOTE(review): ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS appears twice below with
 * different bodies. This span reads like a diff hunk rendered without +/- markers,
 * so one of the two is presumably the removed line -- confirm against the real header.
 *
 * NOTE(review): none of these macro bodies are parenthesized, so the value of an
 * expression that uses them depends on operator precedence at the point of use
 * (e.g. `ADDRESS_SEPARATOR_FIELD_INTERNAL + 1` parses as `1 << (0 + 1)`).
 * Consider wrapping each body in parentheses.
 */
#define NULL_PHRASE_MEMBERSHIP -1
/* Compose + lowercase + NORMALIZE_STRING_LATIN_ASCII (same body as ..._OPTIONS_LATIN below). */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII
/* Compose + lowercase + NORMALIZE_STRING_SIMPLE_LATIN_ASCII. */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_SIMPLE_LATIN_ASCII
/* Compose + lowercase + NORMALIZE_STRING_LATIN_ASCII. */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII
/* Compose + lowercase + NORMALIZE_STRING_STRIP_ACCENTS. */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_STRIP_ACCENTS
/* Base token options: delete final period, delete acronym periods, replace digits. */
#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS
/*
 * The next two expand textually to
 * `... | NORMALIZE_TOKEN_REPLACE_DIGITS ^ NORMALIZE_TOKEN_REPLACE_DIGITS`.
 * `^` binds tighter than `|`, so the trailing pair cancels to 0 and the result is the
 * base token options without REPLACE_DIGITS (assuming the flags are plain integer
 * constants -- their definitions are not visible here).
 */
#define ADDRESS_PARSER_NORMALIZE_PHRASE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
#define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
/* Admin token options with NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC OR-ed in. */
#define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
/* Separator flags: 0 for none, bit 0 for FIELD_INTERNAL. */
#define ADDRESS_SEPARATOR_NONE 0
#define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0
@@ -72,18 +76,17 @@ with the general error-driven averaged perceptron.
/* Separator label strings. */
#define SEPARATOR_LABEL "sep"
#define FIELD_SEPARATOR_LABEL "fsep"
/*
 * ADDRESS_COMPONENT_* single-bit flags.
 *
 * NOTE(review): SUBURB through COUNTRY are each defined twice in this span, first with
 * shifts 7..15 and then with shifts 3..13. This span reads like a diff hunk rendered
 * without +/- markers; the enclosing hunk header's line counts (18 old, 17 new) are
 * consistent with the first twelve ADDRESS_COMPONENT_* lines (HOUSE .. COUNTRY at
 * 1 << 15) being removed and the last eleven (NON_BOUNDARY .. WORLD_REGION) being
 * added -- confirm against the real header before relying on either set.
 *
 * NOTE(review): the bodies are unparenthesized (`1 << n`), so an expression such as
 * `ADDRESS_COMPONENT_CITY + 1` parses as `1 << (n + 1)`. Consider `(1 << n)`.
 */
#define ADDRESS_COMPONENT_HOUSE 1 << 0
#define ADDRESS_COMPONENT_HOUSE_NUMBER 1 << 1
#define ADDRESS_COMPONENT_ROAD 1 << 4
#define ADDRESS_COMPONENT_SUBURB 1 << 7
#define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 8
#define ADDRESS_COMPONENT_CITY 1 << 9
#define ADDRESS_COMPONENT_ISLAND 1 << 10
#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 11
#define ADDRESS_COMPONENT_STATE 1 << 12
#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 13
#define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 14
#define ADDRESS_COMPONENT_COUNTRY 1 << 15
/* Second set of definitions. NON_BOUNDARY has the same value (1 << 0) as HOUSE above. */
#define ADDRESS_COMPONENT_NON_BOUNDARY 1 << 0
#define ADDRESS_COMPONENT_SUBURB 1 << 3
#define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 4
#define ADDRESS_COMPONENT_CITY 1 << 5
#define ADDRESS_COMPONENT_ISLAND 1 << 7
#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 8
#define ADDRESS_COMPONENT_STATE 1 << 9
#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 10
#define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 11
#define ADDRESS_COMPONENT_COUNTRY 1 << 13
#define ADDRESS_COMPONENT_WORLD_REGION 1 << 14
typedef enum {
ADDRESS_PARSER_BOUNDARY_NONE,
@@ -96,6 +99,7 @@ typedef enum {
ADDRESS_PARSER_BOUNDARY_POSTAL_CODE,
ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION,
ADDRESS_PARSER_BOUNDARY_COUNTRY,
ADDRESS_PARSER_BOUNDARY_WORLD_REGION,
NUM_ADDRESS_PARSER_BOUNDARY_TYPES
} address_parser_boundary_components;
@@ -110,8 +114,9 @@ typedef enum {
/*
 * Label strings for address components.
 *
 * NOTE(review): ADDRESS_PARSER_LABEL_COUNTRY is defined twice below, first as
 * "country_region" and then as "country", and the first of those has the same value
 * as ADDRESS_PARSER_LABEL_COUNTRY_REGION. This span reads like a diff hunk rendered
 * without +/- markers, so the "country_region" definition of LABEL_COUNTRY is
 * presumably the removed line -- confirm against the real header.
 */
#define ADDRESS_PARSER_LABEL_ISLAND "island"
#define ADDRESS_PARSER_LABEL_STATE "state"
#define ADDRESS_PARSER_LABEL_POSTAL_CODE "postcode"
#define ADDRESS_PARSER_LABEL_COUNTRY "country_region"
#define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region"
#define ADDRESS_PARSER_LABEL_COUNTRY "country"
#define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region"
typedef union address_parser_types {
uint32_t value;