Merge branch 'master' into parser-data

This commit is contained in:
Al
2016-03-29 17:13:56 -04:00
5 changed files with 2712 additions and 2623 deletions

View File

@@ -1 +1,2 @@
casella postale|cp|c.p.|c.p|c p
casella postale|cp|c.p.|c.p|c p|cpt
presso|co|c \ o|c / o

View File

@@ -1,19 +1,38 @@
accesso|acc.so|accso
autostrada|aut.da|autda
alto|alt
alzaia|alz
androna|and
arco|arc
autostrada|aut.da|autda|aut
autostradale|aut.sle|autsle|aut sle|aut.ale|autale
autostrade|aut.de|autde
baluardo|bdo
banchina|bna
bivio|biv
borgata|bga
borgo|b.go|bgo
bretella|br.lla|brlla
bretelle|br.lle|brlle
calle|c.le|cle
brigata|bgt
calata|clt
calle|cal
campiello|cpl
campo|c.po|cpo
cascina|cna
canale|cnl
cantone|cat
casale|cas
cascina|c.na|cna
cavalcavia|cavalc.a|cavalca
chiusa|c.usa|cusa
circonvallazione|circonv.e|circonve|crv
contrà|con|contra'
contrada|contr.a|contra|c.da|cda|cnt
corso|c.so|cso
corte|c.te|cte
cortile|c.ile|ctile
cortile|c.ile|ctile|c.le|cle
cortina|ctn
costa|cta
crocevia|cra
cupa|cup
discesa|dis
ferrata|f.rata|frata
@@ -61,7 +80,7 @@ rampa|rpa
rampe|rpe
regione|reg
rio|rii
rio tera'|rit
rio terà|rit|rio tera'
ripa|ri
riva|rva
riviera|riv

File diff suppressed because it is too large Load Diff

View File

@@ -280,6 +280,8 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt
token_t token;
size_t added_expansions = 0;
if ((value.components & options.address_components) > 0) {
key->n = namespace_len;
for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
@@ -298,8 +300,13 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt
address_expansion_array *expansions = address_dictionary_get_expansions(key_str);
if (expansions != NULL) {
for (int j = 0; j < expansions->n; j++) {
address_expansion_t expansion = expansions->a[j];
if ((expansion.address_components & options.address_components) == 0) {
continue;
}
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
char *canonical_normalized = normalize_string_utf8(canonical, normalize_string_options);
@@ -347,10 +354,15 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt
}
cstring_array_terminate(tree->strings);
}
added_expansions++;
}
}
} else {
}
if (added_expansions == 0) {
uint32_t start_index = cstring_array_start_token(tree->strings);
for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
token = tokens->a[j];

View File

@@ -9,7 +9,7 @@ The normalize module provides several options for preprocessing full strings:
As well as normalizations for individual string tokens:
- Replace hyphens with space e.g. "quatre-vignt" => "quatre vignt"
- Replace hyphens with space e.g. "quatre-vingt" => "quatre vingt"
- Delete hyphens e.g. "auto-estrada" => "autoestrada"
- Delete final period "R." => "R"
- Delete acronym periods: "U.S.A." => "USA"
@@ -73,4 +73,4 @@ string_tree_t *normalize_string(char *str, uint64_t options);
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages);
#endif
#endif