Merge branch 'master' into parser-data

This commit is contained in:
Al
2016-03-23 15:54:36 -04:00
11 changed files with 3430 additions and 3314 deletions

View File

@@ -1,4 +1,7 @@
language: c
branches:
only:
- master
env:
global:
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="

View File

@@ -0,0 +1 @@
senza numero civico|snc|s.n.c.|s n c

View File

@@ -15,12 +15,12 @@ corso|c.so|cso
corte|c.te|cte
cortile|c.ile|ctile
ferrata|f.rata|frata
fondamenta|f.ta|fta
fondamenta|f.ta|fta|fon
forca|f.ca|fca
forcella|forc.la|forcla
frazione|fraz.e|fraze
frazione|fraz.e|fraze|frz
frazioni|fraz.i|frazi
galleria|gall.a|galla
galleria|gall.a|galla|gal
gallerie|gall.e|galle
genna|g.na|gna
genne|g.ne|gne
@@ -32,26 +32,35 @@ piazzale|p.le|ple
piazzetta|p.ta|pta
ponte|p.te|pte
porta|p.ta|pta
porto|pto
provinciale
regione|reg
salizada|s.da|sda
stazione|staz
strada|s|str.a|stra|str
strada comunale|sc|s.c.|s.c|s c
strada provinciale|sp|s.p.|s.p|s p|str provinciale|s provinciale|str.prov|str prov|s.prov|s prov
strada comunale|sc|s.c.|s.c|s c|stc
strada provinciale|sp|s.p.|s.p|s p|str provinciale|s provinciale|str.prov|str prov|s.prov|s prov|stp
strada regionale|sr|s.r.|s.r|s r|str.reg|str reg|str r|str.r|str regionale|s regionale
strada statale|ss|s.s.|s.s|s s|str.s|str s|str.st|str st|str statale|s statale
strada vicinale|sv|s.v.|s.v|s v|str.vic|str vic|str vicinale|s vicinale
strada statale|ss|s.s.|s.s|s s|str.s|str s|str.st|str st|str statale|s statale|sts
strada vicinale|sv|s.v.|s.v|s v|str.vic|str vic|str vicinale|s vicinale|svc
stretta|str.ta|strta
strette|str.te|strte
stretti|str.ti|strti
stretto|str.to|strto
superstrada
tangenziale
traversa
traversa|tra
traversa nuova|tnu
traversa privata|tpr
traversa strada statale|tra s.s.
traversa vicinale|tvc
via|v
via comunale|vc|v.c.|v.c|v c
via provinciale|vp|v.p.|v.p|v p|v provinciale
via statale|vs|v.s.|v.s|v s
via vicinale|vvc
viale|v.le|vle|vl
vicolo|v.lo|vlo
vico
vico|vic
vocabolo|voc
zona industriale|zin

View File

@@ -10,4 +10,4 @@ giardino|g.no|gno
numero|nº|no|n.º|n.o|n|#
officina|off.a|off
palazzo|pal.zo|palzo
piano|pº|p.º|p.o|p.nº|p.no|pnº|pno
piano|pº|p.º|p.o|p.nº|p.no|pnº|pno

View File

@@ -5,6 +5,8 @@
#define ADDRESS_DICTIONARY_SIGNATURE 0xBABABABA
#define ADDRESS_DICTIONARY_SETUP_ERROR "address_dictionary module not setup, call libpostal_setup() or address_dictionary_module_setup()\n"
address_dictionary_t *address_dict = NULL;
address_dictionary_t *get_address_dictionary(void) {
@@ -12,28 +14,47 @@ address_dictionary_t *get_address_dictionary(void) {
}
/*
 * Look up the expansions stored under `key` in the global address dictionary.
 *
 * Returns the value held in the str_expansions hash for `key`, or NULL when
 * the key is NULL or not present. If the dictionary (or its expansions hash)
 * has not been set up, ADDRESS_DICTIONARY_SETUP_ERROR is logged and NULL is
 * returned.
 */
address_expansion_array *address_dictionary_get_expansions(char *key) {
    /* Single setup guard: a silent early return placed ahead of this one
     * would make the log_error call unreachable. */
    if (address_dict == NULL || address_dict->expansions == NULL) {
        log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
        return NULL;
    }

    /* A NULL key cannot be present in the hash; do not hand it to kh_get. */
    if (key == NULL) {
        return NULL;
    }

    khiter_t k = kh_get(str_expansions, address_dict->expansions, key);
    if (k == kh_end(address_dict->expansions)) {
        return NULL;
    }
    return kh_value(address_dict->expansions, k);
}
/*
 * Return the index the next canonical string will receive, i.e. the current
 * number of strings in address_dict->canonical.
 *
 * Returns -1 and logs ADDRESS_DICTIONARY_SETUP_ERROR when the dictionary (or
 * its canonical string array) has not been set up.
 */
int32_t address_dictionary_next_canonical_index(void) {
    /* Single setup guard: a silent early return placed ahead of this one
     * would make the log_error call unreachable. */
    if (address_dict == NULL || address_dict->canonical == NULL) {
        log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
        return -1;
    }

    return (int32_t)cstring_array_num_strings(address_dict->canonical);
}
/*
 * Append `canonical` to the global dictionary's canonical string array.
 *
 * Returns true after the string has been handed to cstring_array_add_string.
 * Returns false and logs ADDRESS_DICTIONARY_SETUP_ERROR when the dictionary
 * (or its canonical string array) has not been set up.
 */
bool address_dictionary_add_canonical(char *canonical) {
    /* Single setup guard: a silent early return placed ahead of this one
     * would make the log_error call unreachable. */
    if (address_dict == NULL || address_dict->canonical == NULL) {
        log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
        return false;
    }

    cstring_array_add_string(address_dict->canonical, canonical);
    return true;
}
/*
 * Fetch the canonical string stored at `index` in the global dictionary.
 *
 * Returns the string from address_dict->canonical, or NULL when `index` is
 * out of range. When the dictionary (or its canonical string array) has not
 * been set up, ADDRESS_DICTIONARY_SETUP_ERROR is logged and NULL is returned.
 * An out-of-range index is not logged.
 */
char *address_dictionary_get_canonical(uint32_t index) {
    /* Single setup guard: a silent early return placed ahead of this one
     * would make the log_error call unreachable. */
    if (address_dict == NULL || address_dict->canonical == NULL) {
        log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
        return NULL;
    }

    /* The string count is the index the *next* added string would get (see
     * address_dictionary_next_canonical_index), so index == count is already
     * out of range: compare with >=, not >. */
    if (index >= cstring_array_num_strings(address_dict->canonical)) {
        return NULL;
    }

    return cstring_array_get_string(address_dict->canonical, index);
}
bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) {
if (address_dict == NULL || address_dict->expansions == NULL) {
log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
return false;
}
if (name == NULL) return false;
int ret;
@@ -162,6 +183,10 @@ static trie_prefix_result_t get_language_prefix(char *lang) {
bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_array **phrases) {
if (str == NULL) return false;
if (address_dict == NULL) {
log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
return false;
}
trie_prefix_result_t prefix = get_language_prefix(lang);
@@ -185,6 +210,10 @@ phrase_array *search_address_dictionaries(char *str, char *lang) {
bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tokens, char *lang, phrase_array **phrases) {
if (str == NULL) return false;
if (address_dict == NULL) {
log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
return false;
}
trie_prefix_result_t prefix = get_language_prefix(lang);
@@ -208,6 +237,10 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens,
phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) {
if (str == NULL) return NULL_PHRASE;
if (address_dict == NULL) {
log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
return NULL_PHRASE;
}
trie_prefix_result_t prefix = get_language_prefix(lang);
@@ -221,6 +254,10 @@ phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) {
phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) {
if (str == NULL) return NULL_PHRASE;
if (address_dict == NULL) {
log_error(ADDRESS_DICTIONARY_SETUP_ERROR);
return NULL_PHRASE;
}
trie_prefix_result_t prefix = get_language_prefix(lang);

File diff suppressed because it is too large Load Diff

View File

@@ -803,13 +803,18 @@ address_parser_response_t *address_parser_response_new(void) {
address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context) {
if (address == NULL || context == NULL) return NULL;
address_parser_t *parser = get_address_parser();
if (parser == NULL) {
log_error("parser is not setup, call libpostal_setup_address_parser()\n");
return NULL;
}
char *normalized = address_parser_normalize_string(address);
bool is_normalized = normalized != NULL;
if (!is_normalized) {
normalized = address;
}
address_parser_t *parser = get_address_parser();
averaged_perceptron_t *model = parser->model;
token_array *tokens = tokenize(normalized);

View File

@@ -10,6 +10,8 @@
#define LANGUAGE_CLASSIFIER_SIGNATURE 0xCCCCCCCC
#define LANGUAGE_CLASSIFIER_SETUP_ERROR "language_classifier not loaded, run libpostal_setup_language_classifier()\n"
#define MIN_PROB (0.05 - DBL_EPSILON)
static language_classifier_t *language_classifier = NULL;
@@ -58,7 +60,7 @@ language_classifier_response_t *classify_languages(char *address) {
language_classifier_t *classifier = get_language_classifier();
if (classifier == NULL) {
log_error("classifier NULL\n");
log_error(LANGUAGE_CLASSIFIER_SETUP_ERROR);
return NULL;
}

View File

@@ -158,8 +158,6 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
ptr += script_len;
}
char_array *normalized = char_array_new_size(len);
add_latin_alternatives(tree, str, len, options);
size_t non_latin_scripts = kh_size(scripts);

View File

@@ -3,8 +3,12 @@
#include "numex.h"
#include "file_utils.h"
#include "log/log.h"
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"
#define SEPARATOR_TOKENS "-"
#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + FLT_EPSILON)
@@ -118,6 +122,7 @@ void numex_language_destroy(numex_language_t *self) {
bool numex_table_add_language(numex_language_t *language) {
if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR);
return false;
}
@@ -130,6 +135,7 @@ bool numex_table_add_language(numex_language_t *language) {
numex_language_t *get_numex_language(char *name) {
if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR);
return NULL;
}
@@ -616,7 +622,10 @@ static inline numex_rule_t get_numex_rule(size_t i) {
}
numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (numex_table == NULL) return NULL;
if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR);
return NULL;
}
trie_t *trie = numex_table->trie;
if (trie == NULL) return NULL;
@@ -897,6 +906,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) {
if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR);
return NULL;
}

View File

@@ -2,6 +2,8 @@
#include "transliterate.h"
#include "file_utils.h"
#include "log/log.h"
#define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA
#define NFD "NFD"
@@ -664,7 +666,14 @@ static char *replace_groups(trie_t *trie, char *str, char *replacement, group_ca
}
char *transliterate(char *trans_name, char *str, size_t len) {
if (trans_name == NULL || str == NULL || trans_table == NULL) return NULL;
if (trans_name == NULL || str == NULL) return NULL;
transliteration_table_t *trans_table = get_transliteration_table();
if (trans_table == NULL) {
log_error("transliteration table is NULL. Call libpostal_setup() or transliteration_module_setup()\n");
return NULL;
}
trie_t *trie = trans_table->trie;