[expand] adding a method that allows hash/equality comparisons of addresses like "100 Main" with "100 S Main St." or units like "Apt 101" vs. "#101". Instead of expanding the phrase abbreviations, this version tries its best to delete all but the root words in a string for a specific component. It's probably not perfect, but it does handle a number of edge cases related to pre/post directionals in English, e.g. "E St" will have a root word of simply "E", "Avenue E" => "E", etc. It also handles a variety of cases where the phrase could be a thoroughfare type but is really a root word, such as "Park Pl" or the famous "Avenue Rd". This can be used for near-dupe hashing to catch possible dupes for later analysis. Note that it will normalize "St Marks Pl" and "St Marks Ave" to the same thing, which is sometimes warranted (if the user typed the wrong thoroughfare), but can also be reconciled at deduping time.
This commit is contained in:
12
src/expand.h
12
src/expand.h
@@ -38,15 +38,19 @@ bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, l
|
||||
bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options);
|
||||
bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options);
|
||||
|
||||
string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options);
|
||||
|
||||
bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options);
|
||||
|
||||
void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options);
|
||||
|
||||
void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options);
|
||||
/*
 * Phrase-handling mode for address expansion. A value of this type is
 * passed as the `phrase_option` argument of expand_address_phrase_option().
 *
 * NOTE(review): the implementation is not visible from this header. Going
 * by the enumerator names and the commit description, the three modes
 * presumably mean "expand phrase abbreviations", "leave phrases as they
 * are", and "delete phrases so only root words remain" -- confirm against
 * the implementation before relying on this.
 */
typedef enum {
|
||||
EXPAND_PHRASES,
|
||||
KEEP_PHRASES,
|
||||
DELETE_PHRASES
|
||||
} expansion_phrase_option_t;
|
||||
|
||||
/*
 * Expansion entry point: takes an input string and normalization options
 * and returns an array of strings.
 * NOTE(review): presumably *n receives the number of returned strings and
 * the result is released with expansion_array_destroy() -- confirm against
 * the implementation.
 */
char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
|
||||
/*
 * Same signature as expand_address() plus an explicit phrase_option
 * selecting the phrase-handling mode (expansion_phrase_option_t).
 */
char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option);
|
||||
/*
 * NOTE(review): presumably the "root word" variant described in the commit
 * message (deletes all but the root words of a component so that e.g.
 * "100 Main" and "100 S Main St." can be compared/hashed) -- confirm
 * against the implementation.
 */
char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n);
|
||||
/*
 * Destroys an array of `n` expansion strings.
 * NOTE(review): presumably frees each string and the array itself; intended
 * for results of the expand_address*() functions -- confirm.
 */
void expansion_array_destroy(char **expansions, size_t n);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user