[dedupe] adding the core pairwise deduping module which ties together most of the work on this branch. Includes simple phrase-aware exact deduping methods, with per-component variations as to whether e.g. a root expansion match counts as an exact duplicate or not (in a secondary unit, "No. 2" and "Apt 2" can be considered an exact match in English whereas we wouldn't want to make that kind of assumption for street e.g. "Park Ave" and "Park Pl"). The API is fairly low-level at present, and may require a few calls. Notably, we leave the TFIDF scores or other weighting schemes to the client. Since each component gets its own dupe classification, it leaves the door open for doing more specific checks around e.g. compound house numbers/ranges in the future.

This commit is contained in:
Al
2017-12-29 04:48:00 -05:00
parent 1f1412c120
commit 098babfdee
2 changed files with 414 additions and 0 deletions

23
src/dedupe.h Normal file
View File

@@ -0,0 +1,23 @@
#ifndef DEDUPE_H
#define DEDUPE_H
#include <stdlib.h>
#include <stdio.h>
#include "libpostal.h"
#include "string_utils.h"
libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
#endif