[osm] Avoid using the alternate name (e.g. Brooklyn instead of Kings County) when it is the same as city

This commit is contained in:
Al
2015-12-05 14:21:07 -05:00
parent 7c26317903
commit f41158b8b3
2 changed files with 48 additions and 2 deletions

View File

@@ -696,6 +696,13 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
if place_type == 'borough' or polygon_type == 'local_admin': if place_type == 'borough' or polygon_type == 'local_admin':
neighborhood_level = AddressFormatter.CITY_DISTRICT neighborhood_level = AddressFormatter.CITY_DISTRICT
# Optimization so we don't use Brooklyn for Kings County
city_name = address_components.get(AddressFormatter.CITY)
if name == city_name:
# The chosen name duplicates the city: look the name up again on the neighborhood record, preferring name_key and falling back to raw_name_key.
name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
if not name or name == city_name:
continue
neighborhood_levels[neighborhood_level].append(name) neighborhood_levels[neighborhood_level].append(name)
for component, neighborhoods in neighborhood_levels.iteritems(): for component, neighborhoods in neighborhood_levels.iteritems():

View File

@@ -71,6 +71,41 @@ with the general error-driven averaged perceptron.
#define SEPARATOR_LABEL "sep" #define SEPARATOR_LABEL "sep"
#define FIELD_SEPARATOR_LABEL "fsep" #define FIELD_SEPARATOR_LABEL "fsep"
/*
 * Single-bit flags, one per address component.
 *
 * Every replacement list is parenthesized so that a flag always behaves as
 * one operand. Without the parentheses, an expression such as
 * `ADDRESS_COMPONENT_ROAD + 1` would expand to `1 << 4 + 1`, i.e. `1 << 5`.
 *
 * The bit positions are not contiguous (bits 2, 3, 5 and 6 are not assigned
 * here); the values are kept exactly as they were.
 * NOTE(review): presumably these are the flags stored in the 16-bit
 * `components` bitset of address_parser_types_t -- confirm against callers.
 */
#define ADDRESS_COMPONENT_HOUSE          (1 << 0)
#define ADDRESS_COMPONENT_HOUSE_NUMBER   (1 << 1)
#define ADDRESS_COMPONENT_ROAD           (1 << 4)
#define ADDRESS_COMPONENT_SUBURB         (1 << 7)
#define ADDRESS_COMPONENT_CITY_DISTRICT  (1 << 8)
#define ADDRESS_COMPONENT_CITY           (1 << 9)
#define ADDRESS_COMPONENT_STATE_DISTRICT (1 << 10)
#define ADDRESS_COMPONENT_STATE          (1 << 11)
#define ADDRESS_COMPONENT_POSTAL_CODE    (1 << 12)
#define ADDRESS_COMPONENT_COUNTRY        (1 << 13)
/*
 * Sequential indices for the address component types, listed in the same
 * order as the ADDRESS_COMPONENT_* bit flags. NUM_ADDRESS_PARSER_TYPES is
 * not a component; it is the number of enumerators that precede it.
 *
 * The enum is deliberately declared without a trailing declarator: naming an
 * object here would define a global variable in every translation unit that
 * includes this header.
 */
enum {
    ADDRESS_PARSER_HOUSE,
    ADDRESS_PARSER_HOUSE_NUMBER,
    ADDRESS_PARSER_ROAD,
    ADDRESS_PARSER_SUBURB,
    ADDRESS_PARSER_CITY_DISTRICT,
    ADDRESS_PARSER_CITY,
    ADDRESS_PARSER_STATE_DISTRICT,
    ADDRESS_PARSER_STATE,
    ADDRESS_PARSER_POSTAL_CODE,
    ADDRESS_PARSER_COUNTRY,
    NUM_ADDRESS_PARSER_TYPES
};
/*
 * Two views of one 32-bit word:
 *   - value:       the whole word as a single uint32_t
 *   - components:  16-bit bitset of components
 *   - most_common: 16-bit field holding the most common component as a
 *                  short integer enum value
 * The two bit-fields live in an anonymous struct, so they are accessed
 * directly on the union (t.components, t.most_common).
 */
union address_parser_types {
    uint32_t value;
    struct {
        uint32_t components : 16;
        uint32_t most_common : 16;
    };
};

typedef union address_parser_types address_parser_types_t;
typedef struct address_parser_context { typedef struct address_parser_context {
char *language; char *language;
char *country; char *country;
@@ -84,6 +119,9 @@ typedef struct address_parser_context {
phrase_array *geodb_phrases; phrase_array *geodb_phrases;
// Index in geodb_phrases or -1 // Index in geodb_phrases or -1
int64_array *geodb_phrase_memberships; int64_array *geodb_phrase_memberships;
phrase_array *component_phrases;
// Index in component_phrases or -1
int64_array *component_phrase_memberships;
tokenized_string_t *tokenized_str; tokenized_string_t *tokenized_str;
} address_parser_context_t; } address_parser_context_t;
@@ -97,6 +135,7 @@ typedef struct address_parser_response {
typedef struct address_parser { typedef struct address_parser {
averaged_perceptron_t *model; averaged_perceptron_t *model;
trie_t *vocab; trie_t *vocab;
trie_t *phrase_types;
} address_parser_t; } address_parser_t;
// General usage // General usage
@@ -115,7 +154,7 @@ void address_parser_normalize_token(cstring_array *array, char *str, token_t tok
address_parser_context_t *address_parser_context_new(void); address_parser_context_t *address_parser_context_new(void);
void address_parser_context_destroy(address_parser_context_t *self); void address_parser_context_destroy(address_parser_context_t *self);
void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country); void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country);
// Feature function // Feature function
bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2); bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2);
@@ -131,4 +170,4 @@ bool address_parser_module_setup(char *dir);
void address_parser_module_teardown(void); void address_parser_module_teardown(void);
#endif #endif