[expand] adding a normalization for a single non-acronym internal period where there's an expansion at the prefix/suffix (for #218 and https://github.com/openvenues/libpostal/issues/216#issuecomment-306617824). This helps in cases like "St.Michaels" or "Jln.Utara" without needing to specify concatenated prefix phrases for every possibility.

2017-10-28 02:38:15 -04:00
parent 6d430f7e9b
commit 053dca82ba
5 changed files with 402 additions and 285 deletions
--- a/src/libpostal.h
+++ b/src/libpostal.h
@@ -160,6 +160,12 @@ bool libpostal_setup_parser(void);
 bool libpostal_setup_parser_datadir(char *datadir);
 void libpostal_teardown_parser(void);

+bool libpostal_setup_language_classifier(void);
+bool libpostal_setup_language_classifier_datadir(char *datadir);
+void libpostal_teardown_language_classifier(void);
+
+/* Tokenization and token normalization APIs */
+
 typedef struct libpostal_token {
    size_t offset;
    size_t len;
@@ -190,6 +196,7 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n);
 #define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6
 #define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7
 #define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8
+#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS 1 << 9

 #define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)

@@ -209,10 +216,6 @@ typedef struct libpostal_normalized_token {

 libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);

-bool libpostal_setup_language_classifier(void);
-bool libpostal_setup_language_classifier_datadir(char *datadir);
-void libpostal_teardown_language_classifier(void);
-
 #ifdef __cplusplus
 }
 #endif