diff --git a/README.md b/README.md index a4849ad5..cc2c5630 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # libpostal: international street address NLP -[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) +[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) +[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master) +[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) [![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors) [![OpenCollective Backers](https://opencollective.com/libpostal/backers/badge.svg)](#backers) @@ -137,6 +139,39 @@ For example, if you write a program called app.c, you can compile it like this: gcc app.c `pkg-config --cflags --libs libpostal` ``` +**On Windows (MSys2/MinGW)** + +For Windows the build procedure currently requires MSys2 and MinGW. This can be downloaded from http://msys2.org. Please follow the instructions on the MSys2 website for installation. + +Please ensure Msys2 is up-to-date by running: +``` +pacman -Syu +``` + +Install the following prerequisites: +``` +pacman -S autoconf automake curl git make libtool gcc mingw-w64-x86_64-gcc +``` + +Then to build the C library: +``` +git clone https://github.com/openvenues/libpostal +cd libpostal +cp -rf windows/* ./ +./bootstrap.sh +./configure --datadir=[...some dir with a few GB of space...] +make +make install +``` +Notes: When setting the datadir, the `C:` drive would be entered as `/c`. 
The libpostal build script automatically adds `libpostal` to the end of the path, so `/c` would become `C:\libpostal\` on Windows. + +The compiled .dll will be in the `src/.libs/` directory and should be called `libpostal-1.dll`. + +If you require a .lib import library to link this to your application, you can generate one using the Visual Studio `lib.exe` tool and the `libpostal.def` definition file: +``` +lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 +``` + Examples of parsing ------------------- diff --git a/src/Makefile.am b/src/Makefile.am index 6767219b..6a13fce6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -34,8 +34,8 @@ libpostal_CFLAGS = $(CFLAGS_O3) bench_SOURCES = bench.c bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) bench_CFLAGS = $(CFLAGS_O3) -address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c -address_parser_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) +address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/linenoise.c string_utils.c utf8proc/utf8proc.c +address_parser_LDADD = libpostal.la $(CBLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c diff --git a/src/address_parser_cli.c b/src/address_parser_cli.c index 71f1856c..a314f2f1 100644 --- a/src/address_parser_cli.c +++ b/src/address_parser_cli.c @@ -1,35 +1,15 @@ #include #include -#include "address_parser.h" -#include 
"averaged_perceptron_tagger.h" -#include "address_dictionary.h" -#include "collections.h" -#include "constants.h" -#include "file_utils.h" #include "json_encode.h" #include "libpostal.h" -#include "normalize.h" -#include "scanner.h" -#include "shuffle.h" -#include "tokens.h" #include "linenoise/linenoise.h" #include "log/log.h" - -bool load_address_parser_dependencies(void) { - if (!address_dictionary_module_setup(NULL)) { - log_error("Could not load address dictionaries\n"); - return false; - } - - log_info("address dictionary module loaded\n"); - - return true; -} +#include "strndup.h" int main(int argc, char **argv) { - char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR; + char *address_parser_dir = NULL; char *history_file = "address_parser.history"; if (argc > 1) { @@ -38,7 +18,7 @@ int main(int argc, char **argv) { printf("Loading models...\n"); - if (!libpostal_setup() || !address_parser_module_setup(address_parser_dir)) { + if (!libpostal_setup() || !libpostal_setup_parser_datadir(address_parser_dir)) { exit(EXIT_FAILURE); } @@ -54,8 +34,6 @@ int main(int argc, char **argv) { char *input = NULL; - address_parser_t *parser = get_address_parser(); - while((input = linenoise("> ")) != NULL) { if (input[0] != '\0') { @@ -63,7 +41,7 @@ int main(int argc, char **argv) { linenoiseHistorySave(history_file); /* Save the history on disk. 
*/ } - if (strcmp(input, ".exit") == 0) { + if (strncmp(input, ".exit", 5) == 0) { printf("Fin!\n"); free(input); break; @@ -95,7 +73,7 @@ int main(int argc, char **argv) { cstring_array_destroy(command); goto next_input; - } else if (string_starts_with(input, ".print_features")) { + } /*else if (string_starts_with(input, ".print_features")) { size_t num_tokens = 0; cstring_array *command = cstring_array_split(input, " ", 1, &num_tokens); if (cstring_array_num_strings(command) > 1) { @@ -111,13 +89,16 @@ int main(int argc, char **argv) { cstring_array_destroy(command); goto next_input; - } else if (strlen(input) == 0) { + }*/ else if (strlen(input) == 0) { goto next_input; } libpostal_address_parser_response_t *parsed; libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); + if (country != NULL) options.country = country; + if (language != NULL) options.language = language; + if ((parsed = libpostal_parse_address(input, options))) { printf("\n"); printf("Result:\n\n"); diff --git a/src/linenoise/linenoise.c b/src/linenoise/linenoise.c index c10557d0..c0a06588 100644 --- a/src/linenoise/linenoise.c +++ b/src/linenoise/linenoise.c @@ -105,7 +105,6 @@ * */ -#include #include #include #include @@ -114,7 +113,12 @@ #include #include #include + +#ifndef _WIN32 +#include #include +#endif //_WIN32 + #include #include "linenoise.h" @@ -123,8 +127,10 @@ static char *unsupported_term[] = {"dumb","cons25","emacs",NULL}; static linenoiseCompletionCallback *completionCallback = NULL; +#ifndef _WIN32 static struct termios orig_termios; /* In order to restore at exit.*/ static int rawmode = 0; /* For atexit() function to check if restore is needed*/ +#endif //_WIN32 static int mlmode = 0; /* Multi line mode. Default is single line. */ static int atexit_registered = 0; /* Register atexit just 1 time. 
*/ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; @@ -150,25 +156,25 @@ struct linenoiseState { }; enum KEY_ACTION{ - KEY_NULL = 0, /* NULL */ - CTRL_A = 1, /* Ctrl+a */ - CTRL_B = 2, /* Ctrl-b */ - CTRL_C = 3, /* Ctrl-c */ - CTRL_D = 4, /* Ctrl-d */ - CTRL_E = 5, /* Ctrl-e */ - CTRL_F = 6, /* Ctrl-f */ - CTRL_H = 8, /* Ctrl-h */ - TAB = 9, /* Tab */ - CTRL_K = 11, /* Ctrl+k */ - CTRL_L = 12, /* Ctrl+l */ - ENTER = 13, /* Enter */ - CTRL_N = 14, /* Ctrl-n */ - CTRL_P = 16, /* Ctrl-p */ - CTRL_T = 20, /* Ctrl-t */ - CTRL_U = 21, /* Ctrl+u */ - CTRL_W = 23, /* Ctrl+w */ - ESC = 27, /* Escape */ - BACKSPACE = 127 /* Backspace */ + KEY_NULL = 0, /* NULL */ + CTRL_A = 1, /* Ctrl+a */ + CTRL_B = 2, /* Ctrl-b */ + CTRL_C = 3, /* Ctrl-c */ + CTRL_D = 4, /* Ctrl-d */ + CTRL_E = 5, /* Ctrl-e */ + CTRL_F = 6, /* Ctrl-f */ + CTRL_H = 8, /* Ctrl-h */ + TAB = 9, /* Tab */ + CTRL_K = 11, /* Ctrl+k */ + CTRL_L = 12, /* Ctrl+l */ + ENTER = 13, /* Enter */ + CTRL_N = 14, /* Ctrl-n */ + CTRL_P = 16, /* Ctrl-p */ + CTRL_T = 20, /* Ctrl-t */ + CTRL_U = 21, /* Ctrl+u */ + CTRL_W = 23, /* Ctrl+w */ + ESC = 27, /* Escape */ + BACKSPACE = 127 /* Backspace */ }; static void linenoiseAtExit(void); @@ -207,7 +213,13 @@ static int isUnsupportedTerm(void) { char *term = getenv("TERM"); int j; - if (term == NULL) return 0; + if (term == NULL) { +#ifdef _WIN32 + return 1; +#else + return 0; +#endif // _WIN32 + } for (j = 0; unsupported_term[j]; j++) if (!strcasecmp(term,unsupported_term[j])) return 1; return 0; @@ -215,6 +227,7 @@ static int isUnsupportedTerm(void) { /* Raw mode: 1960 magic shit. */ static int enableRawMode(int fd) { +#ifndef _WIN32 struct termios raw; if (!isatty(STDIN_FILENO)) goto fatal; @@ -247,12 +260,17 @@ static int enableRawMode(int fd) { fatal: errno = ENOTTY; return -1; +#else + return 0; +#endif //_WIN32 } static void disableRawMode(int fd) { +#ifndef _WIN32 /* Don't even check the return value as it's too late. 
*/ if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) rawmode = 0; +#endif //_WIN32 } /* Use the ESC [6n escape sequence to query the horizontal cursor position @@ -283,9 +301,13 @@ static int getCursorPosition(int ifd, int ofd) { /* Try to get the number of columns in the current terminal, or assume 80 * if it fails. */ static int getColumns(int ifd, int ofd) { +#ifndef _WIN32 struct winsize ws; if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { +#else + if(1) { +#endif //_WIN32 /* ioctl() failed. Try to query the terminal itself. */ int start, cols; @@ -307,9 +329,12 @@ static int getColumns(int ifd, int ofd) { } } return cols; - } else { + } +#ifndef _WIN32 + else { return ws.ws_col; } +#endif //_WIN32 failed: return 80; diff --git a/windows/src/Makefile.am b/windows/src/Makefile.am deleted file mode 100644 index 1112f09a..00000000 --- a/windows/src/Makefile.am +++ /dev/null @@ -1,76 +0,0 @@ -# Inherited from autoconf / user-specified -CFLAGS_CONF = @CFLAGS@ -CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF) -CFLAGS_O0 = $(CFLAGS_BASE) -O0 -CFLAGS_O1 = $(CFLAGS_BASE) -O1 -CFLAGS_O2 = $(CFLAGS_BASE) -O2 -CFLAGS_O3 = $(CFLAGS_BASE) -O3 -DEFAULT_INCLUDES = -I.. -I/usr/local/include - -# Wonky but have to be able to override the user's optimization level to compile the scanner -# as it takes an unreasonably long time to compile with the optimizer on. 
-CFLAGS = - -lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c -libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) -libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS -libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined - -dist_bin_SCRIPTS = libpostal_data - -# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough -# On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding -# -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help). 
-noinst_LTLIBRARIES = libscanner.la -libscanner_la_SOURCES = klib/drand48.c scanner.c -libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA) - -noinst_PROGRAMS = libpostal bench address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test - -libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c -libpostal_LDADD = libpostal.la -libpostal_CFLAGS = $(CFLAGS_O3) -bench_SOURCES = bench.c -bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) -bench_CFLAGS = $(CFLAGS_O3) -#address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c -#address_parser_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) -#address_parser_CFLAGS = $(CFLAGS_O3) - -build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c -build_address_dictionary_CFLAGS = $(CFLAGS_O3) -build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c -build_numex_table_CFLAGS = $(CFLAGS_O3) -build_trans_table_SOURCES = strndup.c transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c -build_trans_table_CFLAGS = $(CFLAGS_O3) -address_parser_train_SOURCES = strndup.c address_parser_train.c 
address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c -address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS) -address_parser_train_CFLAGS = $(CFLAGS_O3) - -address_parser_test_SOURCES = strndup.c address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c -address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS) -address_parser_test_CFLAGS = $(CFLAGS_O3) - -language_classifier_train_SOURCES = strndup.c language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c -language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS) -language_classifier_train_CFLAGS = $(CFLAGS_O3) -language_classifier_SOURCES = strndup.c language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c 
utf8proc/utf8proc.c unicode_scripts.c -language_classifier_LDADD = libscanner.la $(CBLAS_LIBS) -language_classifier_CFLAGS = $(CFLAGS_O3) -language_classifier_test_SOURCES = strndup.c language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c -language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS) -language_classifier_test_CFLAGS = $(CFLAGS_O3) - - -pkginclude_HEADERS = libpostal.h - -if DOWNLOAD_DATA -all-local: - ${srcdir}/libpostal_data download all $(datadir)/libpostal -endif - -lexer: scanner.re - re2c -F -s -b -8 -o scanner.c scanner.re - -.PHONY: lexer