Updated linenoise to be MSys2/MinGW compatible. Updated address_parser app to use the defined libpostal api and not include internal components directly. Removed windows src Makefile as it is now the same as the standard one.

This commit is contained in:
AeroXuk
2017-11-27 01:42:25 +00:00
parent bb5535602a
commit 69e0d5d963
5 changed files with 94 additions and 129 deletions

View File

@@ -1,6 +1,8 @@
# libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal)
[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
[![OpenCollective Backers](https://opencollective.com/libpostal/backers/badge.svg)](#backers)
@@ -137,6 +139,39 @@ For example, if you write a program called app.c, you can compile it like this:
gcc app.c `pkg-config --cflags --libs libpostal`
```
**On Windows (MSys2/MinGW)**
For Windows, the build procedure currently requires MSys2 and MinGW, which can be downloaded from http://msys2.org. Please follow the installation instructions on the MSys2 website.
Please ensure MSys2 is up to date by running:
```
pacman -Syu
```
Install the following prerequisites:
```
pacman -S autoconf automake curl git make libtool gcc mingw-w64-x86_64-gcc
```
Then to build the C library:
```
git clone https://github.com/openvenues/libpostal
cd libpostal
cp -rf windows/* ./
./bootstrap.sh
./configure --datadir=[...some dir with a few GB of space...]
make
make install
```
Notes: When setting the datadir, the `C:` drive should be entered as `/c`. The libpostal build script automatically adds `libpostal` to the end of the path, so `/c` would become `C:\libpostal\` on Windows.
The compiled .dll will be in the `src/.libs/` directory and should be called `libpostal-1.dll`.
If you require a .lib import library to link this into your application, you can generate one using the Visual Studio `lib.exe` tool and the `libpostal.def` definition file:
```
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
```
Examples of parsing
-------------------

View File

@@ -34,8 +34,8 @@ libpostal_CFLAGS = $(CFLAGS_O3)
bench_SOURCES = bench.c
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
bench_CFLAGS = $(CFLAGS_O3)
address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
address_parser_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/linenoise.c string_utils.c utf8proc/utf8proc.c
address_parser_LDADD = libpostal.la $(CBLAS_LIBS)
address_parser_CFLAGS = $(CFLAGS_O3)
build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c

View File

@@ -1,35 +1,15 @@
#include <stdio.h>
#include <stdlib.h>
#include "address_parser.h"
#include "averaged_perceptron_tagger.h"
#include "address_dictionary.h"
#include "collections.h"
#include "constants.h"
#include "file_utils.h"
#include "json_encode.h"
#include "libpostal.h"
#include "normalize.h"
#include "scanner.h"
#include "shuffle.h"
#include "tokens.h"
#include "linenoise/linenoise.h"
#include "log/log.h"
bool load_address_parser_dependencies(void) {
if (!address_dictionary_module_setup(NULL)) {
log_error("Could not load address dictionaries\n");
return false;
}
log_info("address dictionary module loaded\n");
return true;
}
#include "strndup.h"
int main(int argc, char **argv) {
char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
char *address_parser_dir = NULL;
char *history_file = "address_parser.history";
if (argc > 1) {
@@ -38,7 +18,7 @@ int main(int argc, char **argv) {
printf("Loading models...\n");
if (!libpostal_setup() || !address_parser_module_setup(address_parser_dir)) {
if (!libpostal_setup() || !libpostal_setup_parser_datadir(address_parser_dir)) {
exit(EXIT_FAILURE);
}
@@ -54,8 +34,6 @@ int main(int argc, char **argv) {
char *input = NULL;
address_parser_t *parser = get_address_parser();
while((input = linenoise("> ")) != NULL) {
if (input[0] != '\0') {
@@ -63,7 +41,7 @@ int main(int argc, char **argv) {
linenoiseHistorySave(history_file); /* Save the history on disk. */
}
if (strcmp(input, ".exit") == 0) {
if (strncmp(input, ".exit", 5) == 0) {
printf("Fin!\n");
free(input);
break;
@@ -95,7 +73,7 @@ int main(int argc, char **argv) {
cstring_array_destroy(command);
goto next_input;
} else if (string_starts_with(input, ".print_features")) {
} /*else if (string_starts_with(input, ".print_features")) {
size_t num_tokens = 0;
cstring_array *command = cstring_array_split(input, " ", 1, &num_tokens);
if (cstring_array_num_strings(command) > 1) {
@@ -111,13 +89,16 @@ int main(int argc, char **argv) {
cstring_array_destroy(command);
goto next_input;
} else if (strlen(input) == 0) {
}*/ else if (strlen(input) == 0) {
goto next_input;
}
libpostal_address_parser_response_t *parsed;
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
if (country != NULL) options.country = country;
if (language != NULL) options.language = language;
if ((parsed = libpostal_parse_address(input, options))) {
printf("\n");
printf("Result:\n\n");

View File

@@ -105,7 +105,6 @@
*
*/
#include <termios.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
@@ -114,7 +113,12 @@
#include <stdlib.h>
#include <ctype.h>
#include <sys/types.h>
#ifndef _WIN32
#include <termios.h>
#include <sys/ioctl.h>
#endif //_WIN32
#include <unistd.h>
#include "linenoise.h"
@@ -123,8 +127,10 @@
static char *unsupported_term[] = {"dumb","cons25","emacs",NULL};
static linenoiseCompletionCallback *completionCallback = NULL;
#ifndef _WIN32
static struct termios orig_termios; /* In order to restore at exit.*/
static int rawmode = 0; /* For atexit() function to check if restore is needed*/
#endif //_WIN32
static int mlmode = 0; /* Multi line mode. Default is single line. */
static int atexit_registered = 0; /* Register atexit just 1 time. */
static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN;
@@ -150,25 +156,25 @@ struct linenoiseState {
};
enum KEY_ACTION{
KEY_NULL = 0, /* NULL */
CTRL_A = 1, /* Ctrl+a */
CTRL_B = 2, /* Ctrl-b */
CTRL_C = 3, /* Ctrl-c */
CTRL_D = 4, /* Ctrl-d */
CTRL_E = 5, /* Ctrl-e */
CTRL_F = 6, /* Ctrl-f */
CTRL_H = 8, /* Ctrl-h */
TAB = 9, /* Tab */
CTRL_K = 11, /* Ctrl+k */
CTRL_L = 12, /* Ctrl+l */
ENTER = 13, /* Enter */
CTRL_N = 14, /* Ctrl-n */
CTRL_P = 16, /* Ctrl-p */
CTRL_T = 20, /* Ctrl-t */
CTRL_U = 21, /* Ctrl+u */
CTRL_W = 23, /* Ctrl+w */
ESC = 27, /* Escape */
BACKSPACE = 127 /* Backspace */
KEY_NULL = 0, /* NULL */
CTRL_A = 1, /* Ctrl+a */
CTRL_B = 2, /* Ctrl-b */
CTRL_C = 3, /* Ctrl-c */
CTRL_D = 4, /* Ctrl-d */
CTRL_E = 5, /* Ctrl-e */
CTRL_F = 6, /* Ctrl-f */
CTRL_H = 8, /* Ctrl-h */
TAB = 9, /* Tab */
CTRL_K = 11, /* Ctrl+k */
CTRL_L = 12, /* Ctrl+l */
ENTER = 13, /* Enter */
CTRL_N = 14, /* Ctrl-n */
CTRL_P = 16, /* Ctrl-p */
CTRL_T = 20, /* Ctrl-t */
CTRL_U = 21, /* Ctrl+u */
CTRL_W = 23, /* Ctrl+w */
ESC = 27, /* Escape */
BACKSPACE = 127 /* Backspace */
};
static void linenoiseAtExit(void);
@@ -207,7 +213,13 @@ static int isUnsupportedTerm(void) {
char *term = getenv("TERM");
int j;
if (term == NULL) return 0;
if (term == NULL) {
#ifdef _WIN32
return 1;
#else
return 0;
#endif // _WIN32
}
for (j = 0; unsupported_term[j]; j++)
if (!strcasecmp(term,unsupported_term[j])) return 1;
return 0;
@@ -215,6 +227,7 @@ static int isUnsupportedTerm(void) {
/* Raw mode: 1960 magic shit. */
static int enableRawMode(int fd) {
#ifndef _WIN32
struct termios raw;
if (!isatty(STDIN_FILENO)) goto fatal;
@@ -247,12 +260,17 @@ static int enableRawMode(int fd) {
fatal:
errno = ENOTTY;
return -1;
#else
return 0;
#endif //_WIN32
}
static void disableRawMode(int fd) {
#ifndef _WIN32
/* Don't even check the return value as it's too late. */
if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
rawmode = 0;
#endif //_WIN32
}
/* Use the ESC [6n escape sequence to query the horizontal cursor position
@@ -283,9 +301,13 @@ static int getCursorPosition(int ifd, int ofd) {
/* Try to get the number of columns in the current terminal, or assume 80
* if it fails. */
static int getColumns(int ifd, int ofd) {
#ifndef _WIN32
struct winsize ws;
if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) {
#else
if(1) {
#endif //_WIN32
/* ioctl() failed. Try to query the terminal itself. */
int start, cols;
@@ -307,9 +329,12 @@ static int getColumns(int ifd, int ofd) {
}
}
return cols;
} else {
}
#ifndef _WIN32
else {
return ws.ws_col;
}
#endif //_WIN32
failed:
return 80;

View File

@@ -1,76 +0,0 @@
# Compiler flag definitions shared by the targets in this file.
#
# Inherited from autoconf / user-specified
# (@CFLAGS@ is replaced by configure with the CFLAGS chosen at configure time).
CFLAGS_CONF = @CFLAGS@
# Common flags: warning options, debug info (-g), and the LIBPOSTAL_DATA_DIR
# macro defined as the string "$(datadir)/libpostal". The configure-time
# flags are appended last.
CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF)
# One variant per optimization level; each target below picks the one it needs.
# The -O flag comes after $(CFLAGS_CONF) on the command line.
CFLAGS_O0 = $(CFLAGS_BASE) -O0
CFLAGS_O1 = $(CFLAGS_BASE) -O1
CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
# Header search path: the parent directory and /usr/local/include.
DEFAULT_INCLUDES = -I.. -I/usr/local/include
# Wonky but have to be able to override the user's optimization level to compile the scanner
# as it takes an unreasonably long time to compile with the optimizer on.
# CFLAGS itself is therefore emptied; the configure-time value is still
# available through CFLAGS_CONF above.
CFLAGS =
# The installed libtool library, libpostal.la.
lib_LTLIBRARIES = libpostal.la
libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
# The scanner convenience library (defined below) and the CBLAS libraries are
# added to libpostal.la.
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
# Built at -O2 with LIBPOSTAL_EXPORTS defined.
# NOTE(review): LIBPOSTAL_EXPORTS presumably switches the public header to
# export (rather than import) symbols when building a DLL - confirm in libpostal.h.
libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS
# The library version comes from configure (@LIBPOSTAL_SO_VERSION@).
# -no-undefined tells libtool the library has no unresolved symbols.
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined
# The libpostal_data script is distributed and installed alongside the binaries.
dist_bin_SCRIPTS = libpostal_data
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
# On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding
# -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help).
# libscanner.la is a noinst (not installed) libtool library.
noinst_LTLIBRARIES = libscanner.la
libscanner_la_SOURCES = klib/drand48.c scanner.c
libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA)
# Programs that are built but not installed. Each program is listed exactly
# once (address_parser_train and address_parser_test were previously listed
# twice).
noinst_PROGRAMS = libpostal bench address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table language_classifier_train language_classifier language_classifier_test
# libpostal: command-line program built from main.c and linked against the
# libpostal library.
libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c
libpostal_LDADD = libpostal.la
libpostal_CFLAGS = $(CFLAGS_O3)
# bench: built from bench.c; links libpostal, the scanner library and CBLAS.
bench_SOURCES = bench.c
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
bench_CFLAGS = $(CFLAGS_O3)
# NOTE(review): the address_parser program is disabled in this makefile (it is
# also absent from noinst_PROGRAMS) - presumably because linenoise did not
# build under MSys2/MinGW; confirm before re-enabling.
#address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
#address_parser_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
#address_parser_CFLAGS = $(CFLAGS_O3)
# Table/dictionary builder programs. Each one compiles the sources it needs
# directly (no LDADD is given, so none links libpostal.la) and is built at -O3.
build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
build_numex_table_CFLAGS = $(CFLAGS_O3)
build_trans_table_SOURCES = strndup.c transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
build_trans_table_CFLAGS = $(CFLAGS_O3)
# Address parser training and test programs. Both compile the parser sources
# directly and link only the scanner library and CBLAS (not libpostal.la).
address_parser_train_SOURCES = strndup.c address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
address_parser_train_CFLAGS = $(CFLAGS_O3)
address_parser_test_SOURCES = strndup.c address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
address_parser_test_CFLAGS = $(CFLAGS_O3)
# Language classifier programs (trainer, command-line tool, and test). Each
# compiles its sources directly and links the scanner library and CBLAS.
language_classifier_train_SOURCES = strndup.c language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
language_classifier_train_CFLAGS = $(CFLAGS_O3)
language_classifier_SOURCES = strndup.c language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
language_classifier_CFLAGS = $(CFLAGS_O3)
language_classifier_test_SOURCES = strndup.c language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
language_classifier_test_CFLAGS = $(CFLAGS_O3)
# libpostal.h is the only header installed (into the package include directory).
pkginclude_HEADERS = libpostal.h
# When the DOWNLOAD_DATA conditional is enabled, the default build also runs
# the libpostal_data script to download all data files into
# $(datadir)/libpostal.
if DOWNLOAD_DATA
all-local:
	${srcdir}/libpostal_data download all $(datadir)/libpostal
endif

# Regenerate scanner.c from scanner.re with re2c. Not part of the default
# build; run explicitly with `make lexer`.
lexer: scanner.re
	re2c -F -s -b -8 -o scanner.c scanner.re

# lexer names a command rather than a file, so it must always be considered
# out of date.
.PHONY: lexer