diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 00000000..eac7a36c --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,26 @@ +version: 1.0.{build} + +image: Visual Studio 2015 +platform: x64 + +environment: + matrix: + - COMPILER: msys2 + PLATFORM: x64 + MSYS2_ARCH: x86_64 + MSYS2_DIR: msys64 + MSYSTEM: MINGW64 + BIT: 64 + +build_script: + - '%APPVEYOR_BUILD_FOLDER%\win_build.bat' + +test_script: + - '%APPVEYOR_BUILD_FOLDER%\test\test_libpostal.exe' + +after_build: + - 7z a libpostal.zip %APPVEYOR_BUILD_FOLDER%\libpostal.dll %APPVEYOR_BUILD_FOLDER%\libpostal.def %APPVEYOR_BUILD_FOLDER%\libpostal.exp %APPVEYOR_BUILD_FOLDER%\libpostal.lib %APPVEYOR_BUILD_FOLDER%\src\libpostal.h + +artifacts: + - path: libpostal.zip + name: libpostal diff --git a/README.md b/README.md index a4849ad5..1ce427b8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # libpostal: international street address NLP -[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) +[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) +[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master) +[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) [![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors) [![OpenCollective Backers](https://opencollective.com/libpostal/backers/badge.svg)](#backers) @@ -89,8 +91,8 @@ Individual users can also help support open geo NLP research by making a monthly -Installation ------------- +Installation (Mac/Linux) +------------------------ 
Before you install, make sure you have the following prerequisites: @@ -137,6 +139,42 @@ For example, if you write a program called app.c, you can compile it like this: gcc app.c `pkg-config --cflags --libs libpostal` ``` +Installation (Windows) +---------------------- + +**MSys2/MinGW** + +For Windows the build procedure currently requires MSys2 and MinGW. This can be downloaded from http://msys2.org. Please follow the instructions on the MSys2 website for installation. + +Please ensure MSys2 is up-to-date by running: +``` +pacman -Syu +``` + +Install the following prerequisites: +``` +pacman -S autoconf automake curl git make libtool gcc mingw-w64-x86_64-gcc +``` + +Then to build the C library: +``` +git clone https://github.com/openvenues/libpostal +cd libpostal +cp -rf windows/* ./ +./bootstrap.sh +./configure --datadir=[...some dir with a few GB of space...] +make +make install +``` +Note: When setting the datadir, the `C:` drive would be entered as `/c`. The libpostal build script automatically adds `libpostal` to the end of the path, so `/c` would become `C:\libpostal\` on Windows. + +The compiled .dll will be in the `src/.libs/` directory and should be called `libpostal-1.dll`. + +If you require a .lib import library to link this to your application, you can generate one using the Visual Studio `lib.exe` tool and the `libpostal.def` definition file: +``` +lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 +``` + Examples of parsing ------------------- @@ -640,6 +678,12 @@ Bug reports, issues and pull requests are welcome. Please read the [contributing Submit issues at: https://github.com/openvenues/libpostal/issues. + +Shoutouts +--------- + +Special thanks to @BenK10 for the initial Windows build and @AeroXuk for integrating it seamlessly into the project and setting up an AppVeyor build. 
+ License ------- diff --git a/configure.ac b/configure.ac index 4f4bfcc1..0dea296c 100644 --- a/configure.ac +++ b/configure.ac @@ -48,7 +48,7 @@ AC_TYPE_UINT8_T AC_CHECK_TYPES([ptrdiff_t]) # Checks for library functions. -AC_CHECK_FUNCS([malloc realloc getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup]) +AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup]) AC_CONFIG_FILES([Makefile libpostal.pc diff --git a/libpostal.def b/libpostal.def new file mode 100644 index 00000000..7deba4f7 --- /dev/null +++ b/libpostal.def @@ -0,0 +1,17 @@ +EXPORTS +libpostal_get_default_options +libpostal_expand_address +libpostal_expansion_array_destroy +libpostal_address_parser_response_destroy +libpostal_get_address_parser_default_options +libpostal_parse_address +libpostal_parser_print_features +libpostal_setup +libpostal_setup_datadir +libpostal_teardown +libpostal_setup_parser +libpostal_setup_parser_datadir +libpostal_teardown_parser +libpostal_setup_language_classifier +libpostal_setup_language_classifier_datadir +libpostal_teardown_language_classifier diff --git a/src/Makefile.am b/src/Makefile.am index 6707d5aa..6a13fce6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,10 +12,10 @@ DEFAULT_INCLUDES = -I.. 
-I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) -libpostal_la_CFLAGS = $(CFLAGS_O2) -libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ +libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS +libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined dist_bin_SCRIPTS = libpostal_data @@ -23,42 +23,42 @@ dist_bin_SCRIPTS = libpostal_data # On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding # -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help). 
noinst_LTLIBRARIES = libscanner.la -libscanner_la_SOURCES = scanner.c -libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA) +libscanner_la_SOURCES = klib/drand48.c scanner.c +libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA) noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test -libpostal_SOURCES = main.c json_encode.c +libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c libpostal_LDADD = libpostal.la libpostal_CFLAGS = $(CFLAGS_O3) bench_SOURCES = bench.c bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) bench_CFLAGS = $(CFLAGS_O3) -address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c -address_parser_LDADD = libscanner.la $(CBLAS_LIBS) +address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/linenoise.c string_utils.c utf8proc/utf8proc.c +address_parser_LDADD = libpostal.la $(CBLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) -build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c +build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_CFLAGS = 
$(CFLAGS_O3) -build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c +build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c build_numex_table_CFLAGS = $(CFLAGS_O3) -build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c +build_trans_table_SOURCES = strndup.c transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c build_trans_table_CFLAGS = $(CFLAGS_O3) -address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c +address_parser_train_SOURCES = strndup.c address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS) address_parser_train_CFLAGS = $(CFLAGS_O3) -address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c 
address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c -address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS) +address_parser_test_SOURCES = strndup.c address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c +address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS) address_parser_test_CFLAGS = $(CFLAGS_O3) -language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c +language_classifier_train_SOURCES = strndup.c language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_train_CFLAGS = $(CFLAGS_O3) -language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c 
logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c +language_classifier_SOURCES = strndup.c language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c language_classifier_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_CFLAGS = $(CFLAGS_O3) -language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c +language_classifier_test_SOURCES = strndup.c language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_test_CFLAGS = $(CFLAGS_O3) diff --git a/src/address_parser.c b/src/address_parser.c index 7e6de097..613b619d 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -49,6 +49,13 @@ address_parser_t *get_address_parser(void) { return parser; } +bool address_parser_print_features(bool print_features) { + if (parser == NULL) return false; + + parser->options.print_features = print_features; + return true; +} + bool address_parser_save(address_parser_t *self, 
char *output_dir) { if (self == NULL || output_dir == NULL) return false; diff --git a/src/address_parser.h b/src/address_parser.h index 2518a9ef..4c5e699f 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -215,6 +215,7 @@ address_parser_t *address_parser_new_options(parser_options_t options); address_parser_t *get_address_parser(void); bool address_parser_load(char *dir); +bool address_parser_print_features(bool print_features); libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country); void address_parser_destroy(address_parser_t *self); diff --git a/src/address_parser_cli.c b/src/address_parser_cli.c index 71f1856c..9c50a8c9 100644 --- a/src/address_parser_cli.c +++ b/src/address_parser_cli.c @@ -1,35 +1,15 @@ #include #include -#include "address_parser.h" -#include "averaged_perceptron_tagger.h" -#include "address_dictionary.h" -#include "collections.h" -#include "constants.h" -#include "file_utils.h" #include "json_encode.h" #include "libpostal.h" -#include "normalize.h" -#include "scanner.h" -#include "shuffle.h" -#include "tokens.h" #include "linenoise/linenoise.h" #include "log/log.h" - -bool load_address_parser_dependencies(void) { - if (!address_dictionary_module_setup(NULL)) { - log_error("Could not load address dictionaries\n"); - return false; - } - - log_info("address dictionary module loaded\n"); - - return true; -} +#include "strndup.h" int main(int argc, char **argv) { - char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR; + char *address_parser_dir = NULL; char *history_file = "address_parser.history"; if (argc > 1) { @@ -38,7 +18,7 @@ int main(int argc, char **argv) { printf("Loading models...\n"); - if (!libpostal_setup() || !address_parser_module_setup(address_parser_dir)) { + if (!libpostal_setup() || !libpostal_setup_parser_datadir(address_parser_dir)) { exit(EXIT_FAILURE); } @@ -54,8 +34,6 @@ int main(int argc, char **argv) { char *input = NULL; - address_parser_t *parser = 
get_address_parser(); - while((input = linenoise("> ")) != NULL) { if (input[0] != '\0') { @@ -63,7 +41,7 @@ int main(int argc, char **argv) { linenoiseHistorySave(history_file); /* Save the history on disk. */ } - if (strcmp(input, ".exit") == 0) { + if (strncmp(input, ".exit", 5) == 0) { printf("Fin!\n"); free(input); break; @@ -101,12 +79,12 @@ int main(int argc, char **argv) { if (cstring_array_num_strings(command) > 1) { char *flag = cstring_array_get_string(command, 1); if (string_compare_case_insensitive(flag, "off") == 0) { - parser->options.print_features = false; + libpostal_parser_print_features(false); } else if (string_compare_case_insensitive(flag, "on") == 0) { - parser->options.print_features = true; + libpostal_parser_print_features(true); } } else { - parser->options.print_features = true; + libpostal_parser_print_features(true); } cstring_array_destroy(command); @@ -118,6 +96,9 @@ int main(int argc, char **argv) { libpostal_address_parser_response_t *parsed; libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); + if (country != NULL) options.country = country; + if (language != NULL) options.language = language; + if ((parsed = libpostal_parse_address(input, options))) { printf("\n"); printf("Result:\n\n"); diff --git a/src/klib/drand48.c b/src/klib/drand48.c new file mode 100644 index 00000000..2f4335bd --- /dev/null +++ b/src/klib/drand48.c @@ -0,0 +1,76 @@ + +/* + * Copyright (c) 1993 Martin Birgmeier + * All rights reserved. + + * You may redistribute unmodified or modified versions of this source + * code provided that the above copyright notice and this and the + * following conditions are retained. + + * This software is provided ``as is'', and comes with no warranties + * of any kind. I shall in no event be liable for anything that happens + * to anyone/anything when using this software. 
+*/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#ifndef HAVE_DRAND48 + +#include <math.h> +#include "drand48.h" + +#define RAND48_SEED_0 (0x330e) +#define RAND48_SEED_1 (0xabcd) +#define RAND48_SEED_2 (0x1234) +#define RAND48_MULT_0 (0xe66d) +#define RAND48_MULT_1 (0xdeec) +#define RAND48_MULT_2 (0x0005) +#define RAND48_ADD (0x000b) + +unsigned short _rand48_seed[3] = { + RAND48_SEED_0, + RAND48_SEED_1, + RAND48_SEED_2 +}; + +unsigned short _rand48_mult[3] = { + RAND48_MULT_0, + RAND48_MULT_1, + RAND48_MULT_2 +}; + +unsigned short _rand48_add = RAND48_ADD; + +void _dorand48(unsigned short xseed[3]) /* advance xseed one step: X = (mult * X + add) mod 2^48, held as three 16-bit limbs, least significant first */ +{ + unsigned long long accu; /* at least 64 bits, so limb products and carries cannot wrap even where long is 32 bits */ + unsigned short temp[2]; + + accu = (unsigned long long) _rand48_mult[0] * xseed[0] + _rand48_add; + temp[0] = (unsigned short) accu; /* lower 16 bits */ + accu >>= sizeof(unsigned short) * 8; + accu += (unsigned long long) _rand48_mult[0] * xseed[1] + (unsigned long long) _rand48_mult[1] * xseed[0]; + temp[1] = (unsigned short) accu; /* middle 16 bits */ + accu >>= sizeof(unsigned short) * 8; + accu += (unsigned long long) _rand48_mult[0] * xseed[2] + (unsigned long long) _rand48_mult[1] * xseed[1] + (unsigned long long) _rand48_mult[2] * xseed[0]; /* widen before multiplying: unsigned short operands would promote to int and could overflow */ + xseed[0] = temp[0]; + xseed[1] = temp[1]; + xseed[2] = (unsigned short) accu; /* upper 16 bits */ +} + +double erand48(unsigned short xseed[3]) /* step xseed, then return the new state scaled by 2^-48 */ +{ + _dorand48(xseed); + return ldexp((double) xseed[0], -48) + + ldexp((double) xseed[1], -32) + + ldexp((double) xseed[2], -16); +} + +double drand48(void) /* erand48 driven by the global seed _rand48_seed */ +{ + return erand48(_rand48_seed); +} + +#endif // HAVE_DRAND48 diff --git a/src/klib/drand48.h b/src/klib/drand48.h new file mode 100644 index 00000000..56b55d3c --- /dev/null +++ b/src/klib/drand48.h @@ -0,0 +1,47 @@ + +/* + * Copyright (c) 1993 Martin Birgmeier + * All rights reserved. + + * You may redistribute unmodified or modified versions of this source + * code provided that the above copyright notice and this and the + * following conditions are retained. 
+ + * This software is provided ``as is'', and comes with no warranties + * of any kind. I shall in no event be liable for anything that happens + * to anyone/anything when using this software. +*/ + +#ifndef _DRAND48_H +#define _DRAND48_H + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#ifndef HAVE_DRAND48 + +#define RAND48_SEED_0 (0x330e) +#define RAND48_SEED_1 (0xabcd) +#define RAND48_SEED_2 (0x1234) +#define RAND48_MULT_0 (0xe66d) +#define RAND48_MULT_1 (0xdeec) +#define RAND48_MULT_2 (0x0005) +#define RAND48_ADD (0x000b) + +extern unsigned short _rand48_seed[3]; /* declarations only: the single definitions live in drand48.c, so including this header from several files does not produce duplicate symbols */ + +extern unsigned short _rand48_mult[3]; + +extern unsigned short _rand48_add; + +void _dorand48(unsigned short xseed[3]); + +double erand48(unsigned short xseed[3]); + +double drand48(void); + +#endif // HAVE_DRAND48 + +#endif // _DRAND48_H + diff --git a/src/klib/ksort.h b/src/klib/ksort.h index d2fb3532..1c8342fd 100644 --- a/src/klib/ksort.h +++ b/src/klib/ksort.h @@ -45,6 +45,7 @@ #include #include +#include "drand48.h" typedef struct { void *left, *right; diff --git a/src/libpostal.c b/src/libpostal.c index 9209de11..2c0a8521 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -1191,6 +1191,10 @@ libpostal_address_parser_response_t *libpostal_parse_address(char *address, libp return parsed; } +bool libpostal_parser_print_features(bool print_features) { + return address_parser_print_features(print_features); +} + bool libpostal_setup_datadir(char *datadir) { char *transliteration_path = NULL; char *numex_path = NULL; diff --git a/src/libpostal.h b/src/libpostal.h index 2c651817..28ae900e 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -10,6 +10,18 @@ extern "C" { #include #include +#ifdef _WIN32 +#ifdef LIBPOSTAL_EXPORTS +#define LIBPOSTAL_EXPORT __declspec(dllexport) +#else +#define LIBPOSTAL_EXPORT __declspec(dllimport) +#endif +#elif __GNUC__ >= 4 +#define LIBPOSTAL_EXPORT __attribute__ ((visibility("default"))) +#else +#define LIBPOSTAL_EXPORT +#endif + #define LIBPOSTAL_MAX_LANGUAGE_LEN 4 // Doing these as 
#defines so we can duplicate the values exactly in Python @@ -123,11 +135,11 @@ typedef struct libpostal_normalize_options { } libpostal_normalize_options_t; -libpostal_normalize_options_t libpostal_get_default_options(void); +LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void); -char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -void libpostal_expansion_array_destroy(char **expansions, size_t n); +LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n); /* Address parser @@ -144,25 +156,27 @@ typedef struct libpostal_address_parser_options { char *country; } libpostal_address_parser_options_t; -void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self); +LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self); -libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void); +LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void); -libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options); +LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options); + +LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features); // Setup/teardown methods -bool libpostal_setup(void); -bool libpostal_setup_datadir(char *datadir); -void libpostal_teardown(void); +LIBPOSTAL_EXPORT bool libpostal_setup(void); +LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir); +LIBPOSTAL_EXPORT void libpostal_teardown(void); -bool libpostal_setup_parser(void); -bool libpostal_setup_parser_datadir(char *datadir); -void libpostal_teardown_parser(void); +LIBPOSTAL_EXPORT bool 
libpostal_setup_parser(void); +LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir); +LIBPOSTAL_EXPORT void libpostal_teardown_parser(void); -bool libpostal_setup_language_classifier(void); -bool libpostal_setup_language_classifier_datadir(char *datadir); -void libpostal_teardown_language_classifier(void); +LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void); +LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir); +LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void); /* Tokenization and token normalization APIs */ diff --git a/src/linenoise/linenoise.c b/src/linenoise/linenoise.c index c10557d0..c0a06588 100644 --- a/src/linenoise/linenoise.c +++ b/src/linenoise/linenoise.c @@ -105,7 +105,6 @@ * */ -#include #include #include #include @@ -114,7 +113,12 @@ #include #include #include + +#ifndef _WIN32 +#include #include +#endif //_WIN32 + #include #include "linenoise.h" @@ -123,8 +127,10 @@ static char *unsupported_term[] = {"dumb","cons25","emacs",NULL}; static linenoiseCompletionCallback *completionCallback = NULL; +#ifndef _WIN32 static struct termios orig_termios; /* In order to restore at exit.*/ static int rawmode = 0; /* For atexit() function to check if restore is needed*/ +#endif //_WIN32 static int mlmode = 0; /* Multi line mode. Default is single line. */ static int atexit_registered = 0; /* Register atexit just 1 time. 
*/ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; @@ -150,25 +156,25 @@ struct linenoiseState { }; enum KEY_ACTION{ - KEY_NULL = 0, /* NULL */ - CTRL_A = 1, /* Ctrl+a */ - CTRL_B = 2, /* Ctrl-b */ - CTRL_C = 3, /* Ctrl-c */ - CTRL_D = 4, /* Ctrl-d */ - CTRL_E = 5, /* Ctrl-e */ - CTRL_F = 6, /* Ctrl-f */ - CTRL_H = 8, /* Ctrl-h */ - TAB = 9, /* Tab */ - CTRL_K = 11, /* Ctrl+k */ - CTRL_L = 12, /* Ctrl+l */ - ENTER = 13, /* Enter */ - CTRL_N = 14, /* Ctrl-n */ - CTRL_P = 16, /* Ctrl-p */ - CTRL_T = 20, /* Ctrl-t */ - CTRL_U = 21, /* Ctrl+u */ - CTRL_W = 23, /* Ctrl+w */ - ESC = 27, /* Escape */ - BACKSPACE = 127 /* Backspace */ + KEY_NULL = 0, /* NULL */ + CTRL_A = 1, /* Ctrl+a */ + CTRL_B = 2, /* Ctrl-b */ + CTRL_C = 3, /* Ctrl-c */ + CTRL_D = 4, /* Ctrl-d */ + CTRL_E = 5, /* Ctrl-e */ + CTRL_F = 6, /* Ctrl-f */ + CTRL_H = 8, /* Ctrl-h */ + TAB = 9, /* Tab */ + CTRL_K = 11, /* Ctrl+k */ + CTRL_L = 12, /* Ctrl+l */ + ENTER = 13, /* Enter */ + CTRL_N = 14, /* Ctrl-n */ + CTRL_P = 16, /* Ctrl-p */ + CTRL_T = 20, /* Ctrl-t */ + CTRL_U = 21, /* Ctrl+u */ + CTRL_W = 23, /* Ctrl+w */ + ESC = 27, /* Escape */ + BACKSPACE = 127 /* Backspace */ }; static void linenoiseAtExit(void); @@ -207,7 +213,13 @@ static int isUnsupportedTerm(void) { char *term = getenv("TERM"); int j; - if (term == NULL) return 0; + if (term == NULL) { +#ifdef _WIN32 + return 1; +#else + return 0; +#endif // _WIN32 + } for (j = 0; unsupported_term[j]; j++) if (!strcasecmp(term,unsupported_term[j])) return 1; return 0; @@ -215,6 +227,7 @@ static int isUnsupportedTerm(void) { /* Raw mode: 1960 magic shit. */ static int enableRawMode(int fd) { +#ifndef _WIN32 struct termios raw; if (!isatty(STDIN_FILENO)) goto fatal; @@ -247,12 +260,17 @@ static int enableRawMode(int fd) { fatal: errno = ENOTTY; return -1; +#else + return 0; +#endif //_WIN32 } static void disableRawMode(int fd) { +#ifndef _WIN32 /* Don't even check the return value as it's too late. 
*/ if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) rawmode = 0; +#endif //_WIN32 } /* Use the ESC [6n escape sequence to query the horizontal cursor position @@ -283,9 +301,13 @@ static int getCursorPosition(int ifd, int ofd) { /* Try to get the number of columns in the current terminal, or assume 80 * if it fails. */ static int getColumns(int ifd, int ofd) { +#ifndef _WIN32 struct winsize ws; if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { +#else + if(1) { +#endif //_WIN32 /* ioctl() failed. Try to query the terminal itself. */ int start, cols; @@ -307,9 +329,12 @@ static int getColumns(int ifd, int ofd) { } } return cols; - } else { + } +#ifndef _WIN32 + else { return ws.ws_col; } +#endif //_WIN32 failed: return 80; diff --git a/src/normalize.c b/src/normalize.c index aa9f2ef1..ff21af9b 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -1,4 +1,5 @@ #include "normalize.h" +#include "strndup.h" #define FULL_STOP_CODEPOINT 0x002e #define APOSTROPHE_CODEPOINT 0x0027 diff --git a/src/numex_data.c b/src/numex_data.c index 02691e3c..837175c8 100644 --- a/src/numex_data.c +++ b/src/numex_data.c @@ -5131,7 +5131,7 @@ numex_rule_t numex_rules[] = { {NUMEX_LEFT_CONTEXT_ADD, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_ORDINAL_RULE, GENDER_NEUTER, CATEGORY_DEFAULT, 10, 1000000000000LL}, {NUMEX_LEFT_CONTEXT_ADD, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_ORDINAL_RULE, GENDER_NEUTER, CATEGORY_DEFAULT, 10, 1000000000000000LL}, {NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 0LL}, - {NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 0LL}, + {NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 0LL}, {NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 1LL}, {NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, 
GENDER_NONE, CATEGORY_DEFAULT, 10, 2LL}, {NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_NONE, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 3LL}, diff --git a/src/string_utils.c b/src/string_utils.c index 79083e09..fcd35d74 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -1,6 +1,7 @@ #include #include "log/log.h" #include "string_utils.h" +#include "strndup.h" #define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n)) diff --git a/src/strndup.c b/src/strndup.c new file mode 100644 index 00000000..6d1a9300 --- /dev/null +++ b/src/strndup.c @@ -0,0 +1,23 @@ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#ifndef HAVE_STRNDUP + +#include <stdlib.h> +#include <string.h> + +/* Fallback strndup for platforms without one: returns a malloc'd, NUL-terminated copy of at most n bytes of s, or NULL if allocation fails. The caller frees the result. */ +char *strndup(const char *s, size_t n) +{ + const char *end = memchr(s, '\0', n); /* look at no more than n bytes, so a huge n neither over-allocates nor wraps the size computation */ + size_t len = end ? (size_t) (end - s) : n; + char *copy = malloc(len + 1); + if (copy) { + memcpy(copy, s, len); + copy[len] = '\0'; + } + return copy; +} + +#endif /* HAVE_STRNDUP */ diff --git a/src/strndup.h b/src/strndup.h new file mode 100644 index 00000000..7cd3bffb --- /dev/null +++ b/src/strndup.h @@ -0,0 +1,15 @@ +#ifndef STRNDUP_H +#define STRNDUP_H + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#ifndef HAVE_STRNDUP + +#include <stddef.h> /* size_t */ + +char *strndup(const char *s, size_t n); + +#endif /* HAVE_STRNDUP */ +#endif /* STRNDUP_H */ diff --git a/src/tokens.c b/src/tokens.c index e85183f1..310e861b 100644 --- a/src/tokens.c +++ b/src/tokens.c @@ -1,4 +1,5 @@ #include "tokens.h" +#include "strndup.h" tokenized_string_t *tokenized_string_new(void) { diff --git a/src/transliterate.c b/src/transliterate.c index 368356f3..bd8cb003 100644 --- a/src/transliterate.c +++ b/src/transliterate.c @@ -3,6 +3,7 @@ #include "file_utils.h" #include "log/log.h" +#include "strndup.h" #define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA diff --git a/src/utf8proc/utf8proc.c b/src/utf8proc/utf8proc.c index c302b79e..34397d58 100644 --- a/src/utf8proc/utf8proc.c +++ b/src/utf8proc/utf8proc.c @@ -44,7 +44,7 @@ #include "utf8proc_data.c" -UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { +const utf8proc_int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -87,11 +87,11 @@ UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { be different, being based on ABI compatibility.): */ #define STRINGIZEx(x) #x #define STRINGIZE(x) STRINGIZEx(x) -UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { +const char *utf8proc_version(void) { return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; } -UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { +const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { switch (errcode) { case UTF8PROC_ERROR_NOMEM: return "Memory for processing UTF-8 data could not be allocated."; @@ -109,7 +109,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { } #define utf_cont(ch) (((ch) & 0xc0) == 0x80) -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( +utf8proc_ssize_t utf8proc_iterate( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { utf8proc_uint32_t uc; @@ -157,11 +157,11 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( return 4; } -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { +utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { +utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { if (uc < 0x00) { return 0; } else if (uc < 0x80) { @@ -228,7 +228,7 @@ static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { ); } -UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { +const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { return uc < 0 || uc >= 0x110000 ? 
utf8proc_properties : unsafe_get_property(uc); } @@ -259,18 +259,18 @@ static utf8proc_bool grapheme_break(int lbc, int tbc) { } /* return whether there is a grapheme break between codepoints c1 and c2 */ -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) { +utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) { return grapheme_break(utf8proc_get_property(c1)->boundclass, utf8proc_get_property(c2)->boundclass); } -UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) +utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) { utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping; return cl >= 0 ? cl : c; } -UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) +utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping; return cu >= 0 ? cu : c; @@ -278,15 +278,15 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) /* return a character width analogous to wcwidth (except portable and hopefully less buggy than most system wcwidth functions). 
*/ -UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { +int utf8proc_charwidth(utf8proc_int32_t c) { return utf8proc_get_property(c)->charwidth; } -UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { +utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { return utf8proc_get_property(c)->category; } -UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { +const char *utf8proc_category_string(utf8proc_int32_t c) { static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; return s[utf8proc_category(c)]; } @@ -295,7 +295,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ options & ~UTF8PROC_LUMP, last_boundclass) -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { +utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; utf8proc_propval_t category; utf8proc_int32_t hangul_sindex; @@ -399,7 +399,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, return 1; } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( +utf8proc_ssize_t utf8proc_decompose( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ) { @@ -461,7 +461,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( return wpos; } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { +utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t 
length, utf8proc_option_t options) { /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored ASSERT: 'buffer' has one spare byte of free space at the end! */ if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { @@ -583,7 +583,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, } } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( +utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ) { utf8proc_int32_t *buffer; @@ -612,28 +612,28 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( return result; } -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { +utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE); return retval; } -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { +utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE); return retval; } -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { +utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); return retval; } -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { +utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT); diff --git a/src/utf8proc/utf8proc.h b/src/utf8proc/utf8proc.h index 00f10c80..6080b029 100644 --- a/src/utf8proc/utf8proc.h +++ b/src/utf8proc/utf8proc.h @@ -111,18 +111,6 @@ typedef bool utf8proc_bool; #endif 
#include <limits.h> -#ifdef _WIN32 -# ifdef UTF8PROC_EXPORTS -# define UTF8PROC_DLLEXPORT __declspec(dllexport) -# else -# define UTF8PROC_DLLEXPORT __declspec(dllimport) -# endif -#elif __GNUC__ >= 4 -# define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default"))) -#else -# define UTF8PROC_DLLEXPORT -#endif - #ifdef __cplusplus extern "C" { #endif @@ -365,20 +353,20 @@ typedef enum { * Array containing the byte lengths of a UTF-8 encoded codepoint based * on the first byte. */ -UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256]; +extern const utf8proc_int8_t utf8proc_utf8class[256]; /** * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH * (http://semver.org format), possibly with a "-dev" suffix for * development versions. */ -UTF8PROC_DLLEXPORT const char *utf8proc_version(void); +const char *utf8proc_version(void); /** * Returns an informative error string for the given utf8proc error code * (e.g. the error codes returned by @ref utf8proc_map). */ -UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode); +const char *utf8proc_errmsg(utf8proc_ssize_t errcode); /** * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. @@ -390,7 +378,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode); * In case of success, the number of bytes read is returned; otherwise, a * negative error code is returned. */ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref); +utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref); /** * Check if a codepoint is valid (regardless of whether it has been @@ -398,7 +386,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str * * @return 1 if the given `codepoint` is valid and otherwise return 0. 
*/ -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint); +utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint); /** * Encodes the codepoint as an UTF-8 string in the byte array pointed @@ -409,7 +397,7 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codep * * This function does not check whether `codepoint` is valid Unicode. */ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst); +utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst); /** * Look up the properties for a given codepoint. @@ -423,7 +411,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepo * If the codepoint is unassigned or invalid, a pointer to a special struct is * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN). */ -UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint); +const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint); /** Decompose a codepoint into an array of codepoints. * @@ -452,7 +440,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int * required buffer size is returned, while the buffer will be overwritten with * undefined data. */ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( +utf8proc_ssize_t utf8proc_decompose_char( utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass ); @@ -473,7 +461,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( * required buffer size is returned, while the buffer will be overwritten with * undefined data. 
*/ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( +utf8proc_ssize_t utf8proc_decompose( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ); @@ -503,13 +491,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( * entries of the array pointed to by `str` have to be in the * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! */ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); +utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); /** * Given a pair of consecutive codepoints, return whether a grapheme break is * permitted between them (as defined by the extended grapheme clusters in UAX#29). */ -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); +utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); /** @@ -517,14 +505,14 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepo * lower-case character, if any; otherwise (if there is no lower-case * variant, or if `c` is not a valid codepoint) return `c`. */ -UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c); +utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c); /** * Given a codepoint `c`, return the codepoint of the corresponding * upper-case character, if any; otherwise (if there is no upper-case * variant, or if `c` is not a valid codepoint) return `c`. 
*/ -UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); +utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); /** * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, @@ -534,19 +522,19 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); * @note * If you want to check for particular types of non-printable characters, * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ -UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint); +int utf8proc_charwidth(utf8proc_int32_t codepoint); /** * Return the Unicode category for the codepoint (one of the * @ref utf8proc_category_t constants.) */ -UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint); +utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint); /** * Return the two-letter (nul-terminated) Unicode category string for * the codepoint (e.g. `"Lu"` or `"Co"`). */ -UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint); +const char *utf8proc_category_string(utf8proc_int32_t codepoint); /** * Maps the given UTF-8 string pointed to by `str` to a new UTF-8 @@ -566,7 +554,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi * @note The memory of the new UTF-8 string will have been allocated * with `malloc`, and should therefore be deallocated with `free`. */ -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( +utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ); @@ -579,13 +567,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( */ /** @{ */ /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); +utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); /** NFC normalization (@ref UTF8PROC_COMPOSE). 
*/ -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); +utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); /** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); +utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); /** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ -UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); +utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); /** @} */ #ifdef __cplusplus diff --git a/test/Makefile.am b/test/Makefile.am index b35f6110..f2e911f2 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -9,6 +9,6 @@ CFLAGS = $(CFLAGS_BASE) TESTS = test_libpostal noinst_PROGRAMS = test_libpostal -test_libpostal_SOURCES = test.c test_expand.c test_parser.c test_transliterate.c test_numex.c test_trie.c test_string_utils.c test_crf_context.c -test_libpostal_LDADD = ../src/libpostal.la $(CBLAS_LIBS) +test_libpostal_SOURCES = test.c test_expand.c test_parser.c test_transliterate.c test_numex.c test_trie.c test_string_utils.c test_crf_context.c ../src/strndup.c ../src/file_utils.c ../src/string_utils.c ../src/utf8proc/utf8proc.c ../src/trie.c ../src/trie_search.c ../src/transliterate.c ../src/numex.c ../src/features.c +test_libpostal_LDADD = ../src/libpostal.la ../src/libscanner.la $(CBLAS_LIBS) test_libpostal_CFLAGS = $(CFLAGS_O3) diff --git a/win_build.bat b/win_build.bat new file mode 100644 index 00000000..8b0db748 --- /dev/null +++ b/win_build.bat @@ -0,0 +1,24 @@ +@echo off + +cd %APPVEYOR_BUILD_FOLDER% + +echo Compiler: %COMPILER% +echo Architecture: %MSYS2_ARCH% +echo Platform: %PLATFORM% +echo MSYS2 directory: %MSYS2_DIR% +echo MSYS2 system: %MSYSTEM% +echo Configuration: %CONFIGURATION% +echo Bits: %BIT% + +IF %COMPILER%==msys2 ( + @echo on + SET "PATH=C:\%MSYS2_DIR%\%MSYSTEM%\bin;C:\%MSYS2_DIR%\usr\bin;%PATH%" + + bash -lc "cd 
$APPVEYOR_BUILD_FOLDER && cp -rf windows/* ./" + bash -lc "cd $APPVEYOR_BUILD_FOLDER && ./bootstrap.sh" + bash -lc "cd $APPVEYOR_BUILD_FOLDER && ./configure --datadir=/c" + bash -lc "cd $APPVEYOR_BUILD_FOLDER && make" + bash -lc "cd $APPVEYOR_BUILD_FOLDER && make install" + bash -lc "cd $APPVEYOR_BUILD_FOLDER && cp src/.libs/libpostal-*.dll libpostal.dll" + "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\lib.exe" /def:libpostal.def /out:libpostal.lib /machine:x64 +) diff --git a/windows/configure.ac b/windows/configure.ac new file mode 100644 index 00000000..0a964cf5 --- /dev/null +++ b/windows/configure.ac @@ -0,0 +1,105 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. + +m4_define(LIBPOSTAL_MAJOR_VERSION, [1]) +m4_define(LIBPOSTAL_MINOR_VERSION, [0]) +m4_define(LIBPOSTAL_PATCH_VERSION, [0]) + +AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION) + +AC_CONFIG_MACRO_DIR([m4]) + +AM_INIT_AUTOMAKE([foreign subdir-objects]) +AC_CONFIG_SRCDIR([src]) +LT_INIT([win32-dll]) + +AC_CONFIG_HEADERS([config.h]) + +# Checks for programs. +AC_PROG_CC_C99 +AC_PROG_INSTALL + +LDFLAGS="$LDFLAGS -L/usr/local/lib" + +# Checks for libraries. +AC_SEARCH_LIBS([log], + [m],,[AC_MSG_ERROR([Could not find math library])]) + +# Checks for header files. +AC_HEADER_STDC +AC_HEADER_TIME +AC_HEADER_DIRENT +AC_HEADER_STDBOOL +AC_CHECK_HEADERS([fcntl.h float.h inttypes.h limits.h locale.h malloc.h memory.h stddef.h stdint.h stdlib.h string.h unistd.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_C_INLINE +AC_TYPE_INT16_T +AC_TYPE_INT32_T +AC_TYPE_INT64_T +AC_TYPE_INT8_T +AC_TYPE_OFF_T +AC_TYPE_SIZE_T +AC_TYPE_SSIZE_T +AC_TYPE_UINT16_T +AC_TYPE_UINT32_T +AC_TYPE_UINT64_T +AC_TYPE_UINT8_T +AC_CHECK_TYPES([ptrdiff_t]) + +# Checks for library functions. 
+AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup]) + +AC_CONFIG_FILES([Makefile + libpostal.pc + src/Makefile + test/Makefile]) + +AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes]) +AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes]) + +AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])]) +AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])]) + +# ------------------------------------------------------------------ +# Checks for SSE2 build +# ------------------------------------------------------------------ +AC_ARG_ENABLE([sse2], + AS_HELP_STRING( + [--disable-sse2], + [disable SSE2 optimization routines] + ) + ) + +AS_IF([test "x$enable_sse2" != "xno"], [ + CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" +]) + +AC_CHECK_HEADER(cblas.h, [AX_CBLAS]) + +AC_ARG_ENABLE([data-download], + [ --disable-data-download Disable downloading data], + [case "${enableval}" in + yes) DOWNLOAD_DATA=true ;; + no) DOWNLOAD_DATA=false ;; + *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; + esac], [DOWNLOAD_DATA=true]) + +AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) + +AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])], +[ + if test "x$withval" = "xno"; then + CFLAGS_SCANNER_EXTRA="" + else + CFLAGS_SCANNER_EXTRA="$withval" + fi +], +[ CFLAGS_SCANNER_EXTRA="" ] +) + +AC_MSG_NOTICE([extra cflags for scanner.c: $CFLAGS_SCANNER_EXTRA]) +AC_SUBST(CFLAGS_SCANNER_EXTRA) +AC_SUBST(LIBPOSTAL_SO_VERSION, LIBPOSTAL_MAJOR_VERSION:LIBPOSTAL_MINOR_VERSION:LIBPOSTAL_PATCH_VERSION) + +AC_OUTPUT