Merge pull request #272 from AeroXuk/master
Windows support via AppVeyor
This commit is contained in:
26
.appveyor.yml
Normal file
26
.appveyor.yml
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
version: 1.0.{build}
|
||||||
|
|
||||||
|
image: Visual Studio 2015
|
||||||
|
platform: x64
|
||||||
|
|
||||||
|
environment:
|
||||||
|
matrix:
|
||||||
|
- COMPILER: msys2
|
||||||
|
PLATFORM: x64
|
||||||
|
MSYS2_ARCH: x86_64
|
||||||
|
MSYS2_DIR: msys64
|
||||||
|
MSYSTEM: MINGW64
|
||||||
|
BIT: 64
|
||||||
|
|
||||||
|
build_script:
|
||||||
|
- '%APPVEYOR_BUILD_FOLDER%\win_build.bat'
|
||||||
|
|
||||||
|
test_script:
|
||||||
|
- '%APPVEYOR_BUILD_FOLDER%\test\test_libpostal.exe'
|
||||||
|
|
||||||
|
after_build:
|
||||||
|
- 7z a libpostal.zip %APPVEYOR_BUILD_FOLDER%\libpostal.dll %APPVEYOR_BUILD_FOLDER%\libpostal.def %APPVEYOR_BUILD_FOLDER%\libpostal.exp %APPVEYOR_BUILD_FOLDER%\libpostal.lib %APPVEYOR_BUILD_FOLDER%\src\libpostal.h
|
||||||
|
|
||||||
|
artifacts:
|
||||||
|
- path: libpostal.zip
|
||||||
|
name: libpostal
|
||||||
37
README.md
37
README.md
@@ -1,6 +1,8 @@
|
|||||||
# libpostal: international street address NLP
|
# libpostal: international street address NLP
|
||||||
|
|
||||||
[](https://travis-ci.org/openvenues/libpostal) [](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
[](https://travis-ci.org/openvenues/libpostal)
|
||||||
|
[](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
|
||||||
|
[](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
||||||
[](#sponsors)
|
[](#sponsors)
|
||||||
[](#backers)
|
[](#backers)
|
||||||
|
|
||||||
@@ -137,6 +139,39 @@ For example, if you write a program called app.c, you can compile it like this:
|
|||||||
gcc app.c `pkg-config --cflags --libs libpostal`
|
gcc app.c `pkg-config --cflags --libs libpostal`
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**On Windows (MSys2/MinGW)**
|
||||||
|
|
||||||
|
For Windows, the build procedure currently requires MSys2 and MinGW. MSys2 can be downloaded from http://msys2.org. Please follow the instructions on the MSys2 website for installation.
|
||||||
|
|
||||||
|
Please ensure MSys2 is up-to-date by running:
|
||||||
|
```
|
||||||
|
pacman -Syu
|
||||||
|
```
|
||||||
|
|
||||||
|
Install the following prerequisites:
|
||||||
|
```
|
||||||
|
pacman -S autoconf automake curl git make libtool gcc mingw-w64-x86_64-gcc
|
||||||
|
```
|
||||||
|
|
||||||
|
Then to build the C library:
|
||||||
|
```
|
||||||
|
git clone https://github.com/openvenues/libpostal
|
||||||
|
cd libpostal
|
||||||
|
cp -rf windows/* ./
|
||||||
|
./bootstrap.sh
|
||||||
|
./configure --datadir=[...some dir with a few GB of space...]
|
||||||
|
make
|
||||||
|
make install
|
||||||
|
```
|
||||||
|
Notes: When setting the datadir, the `C:` drive would be entered as `/c`. The libpostal build script automatically adds `libpostal` to the end of the path, so `/c` would become `C:\libpostal\` on Windows.
|
||||||
|
|
||||||
|
The compiled .dll will be in the `src/.libs/` directory and should be called `libpostal-1.dll`.
|
||||||
|
|
||||||
|
If you require a .lib import library to link this to your application, you can generate one using the Visual Studio `lib.exe` tool and the `libpostal.def` definition file:
|
||||||
|
```
|
||||||
|
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
|
||||||
|
```
|
||||||
|
|
||||||
Examples of parsing
|
Examples of parsing
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|||||||
17
libpostal.def
Normal file
17
libpostal.def
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
EXPORTS
|
||||||
|
libpostal_get_default_options
|
||||||
|
libpostal_expand_address
|
||||||
|
libpostal_expansion_array_destroy
|
||||||
|
libpostal_address_parser_response_destroy
|
||||||
|
libpostal_get_address_parser_default_options
|
||||||
|
libpostal_parse_address
|
||||||
|
libpostal_parser_print_features
|
||||||
|
libpostal_setup
|
||||||
|
libpostal_setup_datadir
|
||||||
|
libpostal_teardown
|
||||||
|
libpostal_setup_parser
|
||||||
|
libpostal_setup_parser_datadir
|
||||||
|
libpostal_teardown_parser
|
||||||
|
libpostal_setup_language_classifier
|
||||||
|
libpostal_setup_language_classifier_datadir
|
||||||
|
libpostal_teardown_language_classifier
|
||||||
@@ -12,10 +12,10 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include
|
|||||||
CFLAGS =
|
CFLAGS =
|
||||||
|
|
||||||
lib_LTLIBRARIES = libpostal.la
|
lib_LTLIBRARIES = libpostal.la
|
||||||
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
|
||||||
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
|
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
|
||||||
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS
|
||||||
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@
|
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined
|
||||||
|
|
||||||
dist_bin_SCRIPTS = libpostal_data
|
dist_bin_SCRIPTS = libpostal_data
|
||||||
|
|
||||||
@@ -23,42 +23,42 @@ dist_bin_SCRIPTS = libpostal_data
|
|||||||
# On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding
|
# On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding
|
||||||
# -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help).
|
# -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help).
|
||||||
noinst_LTLIBRARIES = libscanner.la
|
noinst_LTLIBRARIES = libscanner.la
|
||||||
libscanner_la_SOURCES = scanner.c
|
libscanner_la_SOURCES = klib/drand48.c scanner.c
|
||||||
libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA)
|
libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA)
|
||||||
|
|
||||||
noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test
|
noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test
|
||||||
|
|
||||||
libpostal_SOURCES = main.c json_encode.c
|
libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
||||||
libpostal_LDADD = libpostal.la
|
libpostal_LDADD = libpostal.la
|
||||||
libpostal_CFLAGS = $(CFLAGS_O3)
|
libpostal_CFLAGS = $(CFLAGS_O3)
|
||||||
bench_SOURCES = bench.c
|
bench_SOURCES = bench.c
|
||||||
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
|
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
|
||||||
bench_CFLAGS = $(CFLAGS_O3)
|
bench_CFLAGS = $(CFLAGS_O3)
|
||||||
address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
|
address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/linenoise.c string_utils.c utf8proc/utf8proc.c
|
||||||
address_parser_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_LDADD = libpostal.la $(CBLAS_LIBS)
|
||||||
address_parser_CFLAGS = $(CFLAGS_O3)
|
address_parser_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
|
build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||||
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
|
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
|
||||||
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
|
build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||||
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
||||||
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
build_trans_table_SOURCES = strndup.c transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
||||||
build_trans_table_CFLAGS = $(CFLAGS_O3)
|
build_trans_table_CFLAGS = $(CFLAGS_O3)
|
||||||
address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
|
address_parser_train_SOURCES = strndup.c address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
|
||||||
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
address_parser_train_CFLAGS = $(CFLAGS_O3)
|
address_parser_train_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
|
address_parser_test_SOURCES = strndup.c address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
|
||||||
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
address_parser_test_CFLAGS = $(CFLAGS_O3)
|
address_parser_test_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
|
language_classifier_train_SOURCES = strndup.c language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
|
||||||
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_train_CFLAGS = $(CFLAGS_O3)
|
language_classifier_train_CFLAGS = $(CFLAGS_O3)
|
||||||
language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
language_classifier_SOURCES = strndup.c language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
||||||
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_CFLAGS = $(CFLAGS_O3)
|
language_classifier_CFLAGS = $(CFLAGS_O3)
|
||||||
language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
language_classifier_test_SOURCES = strndup.c language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
|
||||||
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
|
||||||
language_classifier_test_CFLAGS = $(CFLAGS_O3)
|
language_classifier_test_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
|
|||||||
@@ -49,6 +49,13 @@ address_parser_t *get_address_parser(void) {
|
|||||||
return parser;
|
return parser;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool address_parser_print_features(bool print_features) {
|
||||||
|
if (parser == NULL) return false;
|
||||||
|
|
||||||
|
parser->options.print_features = print_features;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool address_parser_save(address_parser_t *self, char *output_dir) {
|
bool address_parser_save(address_parser_t *self, char *output_dir) {
|
||||||
if (self == NULL || output_dir == NULL) return false;
|
if (self == NULL || output_dir == NULL) return false;
|
||||||
|
|
||||||
|
|||||||
@@ -215,6 +215,7 @@ address_parser_t *address_parser_new_options(parser_options_t options);
|
|||||||
address_parser_t *get_address_parser(void);
|
address_parser_t *get_address_parser(void);
|
||||||
bool address_parser_load(char *dir);
|
bool address_parser_load(char *dir);
|
||||||
|
|
||||||
|
bool address_parser_print_features(bool print_features);
|
||||||
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country);
|
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country);
|
||||||
void address_parser_destroy(address_parser_t *self);
|
void address_parser_destroy(address_parser_t *self);
|
||||||
|
|
||||||
|
|||||||
@@ -1,35 +1,15 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include "address_parser.h"
|
|
||||||
#include "averaged_perceptron_tagger.h"
|
|
||||||
#include "address_dictionary.h"
|
|
||||||
#include "collections.h"
|
|
||||||
#include "constants.h"
|
|
||||||
#include "file_utils.h"
|
|
||||||
#include "json_encode.h"
|
#include "json_encode.h"
|
||||||
#include "libpostal.h"
|
#include "libpostal.h"
|
||||||
#include "normalize.h"
|
|
||||||
#include "scanner.h"
|
|
||||||
#include "shuffle.h"
|
|
||||||
#include "tokens.h"
|
|
||||||
|
|
||||||
#include "linenoise/linenoise.h"
|
#include "linenoise/linenoise.h"
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
#include "strndup.h"
|
||||||
bool load_address_parser_dependencies(void) {
|
|
||||||
if (!address_dictionary_module_setup(NULL)) {
|
|
||||||
log_error("Could not load address dictionaries\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
log_info("address dictionary module loaded\n");
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
|
char *address_parser_dir = NULL;
|
||||||
char *history_file = "address_parser.history";
|
char *history_file = "address_parser.history";
|
||||||
|
|
||||||
if (argc > 1) {
|
if (argc > 1) {
|
||||||
@@ -38,7 +18,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
printf("Loading models...\n");
|
printf("Loading models...\n");
|
||||||
|
|
||||||
if (!libpostal_setup() || !address_parser_module_setup(address_parser_dir)) {
|
if (!libpostal_setup() || !libpostal_setup_parser_datadir(address_parser_dir)) {
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -54,8 +34,6 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
char *input = NULL;
|
char *input = NULL;
|
||||||
|
|
||||||
address_parser_t *parser = get_address_parser();
|
|
||||||
|
|
||||||
while((input = linenoise("> ")) != NULL) {
|
while((input = linenoise("> ")) != NULL) {
|
||||||
|
|
||||||
if (input[0] != '\0') {
|
if (input[0] != '\0') {
|
||||||
@@ -63,7 +41,7 @@ int main(int argc, char **argv) {
|
|||||||
linenoiseHistorySave(history_file); /* Save the history on disk. */
|
linenoiseHistorySave(history_file); /* Save the history on disk. */
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strcmp(input, ".exit") == 0) {
|
if (strncmp(input, ".exit", 5) == 0) {
|
||||||
printf("Fin!\n");
|
printf("Fin!\n");
|
||||||
free(input);
|
free(input);
|
||||||
break;
|
break;
|
||||||
@@ -101,12 +79,12 @@ int main(int argc, char **argv) {
|
|||||||
if (cstring_array_num_strings(command) > 1) {
|
if (cstring_array_num_strings(command) > 1) {
|
||||||
char *flag = cstring_array_get_string(command, 1);
|
char *flag = cstring_array_get_string(command, 1);
|
||||||
if (string_compare_case_insensitive(flag, "off") == 0) {
|
if (string_compare_case_insensitive(flag, "off") == 0) {
|
||||||
parser->options.print_features = false;
|
libpostal_parser_print_features(false);
|
||||||
} else if (string_compare_case_insensitive(flag, "on") == 0) {
|
} else if (string_compare_case_insensitive(flag, "on") == 0) {
|
||||||
parser->options.print_features = true;
|
libpostal_parser_print_features(true);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
parser->options.print_features = true;
|
libpostal_parser_print_features(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
cstring_array_destroy(command);
|
cstring_array_destroy(command);
|
||||||
@@ -118,6 +96,9 @@ int main(int argc, char **argv) {
|
|||||||
libpostal_address_parser_response_t *parsed;
|
libpostal_address_parser_response_t *parsed;
|
||||||
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
|
||||||
|
|
||||||
|
if (country != NULL) options.country = country;
|
||||||
|
if (language != NULL) options.language = language;
|
||||||
|
|
||||||
if ((parsed = libpostal_parse_address(input, options))) {
|
if ((parsed = libpostal_parse_address(input, options))) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Result:\n\n");
|
printf("Result:\n\n");
|
||||||
|
|||||||
70
src/klib/drand48.c
Normal file
70
src/klib/drand48.c
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 1993 Martin Birgmeier
|
||||||
|
* All rights reserved.
|
||||||
|
|
||||||
|
* You may redistribute unmodified or modified versions of this source
|
||||||
|
* code provided that the above copyright notice and this and the
|
||||||
|
* following conditions are retained.
|
||||||
|
|
||||||
|
* This software is provided ``as is'', and comes with no warranties
|
||||||
|
* of any kind. I shall in no event be liable for anything that happens
|
||||||
|
* to anyone/anything when using this software.
|
||||||
|
*/
|
||||||
|
|
||||||
|
//I've rearranged the source into a header-only implementation for drand48() -Benjamin Kusin
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include "drand48.h"
|
||||||
|
|
||||||
|
#define RAND48_SEED_0 (0x330e)
|
||||||
|
#define RAND48_SEED_1 (0xabcd)
|
||||||
|
#define RAND48_SEED_2 (0x1234)
|
||||||
|
#define RAND48_MULT_0 (0xe66d)
|
||||||
|
#define RAND48_MULT_1 (0xdeec)
|
||||||
|
#define RAND48_MULT_2 (0x0005)
|
||||||
|
#define RAND48_ADD (0x000b)
|
||||||
|
|
||||||
|
unsigned short _rand48_seed[3] = {
|
||||||
|
RAND48_SEED_0,
|
||||||
|
RAND48_SEED_1,
|
||||||
|
RAND48_SEED_2
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned short _rand48_mult[3] = {
|
||||||
|
RAND48_MULT_0,
|
||||||
|
RAND48_MULT_1,
|
||||||
|
RAND48_MULT_2
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned short _rand48_add = RAND48_ADD;
|
||||||
|
|
||||||
|
void _dorand48(unsigned short xseed[3])
|
||||||
|
{
|
||||||
|
unsigned long accu;
|
||||||
|
unsigned short temp[2];
|
||||||
|
|
||||||
|
accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0] + (unsigned long) _rand48_add;
|
||||||
|
temp[0] = (unsigned short) accu; /* lower 16 bits */
|
||||||
|
accu >>= sizeof(unsigned short) * 8;
|
||||||
|
accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1] + (unsigned long) _rand48_mult[1] * (unsigned long) xseed[0];
|
||||||
|
temp[1] = (unsigned short) accu; /* middle 16 bits */
|
||||||
|
accu >>= sizeof(unsigned short) * 8;
|
||||||
|
accu += _rand48_mult[0] * xseed[2] + _rand48_mult[1] * xseed[1] + _rand48_mult[2] * xseed[0];
|
||||||
|
xseed[0] = temp[0];
|
||||||
|
xseed[1] = temp[1];
|
||||||
|
xseed[2] = (unsigned short) accu;
|
||||||
|
}
|
||||||
|
|
||||||
|
double erand48(unsigned short xseed[3])
|
||||||
|
{
|
||||||
|
_dorand48(xseed);
|
||||||
|
return ldexp((double) xseed[0], -48) +
|
||||||
|
ldexp((double) xseed[1], -32) +
|
||||||
|
ldexp((double) xseed[2], -16);
|
||||||
|
}
|
||||||
|
|
||||||
|
double drand48(void)
|
||||||
|
{
|
||||||
|
return erand48(_rand48_seed);
|
||||||
|
}
|
||||||
41
src/klib/drand48.h
Normal file
41
src/klib/drand48.h
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 1993 Martin Birgmeier
|
||||||
|
* All rights reserved.
|
||||||
|
|
||||||
|
* You may redistribute unmodified or modified versions of this source
|
||||||
|
* code provided that the above copyright notice and this and the
|
||||||
|
* following conditions are retained.
|
||||||
|
|
||||||
|
* This software is provided ``as is'', and comes with no warranties
|
||||||
|
* of any kind. I shall in no event be liable for anything that happens
|
||||||
|
* to anyone/anything when using this software.
|
||||||
|
*/
|
||||||
|
|
||||||
|
//I've rearranged the source into a header-only implementation for drand48() -Benjamin Kusin
|
||||||
|
|
||||||
|
#ifndef _DRAND48_H
|
||||||
|
#define _DRAND48_H
|
||||||
|
|
||||||
|
#define RAND48_SEED_0 (0x330e)
|
||||||
|
#define RAND48_SEED_1 (0xabcd)
|
||||||
|
#define RAND48_SEED_2 (0x1234)
|
||||||
|
#define RAND48_MULT_0 (0xe66d)
|
||||||
|
#define RAND48_MULT_1 (0xdeec)
|
||||||
|
#define RAND48_MULT_2 (0x0005)
|
||||||
|
#define RAND48_ADD (0x000b)
|
||||||
|
|
||||||
|
unsigned short _rand48_seed[3];
|
||||||
|
|
||||||
|
unsigned short _rand48_mult[3];
|
||||||
|
|
||||||
|
unsigned short _rand48_add;
|
||||||
|
|
||||||
|
void _dorand48(unsigned short xseed[3]);
|
||||||
|
|
||||||
|
double erand48(unsigned short xseed[3]);
|
||||||
|
|
||||||
|
double drand48(void);
|
||||||
|
|
||||||
|
|
||||||
|
#endif // _DRAND48_H
|
||||||
@@ -45,6 +45,7 @@
|
|||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include "drand48.h"
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
void *left, *right;
|
void *left, *right;
|
||||||
|
|||||||
@@ -1073,6 +1073,10 @@ libpostal_address_parser_response_t *libpostal_parse_address(char *address, libp
|
|||||||
return parsed;
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool libpostal_parser_print_features(bool print_features) {
|
||||||
|
return address_parser_print_features(print_features);
|
||||||
|
}
|
||||||
|
|
||||||
bool libpostal_setup_datadir(char *datadir) {
|
bool libpostal_setup_datadir(char *datadir) {
|
||||||
char *transliteration_path = NULL;
|
char *transliteration_path = NULL;
|
||||||
char *numex_path = NULL;
|
char *numex_path = NULL;
|
||||||
|
|||||||
@@ -10,6 +10,18 @@ extern "C" {
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#ifdef LIBPOSTAL_EXPORTS
|
||||||
|
#define LIBPOSTAL_EXPORT __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define LIBPOSTAL_EXPORT __declspec(dllimport)
|
||||||
|
#endif
|
||||||
|
#elif __GNUC__ >= 4
|
||||||
|
#define LIBPOSTAL_EXPORT __attribute__ ((visibility("default")))
|
||||||
|
#else
|
||||||
|
#define LIBPOSTAL_EXPORT
|
||||||
|
#endif
|
||||||
|
|
||||||
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4
|
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -62,11 +74,11 @@ typedef struct libpostal_normalize_options {
|
|||||||
|
|
||||||
} libpostal_normalize_options_t;
|
} libpostal_normalize_options_t;
|
||||||
|
|
||||||
libpostal_normalize_options_t libpostal_get_default_options(void);
|
LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void);
|
||||||
|
|
||||||
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
|
LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
|
||||||
|
|
||||||
void libpostal_expansion_array_destroy(char **expansions, size_t n);
|
LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Address parser
|
Address parser
|
||||||
@@ -83,25 +95,27 @@ typedef struct libpostal_address_parser_options {
|
|||||||
char *country;
|
char *country;
|
||||||
} libpostal_address_parser_options_t;
|
} libpostal_address_parser_options_t;
|
||||||
|
|
||||||
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
|
LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
|
||||||
|
|
||||||
libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
|
LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
|
||||||
|
|
||||||
libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
|
LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
|
||||||
|
|
||||||
|
LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);
|
||||||
|
|
||||||
// Setup/teardown methods
|
// Setup/teardown methods
|
||||||
|
|
||||||
bool libpostal_setup(void);
|
LIBPOSTAL_EXPORT bool libpostal_setup(void);
|
||||||
bool libpostal_setup_datadir(char *datadir);
|
LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir);
|
||||||
void libpostal_teardown(void);
|
LIBPOSTAL_EXPORT void libpostal_teardown(void);
|
||||||
|
|
||||||
bool libpostal_setup_parser(void);
|
LIBPOSTAL_EXPORT bool libpostal_setup_parser(void);
|
||||||
bool libpostal_setup_parser_datadir(char *datadir);
|
LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir);
|
||||||
void libpostal_teardown_parser(void);
|
LIBPOSTAL_EXPORT void libpostal_teardown_parser(void);
|
||||||
|
|
||||||
bool libpostal_setup_language_classifier(void);
|
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void);
|
||||||
bool libpostal_setup_language_classifier_datadir(char *datadir);
|
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir);
|
||||||
void libpostal_teardown_language_classifier(void);
|
LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,7 +105,6 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <termios.h>
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@@ -114,7 +113,12 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
|
||||||
|
#ifndef _WIN32
|
||||||
|
#include <termios.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
|
#endif //_WIN32
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include "linenoise.h"
|
#include "linenoise.h"
|
||||||
|
|
||||||
@@ -123,8 +127,10 @@
|
|||||||
static char *unsupported_term[] = {"dumb","cons25","emacs",NULL};
|
static char *unsupported_term[] = {"dumb","cons25","emacs",NULL};
|
||||||
static linenoiseCompletionCallback *completionCallback = NULL;
|
static linenoiseCompletionCallback *completionCallback = NULL;
|
||||||
|
|
||||||
|
#ifndef _WIN32
|
||||||
static struct termios orig_termios; /* In order to restore at exit.*/
|
static struct termios orig_termios; /* In order to restore at exit.*/
|
||||||
static int rawmode = 0; /* For atexit() function to check if restore is needed*/
|
static int rawmode = 0; /* For atexit() function to check if restore is needed*/
|
||||||
|
#endif //_WIN32
|
||||||
static int mlmode = 0; /* Multi line mode. Default is single line. */
|
static int mlmode = 0; /* Multi line mode. Default is single line. */
|
||||||
static int atexit_registered = 0; /* Register atexit just 1 time. */
|
static int atexit_registered = 0; /* Register atexit just 1 time. */
|
||||||
static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN;
|
static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN;
|
||||||
@@ -150,25 +156,25 @@ struct linenoiseState {
|
|||||||
};
|
};
|
||||||
|
|
||||||
enum KEY_ACTION{
|
enum KEY_ACTION{
|
||||||
KEY_NULL = 0, /* NULL */
|
KEY_NULL = 0, /* NULL */
|
||||||
CTRL_A = 1, /* Ctrl+a */
|
CTRL_A = 1, /* Ctrl+a */
|
||||||
CTRL_B = 2, /* Ctrl-b */
|
CTRL_B = 2, /* Ctrl-b */
|
||||||
CTRL_C = 3, /* Ctrl-c */
|
CTRL_C = 3, /* Ctrl-c */
|
||||||
CTRL_D = 4, /* Ctrl-d */
|
CTRL_D = 4, /* Ctrl-d */
|
||||||
CTRL_E = 5, /* Ctrl-e */
|
CTRL_E = 5, /* Ctrl-e */
|
||||||
CTRL_F = 6, /* Ctrl-f */
|
CTRL_F = 6, /* Ctrl-f */
|
||||||
CTRL_H = 8, /* Ctrl-h */
|
CTRL_H = 8, /* Ctrl-h */
|
||||||
TAB = 9, /* Tab */
|
TAB = 9, /* Tab */
|
||||||
CTRL_K = 11, /* Ctrl+k */
|
CTRL_K = 11, /* Ctrl+k */
|
||||||
CTRL_L = 12, /* Ctrl+l */
|
CTRL_L = 12, /* Ctrl+l */
|
||||||
ENTER = 13, /* Enter */
|
ENTER = 13, /* Enter */
|
||||||
CTRL_N = 14, /* Ctrl-n */
|
CTRL_N = 14, /* Ctrl-n */
|
||||||
CTRL_P = 16, /* Ctrl-p */
|
CTRL_P = 16, /* Ctrl-p */
|
||||||
CTRL_T = 20, /* Ctrl-t */
|
CTRL_T = 20, /* Ctrl-t */
|
||||||
CTRL_U = 21, /* Ctrl+u */
|
CTRL_U = 21, /* Ctrl+u */
|
||||||
CTRL_W = 23, /* Ctrl+w */
|
CTRL_W = 23, /* Ctrl+w */
|
||||||
ESC = 27, /* Escape */
|
ESC = 27, /* Escape */
|
||||||
BACKSPACE = 127 /* Backspace */
|
BACKSPACE = 127 /* Backspace */
|
||||||
};
|
};
|
||||||
|
|
||||||
static void linenoiseAtExit(void);
|
static void linenoiseAtExit(void);
|
||||||
@@ -207,7 +213,13 @@ static int isUnsupportedTerm(void) {
|
|||||||
char *term = getenv("TERM");
|
char *term = getenv("TERM");
|
||||||
int j;
|
int j;
|
||||||
|
|
||||||
if (term == NULL) return 0;
|
if (term == NULL) {
|
||||||
|
#ifdef _WIN32
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif // _WIN32
|
||||||
|
}
|
||||||
for (j = 0; unsupported_term[j]; j++)
|
for (j = 0; unsupported_term[j]; j++)
|
||||||
if (!strcasecmp(term,unsupported_term[j])) return 1;
|
if (!strcasecmp(term,unsupported_term[j])) return 1;
|
||||||
return 0;
|
return 0;
|
||||||
@@ -215,6 +227,7 @@ static int isUnsupportedTerm(void) {
|
|||||||
|
|
||||||
/* Raw mode: 1960 magic shit. */
|
/* Raw mode: 1960 magic shit. */
|
||||||
static int enableRawMode(int fd) {
|
static int enableRawMode(int fd) {
|
||||||
|
#ifndef _WIN32
|
||||||
struct termios raw;
|
struct termios raw;
|
||||||
|
|
||||||
if (!isatty(STDIN_FILENO)) goto fatal;
|
if (!isatty(STDIN_FILENO)) goto fatal;
|
||||||
@@ -247,12 +260,17 @@ static int enableRawMode(int fd) {
|
|||||||
fatal:
|
fatal:
|
||||||
errno = ENOTTY;
|
errno = ENOTTY;
|
||||||
return -1;
|
return -1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif //_WIN32
|
||||||
}
|
}
|
||||||
|
|
||||||
static void disableRawMode(int fd) {
|
static void disableRawMode(int fd) {
|
||||||
|
#ifndef _WIN32
|
||||||
/* Don't even check the return value as it's too late. */
|
/* Don't even check the return value as it's too late. */
|
||||||
if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
|
if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
|
||||||
rawmode = 0;
|
rawmode = 0;
|
||||||
|
#endif //_WIN32
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use the ESC [6n escape sequence to query the horizontal cursor position
|
/* Use the ESC [6n escape sequence to query the horizontal cursor position
|
||||||
@@ -283,9 +301,13 @@ static int getCursorPosition(int ifd, int ofd) {
|
|||||||
/* Try to get the number of columns in the current terminal, or assume 80
|
/* Try to get the number of columns in the current terminal, or assume 80
|
||||||
* if it fails. */
|
* if it fails. */
|
||||||
static int getColumns(int ifd, int ofd) {
|
static int getColumns(int ifd, int ofd) {
|
||||||
|
#ifndef _WIN32
|
||||||
struct winsize ws;
|
struct winsize ws;
|
||||||
|
|
||||||
if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) {
|
if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) {
|
||||||
|
#else
|
||||||
|
if(1) {
|
||||||
|
#endif //_WIN32
|
||||||
/* ioctl() failed. Try to query the terminal itself. */
|
/* ioctl() failed. Try to query the terminal itself. */
|
||||||
int start, cols;
|
int start, cols;
|
||||||
|
|
||||||
@@ -307,9 +329,12 @@ static int getColumns(int ifd, int ofd) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return cols;
|
return cols;
|
||||||
} else {
|
}
|
||||||
|
#ifndef _WIN32
|
||||||
|
else {
|
||||||
return ws.ws_col;
|
return ws.ws_col;
|
||||||
}
|
}
|
||||||
|
#endif //_WIN32
|
||||||
|
|
||||||
failed:
|
failed:
|
||||||
return 80;
|
return 80;
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#include "normalize.h"
|
#include "normalize.h"
|
||||||
|
#include "strndup.h"
|
||||||
|
|
||||||
#define FULL_STOP_CODEPOINT 0x002e
|
#define FULL_STOP_CODEPOINT 0x002e
|
||||||
#define APOSTROPHE_CODEPOINT 0x0027
|
#define APOSTROPHE_CODEPOINT 0x0027
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
#include "string_utils.h"
|
#include "string_utils.h"
|
||||||
|
#include "strndup.h"
|
||||||
|
|
||||||
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
|
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
|
||||||
|
|
||||||
|
|||||||
17
src/strndup.c
Normal file
17
src/strndup.c
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#include <config.h>
|
||||||
|
#ifndef HAVE_STRNDUP
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/*
 * Fallback strndup(): duplicate at most n bytes of the string s.
 *
 * Returns a newly malloc()ed, NUL-terminated copy holding the first
 * min(strlen(s), n) bytes of s, or NULL if the allocation fails. The caller
 * owns the result and releases it with free().
 *
 * The length is measured before allocating so that:
 *   - a huge n (e.g. (size_t)-1) cannot wrap the "+ 1" of the allocation
 *     size around to zero, and
 *   - the buffer is only as large as the data actually copied.
 * s is never read past its terminating NUL or past n bytes.
 */
char *strndup(const char *s, size_t n)
{
    size_t len = 0;
    char *copy;

    /* Bounded strlen: stop at the terminator or after n bytes. */
    while (len < n && s[len] != '\0') {
        len++;
    }

    /* len <= strlen(s), so len + 1 cannot overflow. */
    copy = malloc(len + 1);
    if (copy != NULL) {
        memcpy(copy, s, len);
        copy[len] = '\0';
    }
    return copy;
}
|
||||||
|
|
||||||
|
#endif /* HAVE_STRNDUP */
|
||||||
11
src/strndup.h
Normal file
11
src/strndup.h
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
/*
 * strndup.h: declares a fallback strndup() for builds where the configure
 * step did not define HAVE_STRNDUP. When HAVE_STRNDUP is defined this header
 * declares nothing.
 */
#ifndef STRNDUP_H
#define STRNDUP_H

#include <config.h>

#ifndef HAVE_STRNDUP

#include <stddef.h> /* size_t, so this header is usable on its own */

/*
 * Duplicate at most n bytes of s into a newly allocated, NUL-terminated
 * string. Returns NULL if the allocation fails; the caller frees the result
 * with free().
 */
char *strndup(const char *s, size_t n);

#endif /* HAVE_STRNDUP */
#endif /* STRNDUP_H */
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
#include "tokens.h"
|
#include "tokens.h"
|
||||||
|
#include "strndup.h"
|
||||||
|
|
||||||
|
|
||||||
tokenized_string_t *tokenized_string_new(void) {
|
tokenized_string_t *tokenized_string_new(void) {
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#include "file_utils.h"
|
#include "file_utils.h"
|
||||||
|
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
#include "strndup.h"
|
||||||
|
|
||||||
#define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA
|
#define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,7 @@
|
|||||||
#include "utf8proc_data.c"
|
#include "utf8proc_data.c"
|
||||||
|
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
|
const utf8proc_int8_t utf8proc_utf8class[256] = {
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
@@ -87,11 +87,11 @@ UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
|
|||||||
be different, being based on ABI compatibility.): */
|
be different, being based on ABI compatibility.): */
|
||||||
#define STRINGIZEx(x) #x
|
#define STRINGIZEx(x) #x
|
||||||
#define STRINGIZE(x) STRINGIZEx(x)
|
#define STRINGIZE(x) STRINGIZEx(x)
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
|
const char *utf8proc_version(void) {
|
||||||
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
|
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
|
const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
|
||||||
switch (errcode) {
|
switch (errcode) {
|
||||||
case UTF8PROC_ERROR_NOMEM:
|
case UTF8PROC_ERROR_NOMEM:
|
||||||
return "Memory for processing UTF-8 data could not be allocated.";
|
return "Memory for processing UTF-8 data could not be allocated.";
|
||||||
@@ -109,7 +109,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
|
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
utf8proc_ssize_t utf8proc_iterate(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
|
||||||
) {
|
) {
|
||||||
utf8proc_uint32_t uc;
|
utf8proc_uint32_t uc;
|
||||||
@@ -157,11 +157,11 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
|||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
|
utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
|
||||||
return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
|
return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
|
utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
|
||||||
if (uc < 0x00) {
|
if (uc < 0x00) {
|
||||||
return 0;
|
return 0;
|
||||||
} else if (uc < 0x80) {
|
} else if (uc < 0x80) {
|
||||||
@@ -228,7 +228,7 @@ static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
|
const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
|
||||||
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -259,18 +259,18 @@ static utf8proc_bool grapheme_break(int lbc, int tbc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
|
utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
|
||||||
return grapheme_break(utf8proc_get_property(c1)->boundclass,
|
return grapheme_break(utf8proc_get_property(c1)->boundclass,
|
||||||
utf8proc_get_property(c2)->boundclass);
|
utf8proc_get_property(c2)->boundclass);
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
||||||
{
|
{
|
||||||
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
|
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
|
||||||
return cl >= 0 ? cl : c;
|
return cl >= 0 ? cl : c;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
||||||
{
|
{
|
||||||
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
|
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
|
||||||
return cu >= 0 ? cu : c;
|
return cu >= 0 ? cu : c;
|
||||||
@@ -278,15 +278,15 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
|||||||
|
|
||||||
/* return a character width analogous to wcwidth (except portable and
|
/* return a character width analogous to wcwidth (except portable and
|
||||||
hopefully less buggy than most system wcwidth functions). */
|
hopefully less buggy than most system wcwidth functions). */
|
||||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
|
int utf8proc_charwidth(utf8proc_int32_t c) {
|
||||||
return utf8proc_get_property(c)->charwidth;
|
return utf8proc_get_property(c)->charwidth;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
|
utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
|
||||||
return utf8proc_get_property(c)->category;
|
return utf8proc_get_property(c)->category;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
|
const char *utf8proc_category_string(utf8proc_int32_t c) {
|
||||||
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
|
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
|
||||||
return s[utf8proc_category(c)];
|
return s[utf8proc_category(c)];
|
||||||
}
|
}
|
||||||
@@ -295,7 +295,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
|
|||||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
||||||
const utf8proc_property_t *property;
|
const utf8proc_property_t *property;
|
||||||
utf8proc_propval_t category;
|
utf8proc_propval_t category;
|
||||||
utf8proc_int32_t hangul_sindex;
|
utf8proc_int32_t hangul_sindex;
|
||||||
@@ -399,7 +399,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
utf8proc_ssize_t utf8proc_decompose(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||||
) {
|
) {
|
||||||
@@ -461,7 +461,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
return wpos;
|
return wpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||||
@@ -583,7 +583,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
utf8proc_ssize_t utf8proc_map(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||||
) {
|
) {
|
||||||
utf8proc_int32_t *buffer;
|
utf8proc_int32_t *buffer;
|
||||||
@@ -612,28 +612,28 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
|
utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
|
||||||
utf8proc_uint8_t *retval;
|
utf8proc_uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_DECOMPOSE);
|
UTF8PROC_DECOMPOSE);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
|
utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
|
||||||
utf8proc_uint8_t *retval;
|
utf8proc_uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_COMPOSE);
|
UTF8PROC_COMPOSE);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
|
utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
|
||||||
utf8proc_uint8_t *retval;
|
utf8proc_uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
|
utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
|
||||||
utf8proc_uint8_t *retval;
|
utf8proc_uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||||
|
|||||||
@@ -111,18 +111,6 @@ typedef bool utf8proc_bool;
|
|||||||
#endif
|
#endif
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
# ifdef UTF8PROC_EXPORTS
|
|
||||||
# define UTF8PROC_DLLEXPORT __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define UTF8PROC_DLLEXPORT __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
#elif __GNUC__ >= 4
|
|
||||||
# define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
|
|
||||||
#else
|
|
||||||
# define UTF8PROC_DLLEXPORT
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
@@ -365,20 +353,20 @@ typedef enum {
|
|||||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||||
* on the first byte.
|
* on the first byte.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
|
extern const utf8proc_int8_t utf8proc_utf8class[256];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
|
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
|
||||||
* (http://semver.org format), possibly with a "-dev" suffix for
|
* (http://semver.org format), possibly with a "-dev" suffix for
|
||||||
* development versions.
|
* development versions.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
|
const char *utf8proc_version(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an informative error string for the given utf8proc error code
|
* Returns an informative error string for the given utf8proc error code
|
||||||
* (e.g. the error codes returned by @ref utf8proc_map).
|
* (e.g. the error codes returned by @ref utf8proc_map).
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
|
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
|
||||||
@@ -390,7 +378,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
|||||||
* In case of success, the number of bytes read is returned; otherwise, a
|
* In case of success, the number of bytes read is returned; otherwise, a
|
||||||
* negative error code is returned.
|
* negative error code is returned.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
|
utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if a codepoint is valid (regardless of whether it has been
|
* Check if a codepoint is valid (regardless of whether it has been
|
||||||
@@ -398,7 +386,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str
|
|||||||
*
|
*
|
||||||
* @return 1 if the given `codepoint` is valid and otherwise return 0.
|
* @return 1 if the given `codepoint` is valid and otherwise return 0.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
|
utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encodes the codepoint as an UTF-8 string in the byte array pointed
|
* Encodes the codepoint as an UTF-8 string in the byte array pointed
|
||||||
@@ -409,7 +397,7 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codep
|
|||||||
*
|
*
|
||||||
* This function does not check whether `codepoint` is valid Unicode.
|
* This function does not check whether `codepoint` is valid Unicode.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
|
utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Look up the properties for a given codepoint.
|
* Look up the properties for a given codepoint.
|
||||||
@@ -423,7 +411,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepo
|
|||||||
* If the codepoint is unassigned or invalid, a pointer to a special struct is
|
* If the codepoint is unassigned or invalid, a pointer to a special struct is
|
||||||
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
|
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
|
const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
|
||||||
|
|
||||||
/** Decompose a codepoint into an array of codepoints.
|
/** Decompose a codepoint into an array of codepoints.
|
||||||
*
|
*
|
||||||
@@ -452,7 +440,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
|
|||||||
* required buffer size is returned, while the buffer will be overwritten with
|
* required buffer size is returned, while the buffer will be overwritten with
|
||||||
* undefined data.
|
* undefined data.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
utf8proc_ssize_t utf8proc_decompose_char(
|
||||||
utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
|
utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
|
||||||
utf8proc_option_t options, int *last_boundclass
|
utf8proc_option_t options, int *last_boundclass
|
||||||
);
|
);
|
||||||
@@ -473,7 +461,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
|||||||
* required buffer size is returned, while the buffer will be overwritten with
|
* required buffer size is returned, while the buffer will be overwritten with
|
||||||
* undefined data.
|
* undefined data.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
utf8proc_ssize_t utf8proc_decompose(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||||
);
|
);
|
||||||
@@ -503,13 +491,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
* entries of the array pointed to by `str` have to be in the
|
* entries of the array pointed to by `str` have to be in the
|
||||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -517,14 +505,14 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepo
|
|||||||
* lower-case character, if any; otherwise (if there is no lower-case
|
* lower-case character, if any; otherwise (if there is no lower-case
|
||||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||||
* upper-case character, if any; otherwise (if there is no upper-case
|
* upper-case character, if any; otherwise (if there is no upper-case
|
||||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||||
@@ -534,19 +522,19 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
|||||||
* @note
|
* @note
|
||||||
* If you want to check for particular types of non-printable characters,
|
* If you want to check for particular types of non-printable characters,
|
||||||
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
|
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
|
||||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
|
int utf8proc_charwidth(utf8proc_int32_t codepoint);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the Unicode category for the codepoint (one of the
|
* Return the Unicode category for the codepoint (one of the
|
||||||
* @ref utf8proc_category_t constants.)
|
* @ref utf8proc_category_t constants.)
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
|
utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the two-letter (nul-terminated) Unicode category string for
|
* Return the two-letter (nul-terminated) Unicode category string for
|
||||||
* the codepoint (e.g. `"Lu"` or `"Co"`).
|
* the codepoint (e.g. `"Lu"` or `"Co"`).
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
|
const char *utf8proc_category_string(utf8proc_int32_t codepoint);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8
|
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8
|
||||||
@@ -566,7 +554,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
|
|||||||
* @note The memory of the new UTF-8 string will have been allocated
|
* @note The memory of the new UTF-8 string will have been allocated
|
||||||
* with `malloc`, and should therefore be deallocated with `free`.
|
* with `malloc`, and should therefore be deallocated with `free`.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
utf8proc_ssize_t utf8proc_map(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -579,13 +567,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
|||||||
*/
|
*/
|
||||||
/** @{ */
|
/** @{ */
|
||||||
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
|
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
||||||
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||||
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||||
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
@@ -9,6 +9,6 @@ CFLAGS = $(CFLAGS_BASE)
|
|||||||
|
|
||||||
TESTS = test_libpostal
|
TESTS = test_libpostal
|
||||||
noinst_PROGRAMS = test_libpostal
|
noinst_PROGRAMS = test_libpostal
|
||||||
test_libpostal_SOURCES = test.c test_expand.c test_parser.c test_transliterate.c test_numex.c test_trie.c test_string_utils.c test_crf_context.c
|
test_libpostal_SOURCES = test.c test_expand.c test_parser.c test_transliterate.c test_numex.c test_trie.c test_string_utils.c test_crf_context.c ../src/strndup.c ../src/file_utils.c ../src/string_utils.c ../src/utf8proc/utf8proc.c ../src/trie.c ../src/trie_search.c ../src/transliterate.c ../src/numex.c ../src/features.c
|
||||||
test_libpostal_LDADD = ../src/libpostal.la $(CBLAS_LIBS)
|
test_libpostal_LDADD = ../src/libpostal.la ../src/libscanner.la $(CBLAS_LIBS)
|
||||||
test_libpostal_CFLAGS = $(CFLAGS_O3)
|
test_libpostal_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|||||||
24
win_build.bat
Normal file
24
win_build.bat
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
@echo off
|
||||||
|
|
||||||
|
cd %APPVEYOR_BUILD_FOLDER%
|
||||||
|
|
||||||
|
echo Compiler: %COMPILER%
|
||||||
|
echo Architecture: %MSYS2_ARCH%
|
||||||
|
echo Platform: %PLATFORM%
|
||||||
|
echo MSYS2 directory: %MSYS2_DIR%
|
||||||
|
echo MSYS2 system: %MSYSTEM%
|
||||||
|
echo Configuration: %CONFIGURATION%
|
||||||
|
echo Bits: %BIT%
|
||||||
|
|
||||||
|
IF %COMPILER%==msys2 (
|
||||||
|
@echo on
|
||||||
|
SET "PATH=C:\%MSYS2_DIR%\%MSYSTEM%\bin;C:\%MSYS2_DIR%\usr\bin;%PATH%"
|
||||||
|
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && cp -rf windows/* ./"
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && ./bootstrap.sh"
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && ./configure --datadir=/c"
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && make"
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && make install"
|
||||||
|
bash -lc "cd $APPVEYOR_BUILD_FOLDER && cp src/.libs/libpostal-*.dll libpostal.dll"
|
||||||
|
"C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\lib.exe" /def:libpostal.def /out:libpostal.lib /machine:x64
|
||||||
|
)
|
||||||
105
windows/configure.ac
Normal file
105
windows/configure.ac
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
# -*- Autoconf -*-
|
||||||
|
# Process this file with autoconf to produce a configure script.
|
||||||
|
|
||||||
|
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
||||||
|
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
|
||||||
|
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
||||||
|
|
||||||
|
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
||||||
|
|
||||||
|
AC_CONFIG_MACRO_DIR([m4])
|
||||||
|
|
||||||
|
AM_INIT_AUTOMAKE([foreign subdir-objects])
|
||||||
|
AC_CONFIG_SRCDIR([src])
|
||||||
|
LT_INIT([win32-dll])
|
||||||
|
|
||||||
|
AC_CONFIG_HEADERS([config.h])
|
||||||
|
|
||||||
|
# Checks for programs.
|
||||||
|
AC_PROG_CC_C99
|
||||||
|
AC_PROG_INSTALL
|
||||||
|
|
||||||
|
LDFLAGS="$LDFLAGS -L/usr/local/lib"
|
||||||
|
|
||||||
|
# Checks for libraries.
|
||||||
|
AC_SEARCH_LIBS([log],
|
||||||
|
[m],,[AC_MSG_ERROR([Could not find math library])])
|
||||||
|
|
||||||
|
# Checks for header files.
|
||||||
|
AC_HEADER_STDC
|
||||||
|
AC_HEADER_TIME
|
||||||
|
AC_HEADER_DIRENT
|
||||||
|
AC_HEADER_STDBOOL
|
||||||
|
AC_CHECK_HEADERS([fcntl.h float.h inttypes.h limits.h locale.h malloc.h memory.h stddef.h stdint.h stdlib.h string.h unistd.h])
|
||||||
|
|
||||||
|
# Checks for typedefs, structures, and compiler characteristics.
|
||||||
|
AC_C_INLINE
|
||||||
|
AC_TYPE_INT16_T
|
||||||
|
AC_TYPE_INT32_T
|
||||||
|
AC_TYPE_INT64_T
|
||||||
|
AC_TYPE_INT8_T
|
||||||
|
AC_TYPE_OFF_T
|
||||||
|
AC_TYPE_SIZE_T
|
||||||
|
AC_TYPE_SSIZE_T
|
||||||
|
AC_TYPE_UINT16_T
|
||||||
|
AC_TYPE_UINT32_T
|
||||||
|
AC_TYPE_UINT64_T
|
||||||
|
AC_TYPE_UINT8_T
|
||||||
|
AC_CHECK_TYPES([ptrdiff_t])
|
||||||
|
|
||||||
|
# Checks for library functions.
|
||||||
|
AC_CHECK_FUNCS([malloc realloc getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
||||||
|
|
||||||
|
AC_CONFIG_FILES([Makefile
|
||||||
|
libpostal.pc
|
||||||
|
src/Makefile
|
||||||
|
test/Makefile])
|
||||||
|
|
||||||
|
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
||||||
|
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
||||||
|
|
||||||
|
AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
|
||||||
|
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Checks for SSE2 build
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
AC_ARG_ENABLE([sse2],
|
||||||
|
AS_HELP_STRING(
|
||||||
|
[--disable-sse2],
|
||||||
|
[disable SSE2 optimization routines]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
AS_IF([test "x$enable_sse2" != "xno"], [
|
||||||
|
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
|
||||||
|
])
|
||||||
|
|
||||||
|
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
||||||
|
|
||||||
|
AC_ARG_ENABLE([data-download],
|
||||||
|
[ --disable-data-download Disable downloading data],
|
||||||
|
[case "${enableval}" in
|
||||||
|
yes) DOWNLOAD_DATA=true ;;
|
||||||
|
no) DOWNLOAD_DATA=false ;;
|
||||||
|
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
|
||||||
|
esac], [DOWNLOAD_DATA=true])
|
||||||
|
|
||||||
|
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
|
||||||
|
|
||||||
|
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
|
||||||
|
[
|
||||||
|
if test "x$withval" = "xno"; then
|
||||||
|
CFLAGS_SCANNER_EXTRA=""
|
||||||
|
else
|
||||||
|
CFLAGS_SCANNER_EXTRA="$withval"
|
||||||
|
fi
|
||||||
|
],
|
||||||
|
[ CFLAGS_SCANNER_EXTRA="" ]
|
||||||
|
)
|
||||||
|
|
||||||
|
AC_MSG_NOTICE([extra cflags for scanner.c: $CFLAGS_SCANNER_EXTRA])
|
||||||
|
AC_SUBST(CFLAGS_SCANNER_EXTRA)
|
||||||
|
AC_SUBST(LIBPOSTAL_SO_VERSION, LIBPOSTAL_MAJOR_VERSION:LIBPOSTAL_MINOR_VERSION:LIBPOSTAL_PATCH_VERSION)
|
||||||
|
|
||||||
|
AC_OUTPUT
|
||||||
Reference in New Issue
Block a user