Merging changes from AeroXuk/libpostal_windows.

This commit is contained in:
AeroXuk
2017-11-19 12:44:38 +00:00
parent 7d6e648fc3
commit 2d3b420d35
17 changed files with 398 additions and 30 deletions

26
.appveyor.yml Normal file
View File

@@ -0,0 +1,26 @@
# AppVeyor CI configuration: builds libpostal on Windows with the
# MSYS2 / MinGW-w64 toolchain.
version: 1.0.{build}

# Only build pushes to master.
branches:
  only:
    - master

image: Visual Studio 2015
platform: x64

# Single-entry build matrix; win_build.bat echoes these variables and
# uses COMPILER, MSYS2_DIR and MSYSTEM to locate the toolchain.
environment:
  matrix:
    - COMPILER: msys2
      PLATFORM: x64
      MSYS2_ARCH: x86_64
      MSYS2_DIR: msys64
      MSYSTEM: MINGW64
      BIT: 64

# Each step is a one-item list of commands. The dash must be followed by a
# space, otherwise YAML parses the line as a plain string instead of a list.
install:
  - '%APPVEYOR_BUILD_FOLDER%\win_install.bat'
build_script:
  - '%APPVEYOR_BUILD_FOLDER%\win_build.bat'
test_script:
  - 'echo No tests yet'

16
libpostal.def Normal file
View File

@@ -0,0 +1,16 @@
; Module-definition file listing the public libpostal API symbols to export.
EXPORTS
; Address expansion
libpostal_get_default_options
libpostal_expand_address
libpostal_expansion_array_destroy
; Address parser
libpostal_address_parser_response_destroy
libpostal_get_address_parser_default_options
libpostal_parse_address
; Setup / teardown of the base library
libpostal_setup
libpostal_setup_datadir
libpostal_teardown
; Setup / teardown of the address parser
libpostal_setup_parser
libpostal_setup_parser_datadir
libpostal_teardown_parser
; Setup / teardown of the language classifier
libpostal_setup_language_classifier
libpostal_setup_language_classifier_datadir
libpostal_teardown_language_classifier

16
src/export.h Normal file
View File

@@ -0,0 +1,16 @@
#if !defined(EXPORT_H)
#define EXPORT_H

/*
 * LIBPOSTAL_EXPORT marks the functions that make up the public libpostal API.
 *
 *   Windows, LIBPOSTAL_EXPORTS defined   -> __declspec(dllexport)
 *   Windows, LIBPOSTAL_EXPORTS undefined -> __declspec(dllimport)
 *   GCC-compatible compiler, version 4+  -> default symbol visibility
 *   anything else                        -> expands to nothing
 */
#if defined(_WIN32)
#  if defined(LIBPOSTAL_EXPORTS)
#    define LIBPOSTAL_EXPORT __declspec(dllexport)
#  else
#    define LIBPOSTAL_EXPORT __declspec(dllimport)
#  endif
#elif defined(__GNUC__) && (__GNUC__ >= 4)
#  define LIBPOSTAL_EXPORT __attribute__ ((visibility("default")))
#else
#  define LIBPOSTAL_EXPORT
#endif

#endif /* EXPORT_H */

70
src/klib/drand48.c Normal file
View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 1993 Martin Birgmeier
* All rights reserved.
* You may redistribute unmodified or modified versions of this source
* code provided that the above copyright notice and this and the
* following conditions are retained.
* This software is provided ``as is'', and comes with no warranties
* of any kind. I shall in no event be liable for anything that happens
* to anyone/anything when using this software.
*/
//I've rearranged the source into a header-only implementation for drand48() -Benjamin Kusin
#include <math.h>
#include "drand48.h"
/*
 * Parameters of the 48-bit linear congruential generator
 *
 *     X(n+1) = (a * X(n) + c) mod 2^48
 *
 * Every 48-bit quantity is stored as three 16-bit words, least significant
 * word first: the default seed is 0x1234abcd330e and the multiplier a is
 * 0x5deece66d.
 */
#define RAND48_SEED_0 (0x330e)
#define RAND48_SEED_1 (0xabcd)
#define RAND48_SEED_2 (0x1234)
#define RAND48_MULT_0 (0xe66d)
#define RAND48_MULT_1 (0xdeec)
#define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b)

/* State advanced by drand48(). */
unsigned short _rand48_seed[3] = {
    RAND48_SEED_0,
    RAND48_SEED_1,
    RAND48_SEED_2
};

/* Multiplier a, split into 16-bit words. */
unsigned short _rand48_mult[3] = {
    RAND48_MULT_0,
    RAND48_MULT_1,
    RAND48_MULT_2
};

/* Additive constant c. */
unsigned short _rand48_add = RAND48_ADD;

/*
 * Advance a generator state by one step, in place.
 *
 * xseed holds the 48-bit state as three 16-bit words (xseed[0] is the least
 * significant). The product a * X + c is built up one 16-bit column at a
 * time, carrying the overflow of each column into the next; everything above
 * bit 47 is discarded.
 */
void _dorand48(unsigned short xseed[3])
{
    unsigned long accu;
    unsigned short temp[2];

    accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0]
         + (unsigned long) _rand48_add;
    temp[0] = (unsigned short) accu; /* lower 16 bits */
    accu >>= sizeof(unsigned short) * 8;

    accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1]
          + (unsigned long) _rand48_mult[1] * (unsigned long) xseed[0];
    temp[1] = (unsigned short) accu; /* middle 16 bits */
    accu >>= sizeof(unsigned short) * 8;

    /*
     * The operands must be widened before multiplying: as unsigned short they
     * are promoted to int, and a product such as 0xe66d * 0xffff does not fit
     * in a 32-bit int (signed overflow is undefined behaviour). Unsigned
     * arithmetic wraps instead, and only the low 16 bits are kept below.
     */
    accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[2]
          + (unsigned long) _rand48_mult[1] * (unsigned long) xseed[1]
          + (unsigned long) _rand48_mult[2] * (unsigned long) xseed[0];

    xseed[0] = temp[0];
    xseed[1] = temp[1];
    xseed[2] = (unsigned short) accu; /* upper 16 bits */
}
/*
 * Advance the caller-supplied state xseed by one step and return the new
 * 48-bit state scaled into [0, 1).
 *
 * Each 16-bit word is shifted to its place in the 48-bit fraction:
 * xseed[0] is the least significant word, xseed[2] the most significant.
 */
double erand48(unsigned short xseed[3])
{
    double low_part;
    double mid_part;
    double high_part;

    _dorand48(xseed);

    low_part = ldexp((double) xseed[0], -48);
    mid_part = ldexp((double) xseed[1], -32);
    high_part = ldexp((double) xseed[2], -16);

    return low_part + mid_part + high_part;
}
/*
 * Return the next value from the file-level generator state _rand48_seed,
 * advancing that state as a side effect.
 */
double drand48(void)
{
    unsigned short *shared_state = _rand48_seed;

    return erand48(shared_state);
}

41
src/klib/drand48.h Normal file
View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 1993 Martin Birgmeier
* All rights reserved.
* You may redistribute unmodified or modified versions of this source
* code provided that the above copyright notice and this and the
* following conditions are retained.
* This software is provided ``as is'', and comes with no warranties
* of any kind. I shall in no event be liable for anything that happens
* to anyone/anything when using this software.
*/
//I've rearranged the source into a header-only implementation for drand48() -Benjamin Kusin
#ifndef _DRAND48_H
#define _DRAND48_H

/*
 * Default seed, multiplier and additive constant of the 48-bit generator,
 * each split into 16-bit words (word 0 is the least significant).
 */
#define RAND48_SEED_0 (0x330e)
#define RAND48_SEED_1 (0xabcd)
#define RAND48_SEED_2 (0x1234)
#define RAND48_MULT_0 (0xe66d)
#define RAND48_MULT_1 (0xdeec)
#define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b)

/*
 * Generator state and parameters. They are defined once in drand48.c; the
 * header only declares them, so that including it from several source files
 * does not create one definition per file.
 */
extern unsigned short _rand48_seed[3];
extern unsigned short _rand48_mult[3];
extern unsigned short _rand48_add;

/* Advance the 48-bit state held in xseed by one step, in place. */
void _dorand48(unsigned short xseed[3]);

/* Advance xseed and return the new state scaled into [0, 1). */
double erand48(unsigned short xseed[3]);

/* Same as erand48(), operating on the shared state _rand48_seed. */
double drand48(void);

#endif // _DRAND48_H

View File

@@ -45,6 +45,7 @@
#include <stdlib.h>
#include <string.h>
#include "drand48.h"
typedef struct {
void *left, *right;

View File

@@ -57,7 +57,7 @@ static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
.roman_numerals = true
};
libpostal_normalize_options_t libpostal_get_default_options(void) {
LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void) {
return LIBPOSTAL_DEFAULT_OPTIONS;
}
@@ -942,7 +942,7 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
char_array_destroy(temp_string);
}
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
options.address_components |= LIBPOSTAL_ADDRESS_ANY;
uint64_t normalize_string_options = get_normalize_string_options(options);
@@ -1021,14 +1021,14 @@ char **libpostal_expand_address(char *input, libpostal_normalize_options_t optio
}
void libpostal_expansion_array_destroy(char **expansions, size_t n) {
LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n) {
for (size_t i = 0; i < n; i++) {
free(expansions[i]);
}
free(expansions);
}
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
if (self == NULL) return;
for (size_t i = 0; i < self->num_components; i++) {
@@ -1057,11 +1057,11 @@ static libpostal_address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIO
.country = NULL
};
inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) {
LIBPOSTAL_EXPORT inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) {
return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
}
libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) {
LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) {
libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country);
if (parsed == NULL) {
@@ -1073,7 +1073,7 @@ libpostal_address_parser_response_t *libpostal_parse_address(char *address, libp
return parsed;
}
bool libpostal_setup_datadir(char *datadir) {
LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir) {
char *transliteration_path = NULL;
char *numex_path = NULL;
char *address_dictionary_path = NULL;
@@ -1114,11 +1114,11 @@ bool libpostal_setup_datadir(char *datadir) {
return true;
}
bool libpostal_setup(void) {
LIBPOSTAL_EXPORT bool libpostal_setup(void) {
return libpostal_setup_datadir(NULL);
}
bool libpostal_setup_language_classifier_datadir(char *datadir) {
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir) {
char *language_classifier_dir = NULL;
if (datadir != NULL) {
@@ -1137,11 +1137,11 @@ bool libpostal_setup_language_classifier_datadir(char *datadir) {
return true;
}
bool libpostal_setup_language_classifier(void) {
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void) {
return libpostal_setup_language_classifier_datadir(NULL);
}
bool libpostal_setup_parser_datadir(char *datadir) {
LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir) {
char *parser_dir = NULL;
if (datadir != NULL) {
@@ -1160,11 +1160,11 @@ bool libpostal_setup_parser_datadir(char *datadir) {
return true;
}
bool libpostal_setup_parser(void) {
LIBPOSTAL_EXPORT bool libpostal_setup_parser(void) {
return libpostal_setup_parser_datadir(NULL);
}
void libpostal_teardown(void) {
LIBPOSTAL_EXPORT void libpostal_teardown(void) {
transliteration_module_teardown();
numex_module_teardown();
@@ -1172,10 +1172,10 @@ void libpostal_teardown(void) {
address_dictionary_module_teardown();
}
void libpostal_teardown_language_classifier(void) {
LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void) {
language_classifier_module_teardown();
}
void libpostal_teardown_parser(void) {
LIBPOSTAL_EXPORT void libpostal_teardown_parser(void) {
address_parser_module_teardown();
}

View File

@@ -9,6 +9,7 @@ extern "C" {
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include "export.h"
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4
@@ -62,11 +63,11 @@ typedef struct libpostal_normalize_options {
} libpostal_normalize_options_t;
libpostal_normalize_options_t libpostal_get_default_options(void);
LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void);
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
void libpostal_expansion_array_destroy(char **expansions, size_t n);
LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n);
/*
Address parser
@@ -83,25 +84,25 @@ typedef struct libpostal_address_parser_options {
char *country;
} libpostal_address_parser_options_t;
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
// Setup/teardown methods
bool libpostal_setup(void);
bool libpostal_setup_datadir(char *datadir);
void libpostal_teardown(void);
LIBPOSTAL_EXPORT bool libpostal_setup(void);
LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown(void);
bool libpostal_setup_parser(void);
bool libpostal_setup_parser_datadir(char *datadir);
void libpostal_teardown_parser(void);
LIBPOSTAL_EXPORT bool libpostal_setup_parser(void);
LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown_parser(void);
bool libpostal_setup_language_classifier(void);
bool libpostal_setup_language_classifier_datadir(char *datadir);
void libpostal_teardown_language_classifier(void);
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void);
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void);
#ifdef __cplusplus
}

View File

@@ -38,6 +38,7 @@ As well as normalizations for individual string tokens:
#include "trie.h"
#include "tokens.h"
#include "vector.h"
#include "strndup.h"
#define NORMALIZE_STRING_LATIN_ASCII 1 << 0
#define NORMALIZE_STRING_TRANSLITERATE 1 << 1

View File

@@ -16,6 +16,7 @@ Utilities for manipulating strings in C.
#include "collections.h"
#include "utf8proc/utf8proc.h"
#include "vector.h"
#include "strndup.h"
#define MAX_UTF8_CHAR_SIZE 4

16
src/strndup.c Normal file
View File

@@ -0,0 +1,16 @@
#ifndef HAVE_STRNDUP
#include <stdlib.h>
#include <string.h>
/*
 * Fallback strndup() for C libraries that do not provide one.
 *
 * Returns a newly allocated, NUL-terminated copy of at most n bytes of s, or
 * NULL if the allocation fails. The caller owns the result and releases it
 * with free().
 *
 * The length is measured first (never reading past n bytes or past the
 * terminator), so the allocation is sized to the actual copy and a very
 * large n such as SIZE_MAX cannot wrap the size computation.
 */
char *strndup(const char *s, size_t n)
{
    size_t len = 0;
    char *copy;

    while (len < n && s[len] != '\0') {
        len++;
    }

    copy = malloc(len + 1);
    if (copy == NULL) {
        return NULL;
    }

    memcpy(copy, s, len);
    copy[len] = '\0';
    return copy;
}
#endif /* HAVE_STRNDUP */

6
src/strndup.h Normal file
View File

@@ -0,0 +1,6 @@
#ifndef HAVE_STRNDUP
#define HAVE_STRNDUP

#include <stddef.h> /* size_t */

/*
 * Declared only when HAVE_STRNDUP is not already defined.
 *
 * Returns a newly allocated, NUL-terminated copy of at most n bytes of s, or
 * NULL if the allocation fails. The caller releases the result with free().
 */
char *strndup(const char *s, size_t n);

#endif /* HAVE_STRNDUP */

View File

@@ -11,6 +11,7 @@
#include "string_utils.h"
#include "token_types.h"
#include "vector.h"
#include "strndup.h"
typedef struct token {
size_t offset;

View File

@@ -12,6 +12,7 @@
#include "trie.h"
#include "trie_search.h"
#include "unicode_scripts.h"
#include "strndup.h"
#define LATIN_ASCII "latin-ascii"
#define LATIN_ASCII_SIMPLE "latin-ascii-simple"

21
win_build.bat Normal file
View File

@@ -0,0 +1,21 @@
@echo off
REM AppVeyor build step for libpostal.
REM Prints the build settings taken from the environment, then builds and
REM installs the library through the MSYS2 bash shell.
cd %APPVEYOR_BUILD_FOLDER%
REM Log the settings that drive this build.
echo Compiler: %COMPILER%
echo Architecture: %MSYS2_ARCH%
echo Platform: %PLATFORM%
echo MSYS2 directory: %MSYS2_DIR%
echo MSYS2 system: %MSYSTEM%
echo Configuration: %CONFIGURATION%
echo Bits: %BIT%
REM Only the msys2 compiler setting is handled; any other value does nothing.
REM The block below puts the MSYS2 tool directories first on PATH and then
REM runs bootstrap.sh, configure, make and make install, each in its own
REM bash login shell started from the build folder.
REM configure is given a datadir located under the build folder.
IF %COMPILER%==msys2 (
@echo on
SET "PATH=C:\%MSYS2_DIR%\%MSYSTEM%\bin;C:\%MSYS2_DIR%\usr\bin;%PATH%"
bash -lc "cd $APPVEYOR_BUILD_FOLDER && . bootstrap.sh"
bash -lc "cd $APPVEYOR_BUILD_FOLDER && . configure --datadir=$APPVEYOR_BUILD_FOLDER/data"
bash -lc "cd $APPVEYOR_BUILD_FOLDER && make"
bash -lc "cd $APPVEYOR_BUILD_FOLDER && make install"
)

105
windows/configure.ac Normal file
View File

@@ -0,0 +1,105 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
# Library version components. They form the package version and are reused
# for the libtool version info at the end of this file.
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
AC_CONFIG_MACRO_DIR([m4])
AM_INIT_AUTOMAKE([foreign subdir-objects])
AC_CONFIG_SRCDIR([src])
# The win32-dll option tells libtool the sources are prepared for building
# a Windows DLL.
LT_INIT([win32-dll])
AC_CONFIG_HEADERS([config.h])
# Checks for programs.
AC_PROG_CC_C99
AC_PROG_INSTALL
# Also search /usr/local/lib at link time.
LDFLAGS="$LDFLAGS -L/usr/local/lib"
# Checks for libraries.
AC_SEARCH_LIBS([log],
[m],,[AC_MSG_ERROR([Could not find math library])])
# Checks for header files.
AC_HEADER_STDC
AC_HEADER_TIME
AC_HEADER_DIRENT
AC_HEADER_STDBOOL
AC_CHECK_HEADERS([fcntl.h float.h inttypes.h limits.h locale.h malloc.h memory.h stddef.h stdint.h stdlib.h string.h unistd.h])
# Checks for typedefs, structures, and compiler characteristics.
AC_C_INLINE
AC_TYPE_INT16_T
AC_TYPE_INT32_T
AC_TYPE_INT64_T
AC_TYPE_INT8_T
AC_TYPE_OFF_T
AC_TYPE_SIZE_T
AC_TYPE_SSIZE_T
AC_TYPE_UINT16_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
AC_TYPE_UINT8_T
AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
AC_CONFIG_FILES([Makefile
libpostal.pc
src/Makefile
test/Makefile])
# Look for shuf and gshuf on PATH and record the result as HAVE_SHUF and
# HAVE_GSHUF.
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],
[disable SSE2 optimization routines]
)
)
# SSE2 is used unless --disable-sse2 was given; the flags are placed in
# front of any existing CFLAGS.
AS_IF([test "x$enable_sse2" != "xno"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])
# Run the AX_CBLAS check only when cblas.h is present.
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
# Data download is on by default; --disable-data-download turns off the
# DOWNLOAD_DATA conditional.
AC_ARG_ENABLE([data-download],
[ --disable-data-download Disable downloading data],
[case "${enableval}" in
yes) DOWNLOAD_DATA=true ;;
no) DOWNLOAD_DATA=false ;;
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
esac], [DOWNLOAD_DATA=true])
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
# Optional extra compiler flags for the scanner library; empty by default.
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
[
if test "x$withval" = "xno"; then
CFLAGS_SCANNER_EXTRA=""
else
CFLAGS_SCANNER_EXTRA="$withval"
fi
],
[ CFLAGS_SCANNER_EXTRA="" ]
)
AC_MSG_NOTICE([extra cflags for scanner.c: $CFLAGS_SCANNER_EXTRA])
AC_SUBST(CFLAGS_SCANNER_EXTRA)
# Substituted into the -version-info option in src/Makefile.am.
AC_SUBST(LIBPOSTAL_SO_VERSION, LIBPOSTAL_MAJOR_VERSION:LIBPOSTAL_MINOR_VERSION:LIBPOSTAL_PATCH_VERSION)
AC_OUTPUT

45
windows/src/Makefile.am Normal file
View File

@@ -0,0 +1,45 @@
# This version of the makefile skips building the programs. It only builds the libraries and downloads data so you can use the API.
# Inherited from autoconf / user-specified
CFLAGS_CONF = @CFLAGS@
# Warning flags, the compiled-in data directory and debug info shared by every optimization level below.
CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF)
CFLAGS_O0 = $(CFLAGS_BASE) -O0
CFLAGS_O1 = $(CFLAGS_BASE) -O1
CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
# Header search path: the parent directory and /usr/local/include.
DEFAULT_INCLUDES = -I.. -I/usr/local/include
# Wonky but have to be able to override the user's optimization level to compile the scanner
# as it takes an unreasonably long time to compile with the optimizer on.
# EDIT - add UTF8PROC_EXPORTS so it builds on Windows.
# LIBPOSTAL_EXPORTS selects the dllexport branch of LIBPOSTAL_EXPORT in export.h.
CFLAGS = -D UTF8PROC_EXPORTS -D LIBPOSTAL_EXPORTS
lib_LTLIBRARIES = libpostal.la
# strndup.c supplies a fallback strndup and is built into the library.
libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
libpostal_la_CFLAGS = $(CFLAGS_O2)
# -no-undefined declares that the library has no unresolved symbols, which libtool needs in order to build a DLL.
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined
dist_bin_SCRIPTS = libpostal_data
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
# On cross-compilation for ARM using gcc-4.7, there are "out of range" errors during compilation that can be fixed by adding
# -marm option. For that, CFLAGS_SCANNER_EXTRA is provided that can be filled during configuration stage (see ./configure --help).
# The drand48 implementation in klib/drand48.c is compiled into this helper library together with the scanner.
noinst_LTLIBRARIES = libscanner.la
libscanner_la_SOURCES = klib/drand48.c scanner.c
libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA)
# program building skipped here
pkginclude_HEADERS = libpostal.h
# Unless configured with --disable-data-download, run the data download as part of the default build.
if DOWNLOAD_DATA
all-local:
${srcdir}/libpostal_data download all $(datadir)/libpostal
endif
# Manual target: regenerate scanner.c from scanner.re with re2c.
lexer: scanner.re
re2c -F -s -b -8 -o scanner.c scanner.re
.PHONY: lexer