From 7a448b718d58cece0384d540067d48bbe4a73774 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 5 Jul 2023 21:02:41 -0400 Subject: [PATCH 1/7] [crf] using 32 bytes for posix_memalign to align blocks of 4 doubles for remez algorithm to fix test which uses an odd-sized context --- src/crf_context.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/crf_context.c b/src/crf_context.c index 0f399a1a..e8635e28 100644 --- a/src/crf_context.c +++ b/src/crf_context.c @@ -41,7 +41,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { if (context->flag & CRF_CONTEXT_MARGINALS) { #if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_state = double_matrix_new_aligned(T, L, 16); + context->exp_state = double_matrix_new_aligned(T, L, 32); if (context->exp_state == NULL) goto exit_context_created; double_matrix_zero(context->exp_state); #else @@ -53,7 +53,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { if (context->mexp_state == NULL) goto exit_context_created; #if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16); + context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32); if (context->exp_state_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_state_trans); #else @@ -65,7 +65,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { if (context->mexp_state_trans == NULL) goto exit_context_created; #if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_trans = double_matrix_new_aligned(L, L, 16); + context->exp_trans = double_matrix_new_aligned(L, L, 32); if (context->exp_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_trans); #else @@ -131,13 +131,13 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) { if (self->flag & CRF_CONTEXT_MARGINALS && ( #if defined(INTEL_SSE) || defined(ARM_NEON) - !double_matrix_resize_aligned(self->exp_state, T, L, 16) || + !double_matrix_resize_aligned(self->exp_state, T, L, 32) || #else !double_matrix_resize(self->exp_state, T, L) || #endif !double_matrix_resize(self->mexp_state, T, L) || #if defined(INTEL_SSE) || defined(ARM_NEON) - !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) || + !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) || #else !double_matrix_resize(self->exp_state_trans, T, L * L) || #endif From 59325c3b13cb941aa0938af22a02294093de728d Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 01:16:22 -0400 Subject: [PATCH 2/7] [test] testing with sse2 disabled to see if the build is working generally --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ea9dca07..b9b0b0c1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data run: | ./bootstrap.sh - ./configure --datadir=$LIBPOSTAL_DATA_DIR + ./configure --datadir=$LIBPOSTAL_DATA_DIR --disable-sse2 make - name: Test run: make check From d979fbb779bfa3236999107d91afb2aa8c5a10c5 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 01:28:49 -0400 Subject: [PATCH 3/7] [test] trying make check in the same step, to see if that makes a difference --- .github/workflows/test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b9b0b0c1..141f37da 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,12 +25,11 @@ jobs: run: | brew update brew install curl autoconf automake libtool pkg-config - - name: Build + - name: Build and Test env: LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data run: | ./bootstrap.sh ./configure --datadir=$LIBPOSTAL_DATA_DIR --disable-sse2 make - - name: Test - run: make check + make check From c76d020c18ce03b0e35f0dc281038a1e8abdcd86 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 01:36:23 -0400 Subject: [PATCH 4/7] [fix] same result running test as a separate step --- .github/workflows/test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 141f37da..b9b0b0c1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,11 +25,12 @@ jobs: run: | brew update brew install curl autoconf automake libtool pkg-config - - name: Build and Test + - name: Build env: LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data run: | ./bootstrap.sh ./configure --datadir=$LIBPOSTAL_DATA_DIR --disable-sse2 make - make check + - name: Test + run: make check From 57eaa414ceadb48d5922099eeaa446b02894a2e4 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 01:49:02 -0400 Subject: [PATCH 5/7] [revert] reverting the commits from #578, leaving header file in repo for the moment --- README.md | 2 ++ configure.ac | 53 +++++++------------------------------------- src/crf_context.c | 16 ++++++------- src/vector_math.h | 6 ++--- test/Makefile.am | 2 +- windows/configure.ac | 53 +++++++------------------------------------- 6 files changed, 29 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index d8e2cb9c..3d641717 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,8 @@ brew install curl autoconf automake libtool pkg-config Then to install the C library: +If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed. + ``` git clone https://github.com/openvenues/libpostal cd libpostal diff --git a/configure.ac b/configure.ac index ed997e32..b7339215 100644 --- a/configure.ac +++ b/configure.ac @@ -84,57 +84,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])]) # ------------------------------------------------------------------ -# Architecture-specific options +# Checks for SSE2 build # ------------------------------------------------------------------ -# allow enabling hardware optimization on any system: -case "$host_cpu" in - arm*|aarch64*) - enable_arm_neon=yes - enable_intel_sse=no - AC_DEFINE([ARM_NEON], [1], - [Enable ARM_NEON optimizations]) - ;; - i?86|x86_64) - enable_intel_sse=yes - enable_arm_neon=no - AC_DEFINE([INTEL_SSE], [1], - [Enable Intel SSE optimizations]) - ;; -esac - -AC_ARG_ENABLE([neon], - AS_HELP_STRING([[[--disable-neon]]], - [Disable ARM NEON hardware optimizations]), - [ - enable_arm_neon=no - AC_DEFINE([ARM_NEON], [0], - [Disable ARM_NEON optimizations]) - ]) - AC_ARG_ENABLE([sse2], - AS_HELP_STRING([[[--disable-sse2]]], - [Disable Intel SSE2 hardware optimizations]), - [ - enable_intel_sse=no - AC_DEFINE([INTEL_SSE], [0], - [Disable INTEL_SSE optimizations]) - ]) + AS_HELP_STRING( + [--disable-sse2], + [disable SSE2 optimization routines] + ) + ) -SIMDFLAGS="" - -AS_IF([test "x$enable_intel_sse" != "xno"], [ - SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE" +AS_IF([test "x$enable_sse2" != "xno"], [ + CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" ]) -AS_IF([test "x$enable_arm_neon" != "xno"], [ - SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON" -]) - -CFLAGS="${SIMDFLAGS} ${CFLAGS}" - -AC_SUBST([SIMDFLAGS], [$SIMDFLAGS]) - AC_CHECK_HEADER(cblas.h, [AX_CBLAS]) AC_ARG_ENABLE([data-download], diff --git a/src/crf_context.c b/src/crf_context.c index e8635e28..8e1a759e 100644 --- a/src/crf_context.c +++ b/src/crf_context.c @@ -40,7 +40,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { } if (context->flag & CRF_CONTEXT_MARGINALS) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) context->exp_state = double_matrix_new_aligned(T, L, 32); if (context->exp_state == NULL) goto exit_context_created; double_matrix_zero(context->exp_state); @@ -52,7 +52,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { context->mexp_state = double_matrix_new_zeros(T, L); if (context->mexp_state == NULL) goto exit_context_created; -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32); if (context->exp_state_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_state_trans); @@ -64,7 +64,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { context->mexp_state_trans = double_matrix_new_zeros(T, L * L); if (context->mexp_state_trans == NULL) goto exit_context_created; -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) context->exp_trans = double_matrix_new_aligned(L, L, 32); if (context->exp_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_trans); @@ -130,13 +130,13 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) { if (self->flag & CRF_CONTEXT_MARGINALS && ( -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) !double_matrix_resize_aligned(self->exp_state, T, L, 32) || #else !double_matrix_resize(self->exp_state, T, L) || #endif !double_matrix_resize(self->mexp_state, T, L) || -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) || #else !double_matrix_resize(self->exp_state_trans, T, L * L) || @@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) { } if (self->exp_state != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_state); #else double_matrix_destroy(self->exp_state); @@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) { } if (self->exp_state_trans != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_state_trans); #else double_matrix_destroy(self->exp_state_trans); @@ -216,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) { } if (self->exp_trans != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_trans); #else double_matrix_destroy(self->exp_trans); diff --git a/src/vector_math.h b/src/vector_math.h index 7dbdb049..eff90466 100644 --- a/src/vector_math.h +++ b/src/vector_math.h @@ -8,10 +8,8 @@ #define ks_lt_index(a, b) ((a).value < (b).value) -#if defined(INTEL_SSE) +#if defined(USE_SSE) #include -#elif defined(ARM_NEON) -#include "sse2neon.h" #endif /* @@ -340,7 +338,7 @@ -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) /* From https://github.com/herumi/fmath/blob/master/fastexp.cpp diff --git a/test/Makefile.am b/test/Makefile.am index 5289e3c2..f2e911f2 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -5,7 +5,7 @@ CFLAGS_O2 = $(CFLAGS_BASE) -O2 CFLAGS_O3 = $(CFLAGS_BASE) -O3 DEFAULT_INCLUDES = -I.. -I/usr/local/include -CFLAGS = $(SIMDFLAGS) $(CFLAGS_BASE) +CFLAGS = $(CFLAGS_BASE) TESTS = test_libpostal noinst_PROGRAMS = test_libpostal diff --git a/windows/configure.ac b/windows/configure.ac index d19cd967..24e73fec 100644 --- a/windows/configure.ac +++ b/windows/configure.ac @@ -73,57 +73,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])]) # ------------------------------------------------------------------ -# Architecture-specific options +# Checks for SSE2 build # ------------------------------------------------------------------ -# allow enabling hardware optimization on any system: -case "$host_cpu" in - arm*|aarch64*) - enable_arm_neon=yes - enable_intel_sse=no - AC_DEFINE([ARM_NEON], [1], - [Enable ARM_NEON optimizations]) - ;; - i?86|x86_64) - enable_intel_sse=yes - enable_arm_neon=no - AC_DEFINE([INTEL_SSE], [1], - [Enable Intel SSE optimizations]) - ;; -esac - -AC_ARG_ENABLE([neon], - AS_HELP_STRING([[[--disable-neon]]], - [Disable ARM NEON hardware optimizations]), - [ - enable_arm_neon=no - AC_DEFINE([ARM_NEON], [0], - [Disable ARM_NEON optimizations]) - ]) - AC_ARG_ENABLE([sse2], - AS_HELP_STRING([[[--disable-sse2]]], - [Disable Intel SSE2 hardware optimizations]), - [ - enable_intel_sse=no - AC_DEFINE([INTEL_SSE], [0], - [Disable INTEL_SSE optimizations]) - ]) + AS_HELP_STRING( + [--disable-sse2], + [disable SSE2 optimization routines] + ) + ) -SIMDFLAGS="" - -AS_IF([test "x$enable_intel_sse" != "xno"], [ - SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE" +AS_IF([test "x$enable_sse2" != "xno"], [ + CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" ]) -AS_IF([test "x$enable_arm_neon" != "xno"], [ - SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON" -]) - -CFLAGS="${SIMDFLAGS} ${CFLAGS}" - -AC_SUBST([SIMDFLAGS], [$SIMDFLAGS]) - AC_CHECK_HEADER(cblas.h, [AX_CBLAS]) AC_ARG_ENABLE([data-download], From 5a1f6df5a90aaad0da81fec20a1f0c11d869a438 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 02:05:17 -0400 Subject: [PATCH 6/7] [sse] ok something about that PR breaks either way. Now trying it with SSE --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b9b0b0c1..ea9dca07 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data run: | ./bootstrap.sh - ./configure --datadir=$LIBPOSTAL_DATA_DIR --disable-sse2 + ./configure --datadir=$LIBPOSTAL_DATA_DIR make - name: Test run: make check From 7bdcf96c9d9c61811ffd4570ba9fbbac5ffd237f Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 6 Jul 2023 16:00:55 -0400 Subject: [PATCH 7/7] [memalign] no more realloc on aligned pointers, just do an aligned malloc and copy to it. Slower but safe and this is not called that often in practice, usually to resize larger matrices. --- src/vector.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/vector.h b/src/vector.h index 78a0fad4..462a8baf 100644 --- a/src/vector.h +++ b/src/vector.h @@ -21,27 +21,19 @@ static inline void *_aligned_realloc(void *p, size_t size, size_t alignment) return NULL; } - if (size == 0) { + if (p == NULL) { return NULL; } - void *rp = realloc(p, size); - - /* If realloc result is not already at an aligned boundary, - _aligned_malloc a new block and copy the contents of the realloc'd - pointer to the aligned block, free the realloc'd pointer and return - the aligned pointer. - */ - if ( ((size_t)rp & (alignment - 1)) != 0) { - void *p1 = _aligned_malloc(size, alignment); - if (p1 != NULL) { - memcpy(p1, rp, size); - } - free(rp); - rp = p1; + void *p1 = _aligned_malloc(size, alignment); + if (p1 == NULL) { + free(p); + return NULL; } - return rp; + memcpy(p1, p, size); + free(p); + return p1; } static inline void _aligned_free(void *p) {