Use NEON on ARM hardware via sse2neon.h

The autoconf changes were adapted from:
https://github.com/glennrp/libpng/blob/libpng16/configure.ac
This commit is contained in:
Dino Kovač
2022-04-16 22:48:59 +02:00
parent 893745f09b
commit 6064bc6c06
7 changed files with 9025 additions and 32 deletions

View File

@@ -113,8 +113,6 @@ brew install curl autoconf automake libtool pkg-config
Then to install the C library:
If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.
```
git clone https://github.com/openvenues/libpostal
cd libpostal

View File

@@ -73,19 +73,52 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
# ------------------------------------------------------------------
# Checks for SSE2 build
# Architecture-specific options
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],
[disable SSE2 optimization routines]
)
)
AS_IF([test "x$enable_sse2" != "xno"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
# Pick the default SIMD backend from the host CPU: NEON on ARM hosts, SSE on
# x86 hosts.  A host CPU that matches neither pattern falls through the case
# with both enable_arm_neon and enable_intel_sse left unset.
case "$host_cpu" in
arm*|aarch64*)
# ARM host: NEON on, SSE off.
enable_arm_neon=yes
enable_intel_sse=no
AC_DEFINE([ARM_NEON], [1],
[Enable ARM_NEON optimizations])
;;
i?86|x86_64)
# x86 host: SSE on, NEON off.
enable_intel_sse=yes
enable_arm_neon=no
AC_DEFINE([INTEL_SSE], [1],
[Enable Intel SSE optimizations])
;;
esac
AC_ARG_ENABLE([hardware-optimizations],
   AS_HELP_STRING([[[--disable-hardware-optimizations]]],
      [Disable hardware optimizations (Intel SSE2 / ARM NEON)]),
   [
      # This action runs for --enable-hardware-optimizations as well as for
      # --disable-hardware-optimizations, so inspect the value and only
      # switch the SIMD backends off when the user actually asked for that.
      case "$enableval" in
         no|off)
            # disable hardware optimization on all systems:
            # NOTE(review): the C sources test defined(ARM_NEON) and
            # defined(INTEL_SSE); a macro defined to 0 still counts as
            # defined if config.h is included there -- confirm.
            enable_arm_neon=no
            AC_DEFINE([ARM_NEON], [0],
               [Disable ARM_NEON optimizations])
            enable_intel_sse=no
            AC_DEFINE([INTEL_SSE], [0],
               [Disable INTEL_SSE optimizations])
            ;;
      esac
   ])
# Choose the compiler flags for the selected SIMD backend.  Flags are added
# only on an explicit "yes": on a host CPU that is neither x86 nor ARM both
# enable_* variables are unset, and a comparison against "no" would pass
# ARM -march flags to a compiler that cannot accept them.
SIMDFLAGS=""
AS_IF([test "x$enable_intel_sse" = "xyes"], [
    SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
])
AS_IF([test "x$enable_arm_neon" = "xyes"], [
    SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
])
# Prepend, so flags already present in CFLAGS come later on the command line.
CFLAGS="${SIMDFLAGS} ${CFLAGS}"
# Substitute SIMDFLAGS so that Makefiles can reference it.
AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
AC_ARG_ENABLE([data-download],

View File

@@ -40,7 +40,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
}
if (context->flag & CRF_CONTEXT_MARGINALS) {
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_state = double_matrix_new_aligned(T, L, 16);
if (context->exp_state == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state);
@@ -52,7 +52,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state = double_matrix_new_zeros(T, L);
if (context->mexp_state == NULL) goto exit_context_created;
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
if (context->exp_state_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state_trans);
@@ -64,7 +64,7 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
if (context->mexp_state_trans == NULL) goto exit_context_created;
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_trans = double_matrix_new_aligned(L, L, 16);
if (context->exp_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_trans);
@@ -130,13 +130,13 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {
if (self->flag & CRF_CONTEXT_MARGINALS &&
(
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
!double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
#else
!double_matrix_resize(self->exp_state, T, L) ||
#endif
!double_matrix_resize(self->mexp_state, T, L) ||
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
#else
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
@@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state != NULL) {
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
double_matrix_destroy_aligned(self->exp_state);
#else
double_matrix_destroy(self->exp_state);
@@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state_trans != NULL) {
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
double_matrix_destroy_aligned(self->exp_state_trans);
#else
double_matrix_destroy(self->exp_state_trans);
@@ -216,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_trans != NULL) {
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
double_matrix_destroy_aligned(self->exp_trans);
#else
double_matrix_destroy(self->exp_trans);

8853
src/sse2neon.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -8,8 +8,10 @@
#define ks_lt_index(a, b) ((a).value < (b).value)
#ifdef USE_SSE
#if defined(INTEL_SSE)
#include <emmintrin.h>
#elif defined(ARM_NEON)
#include "sse2neon.h"
#endif
/*
@@ -338,7 +340,7 @@
#ifdef USE_SSE
#if defined(INTEL_SSE) || defined(ARM_NEON)
/*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp
@@ -524,4 +526,4 @@ static inline void remez9_0_log2_sse(double *values, size_t num)
#endif
#endif

View File

@@ -5,7 +5,7 @@ CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
DEFAULT_INCLUDES = -I.. -I/usr/local/include
CFLAGS = $(CFLAGS_BASE)
CFLAGS = $(SIMDFLAGS) $(CFLAGS_BASE)
TESTS = test_libpostal
noinst_PROGRAMS = test_libpostal

View File

@@ -73,19 +73,126 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
# ------------------------------------------------------------------
# Checks for SSE2 build
# Architecture-specific options
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],
[disable SSE2 optimization routines]
)
)
AS_IF([test "x$enable_sse2" != "xno"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
# Pick the default SIMD backend from the host CPU: NEON on ARM hosts, SSE on
# x86 hosts.  A host CPU that matches neither pattern falls through the case
# with both enable_arm_neon and enable_intel_sse left unset; the
# --enable-intel-sse / --enable-arm-neon options below can still set them.
case "$host_cpu" in
arm*|aarch64*)
# ARM host: NEON on, SSE off.
enable_arm_neon=yes
enable_intel_sse=no
AC_DEFINE([ARM_NEON], [1],
[Enable ARM_NEON optimizations])
;;
i?86|x86_64)
# x86 host: SSE on, NEON off.
enable_intel_sse=yes
enable_arm_neon=no
AC_DEFINE([INTEL_SSE], [1],
[Enable Intel SSE optimizations])
;;
esac
AC_ARG_ENABLE([hardware-optimizations],
   AS_HELP_STRING([[[--disable-hardware-optimizations]]],
      [Disable hardware optimizations (Intel SSE2 / ARM NEON)]),
   [
      # This action runs for --enable-hardware-optimizations as well as for
      # --disable-hardware-optimizations, so inspect the value and only
      # switch the SIMD backends off when the user actually asked for that.
      case "$enableval" in
         no|off)
            # disable hardware optimization on all systems:
            # NOTE(review): the C sources test defined(ARM_NEON) and
            # defined(INTEL_SSE); a macro defined to 0 still counts as
            # defined if config.h is included there -- confirm.
            enable_arm_neon=no
            AC_DEFINE([ARM_NEON], [0],
               [Disable ARM_NEON optimizations])
            enable_intel_sse=no
            AC_DEFINE([INTEL_SSE], [0],
               [Disable INTEL_SSE optimizations])
            ;;
      esac
   ])
# INTEL
# =====
#
# INTEL SSE (SIMD) support.
# --enable-intel-sse / --disable-intel-sse: explicit override of the SSE
# default chosen from the host CPU.  Accepts no/off and yes/on; any other
# value aborts configure with an error.
AC_ARG_ENABLE([intel-sse],
AS_HELP_STRING([[[--enable-intel-sse]]],
[Enable Intel SSE optimizations: =no/off, yes/on:]
[no/off: disable the optimizations;]
[yes/on: enable the optimizations.]
[If not specified: determined by the compiler.]),
[case "$enableval" in
no|off)
# Override the host-CPU default and record INTEL_SSE as 0:
AC_DEFINE([INTEL_SSE], [0],
[Disable Intel SSE optimizations])
# Normalise to "no"; the INTEL_SSE conditional and the SIMDFLAGS
# selection later in this file both test this variable:
enable_intel_sse=no;;
yes|on)
# Normalise "on" to "yes" and record INTEL_SSE as 1:
enable_intel_sse=yes
AC_DEFINE([INTEL_SSE], [1],
[Enable Intel SSE optimizations]);;
*)
AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value])
esac])
# Automake conditional INTEL_SSE.  It is true when enable_intel_sse is not
# "no" and either the host CPU is x86 or enable_intel_sse is non-empty, that
# is, SSE was explicitly requested on a host CPU that does not match 'x86*'.
# NOTE(review): no Makefile.am use of this conditional is visible here --
# confirm it is consumed somewhere.
AM_CONDITIONAL([INTEL_SSE],
[test "$enable_intel_sse" != 'no' &&
case "$host_cpu" in
i?86|x86_64) :;;
*) test "$enable_intel_sse" != '';;
esac])
# ARM
# ===
#
# ARM NEON (SIMD) support.
# --enable-arm-neon / --disable-arm-neon: explicit override of the NEON
# default chosen from the host CPU.  Accepts no/off and yes/on; any other
# value aborts configure with an error.  The help text lists only the values
# the case statement below really accepts.
AC_ARG_ENABLE([arm-neon],
   AS_HELP_STRING([[[--enable-arm-neon]]],
      [Enable ARM NEON optimizations: =no/off, yes/on:]
      [no/off: disable the optimizations;]
      [yes/on: enable the optimizations.]
      [If not specified: determined by the compiler.]),
   [case "$enableval" in
      no|off)
         # Override the host-CPU default and record ARM_NEON as 0:
         AC_DEFINE([ARM_NEON], [0],
                   [Disable ARM Neon optimizations])
         # Normalise to "no"; the ARM_NEON conditional and the SIMDFLAGS
         # selection later in this file both test this variable:
         enable_arm_neon=no;;
      yes|on)
         # Normalise "on" to "yes" and record ARM_NEON as 1:
         enable_arm_neon=yes
         AC_DEFINE([ARM_NEON], [1],
                   [Enable ARM Neon optimizations]);;
      *)
         AC_MSG_ERROR([--enable-arm-neon=${enable_arm_neon}: invalid value])
   esac])
# Automake conditional ARM_NEON.  It is true when enable_arm_neon is not "no"
# and either the host CPU is ARM or enable_arm_neon is non-empty, that is,
# NEON was explicitly requested on a host CPU that does not match 'arm*' or
# 'aarch64*'.
# NOTE(review): no Makefile.am use of this conditional is visible here --
# confirm it is consumed somewhere.
AM_CONDITIONAL([ARM_NEON],
[test "$enable_arm_neon" != 'no' &&
case "$host_cpu" in
arm*|aarch64*) :;;
*) test "$enable_arm_neon" != '';;
esac])
# Choose the compiler flags for the selected SIMD backend.  Flags are added
# only on an explicit "yes": on a host CPU that is neither x86 nor ARM, and
# with no --enable-intel-sse / --enable-arm-neon given, both enable_*
# variables are unset, and a comparison against "no" would pass ARM -march
# flags to a compiler that cannot accept them.
SIMDFLAGS=""
AS_IF([test "x$enable_intel_sse" = "xyes"], [
    SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
])
AS_IF([test "x$enable_arm_neon" = "xyes"], [
    SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
])
# Prepend, so flags already present in CFLAGS come later on the command line.
CFLAGS="${SIMDFLAGS} ${CFLAGS}"
# Substitute SIMDFLAGS so that Makefiles can reference it.
AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
AC_ARG_ENABLE([data-download],