Merge branch 'master' into patch-1
This commit is contained in:
36
.github/workflows/test.yml
vendored
Normal file
36
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
name: Test
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [master]
|
||||||
|
pull_request:
|
||||||
|
branches: [master]
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build_and_test:
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
os: [ubuntu-latest, macos-latest]
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- name: Install Dependencies Linux
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update -y
|
||||||
|
sudo apt-get install curl autoconf automake libtool pkg-config
|
||||||
|
- name: Install Dependencies MacOS
|
||||||
|
if: runner.os == 'macOS'
|
||||||
|
run: |
|
||||||
|
brew update
|
||||||
|
brew install curl autoconf automake libtool pkg-config
|
||||||
|
- name: Build
|
||||||
|
env:
|
||||||
|
LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data
|
||||||
|
run: |
|
||||||
|
./bootstrap.sh
|
||||||
|
./configure --datadir=$LIBPOSTAL_DATA_DIR
|
||||||
|
make
|
||||||
|
- name: Test
|
||||||
|
run: make check
|
||||||
83
.travis.yml
83
.travis.yml
@@ -1,83 +0,0 @@
|
|||||||
language: c
|
|
||||||
branches:
|
|
||||||
only:
|
|
||||||
- master
|
|
||||||
env:
|
|
||||||
global:
|
|
||||||
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
|
|
||||||
- secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
|
|
||||||
- secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
|
|
||||||
- secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
|
|
||||||
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
|
|
||||||
- GH_REF=github.com/openvenues/libpostal
|
|
||||||
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
|
|
||||||
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
|
|
||||||
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
|
||||||
- TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER
|
|
||||||
- SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz
|
|
||||||
- LIBPOSTAL_DATA_DIR=$(pwd)/data
|
|
||||||
- LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz
|
|
||||||
compiler:
|
|
||||||
- clang
|
|
||||||
- gcc
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
packages:
|
|
||||||
- gcc-4.8
|
|
||||||
- pkg-config
|
|
||||||
before_script:
|
|
||||||
- ./bootstrap.sh
|
|
||||||
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
|
|
||||||
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
|
|
||||||
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
|
|
||||||
install:
|
|
||||||
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
|
|
||||||
script:
|
|
||||||
- ./configure --datadir=$LIBPOSTAL_DATA_DIR
|
|
||||||
- make -j4
|
|
||||||
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
|
|
||||||
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
|
|
||||||
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
|
|
||||||
- make check
|
|
||||||
|
|
||||||
after_success:
|
|
||||||
- |
|
|
||||||
if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then
|
|
||||||
if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
|
|
||||||
export PATH=$PATH:env/bin/;
|
|
||||||
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
|
|
||||||
cp src/*_data.c _travis/src
|
|
||||||
echo "$TAG_VERSION" > _travis/versions/base_data
|
|
||||||
cd _travis
|
|
||||||
git config user.name "$GIT_COMMITTER_NAME";
|
|
||||||
git config user.email "$GIT_COMMITTER_EMAIL";
|
|
||||||
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
|
|
||||||
git push --quiet origin master;
|
|
||||||
|
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS
|
|
||||||
fi
|
|
||||||
git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER";
|
|
||||||
git push --tags --quiet origin master;
|
|
||||||
fi;
|
|
||||||
|
|
||||||
before_deploy:
|
|
||||||
- make dist
|
|
||||||
|
|
||||||
deploy:
|
|
||||||
- provider: releases
|
|
||||||
file:
|
|
||||||
- "$SRC_TARBALL_FILENAME"
|
|
||||||
on:
|
|
||||||
tags: true
|
|
||||||
branch: master
|
|
||||||
skip_cleanup: true
|
|
||||||
- provider: releases
|
|
||||||
file:
|
|
||||||
- "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME"
|
|
||||||
on:
|
|
||||||
tags: true
|
|
||||||
branch: master
|
|
||||||
condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )"
|
|
||||||
skip_cleanup: true
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
## Submitting Issues
|
## Submitting Issues
|
||||||
|
|
||||||
When submitting issues to libpostal, please repeect these guildelines:
|
When submitting issues to libpostal, please respect these guidelines:
|
||||||
|
|
||||||
- Be constructive. Try to help solve the problem.
|
- Be constructive. Try to help solve the problem.
|
||||||
- Always search for existing issues before submitting one.
|
- Always search for existing issues before submitting one.
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.
|
|||||||
|
|
||||||
---
|
---
|
||||||
#### Here's how I'm using libpostal
|
#### Here's how I'm using libpostal
|
||||||
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? -->
|
<!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->
|
||||||
|
|
||||||
---
|
---
|
||||||
#### Here's what I did
|
#### Here's what I did
|
||||||
|
|||||||
60
README.md
60
README.md
@@ -1,6 +1,6 @@
|
|||||||
# libpostal: international street address NLP
|
# libpostal: international street address NLP
|
||||||
|
|
||||||
[](https://travis-ci.org/openvenues/libpostal)
|
[](https://github.com/openvenues/libpostal/actions)
|
||||||
[](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
|
[](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
|
||||||
[](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
[](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
||||||
[](#sponsors)
|
[](#sponsors)
|
||||||
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:
|
|||||||
|
|
||||||
**On Ubuntu/Debian**
|
**On Ubuntu/Debian**
|
||||||
```
|
```
|
||||||
sudo apt-get install curl autoconf automake libtool pkg-config
|
sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
|
||||||
```
|
```
|
||||||
|
|
||||||
**On CentOS/RHEL**
|
**On CentOS/RHEL**
|
||||||
@@ -106,19 +106,46 @@ sudo apt-get install curl autoconf automake libtool pkg-config
|
|||||||
sudo yum install curl autoconf automake libtool pkgconfig
|
sudo yum install curl autoconf automake libtool pkgconfig
|
||||||
```
|
```
|
||||||
|
|
||||||
**On Mac OSX**
|
**On macOS**
|
||||||
|
|
||||||
|
Install with one command via [MacPorts](https://www.macports.org/):
|
||||||
|
```
|
||||||
|
port install libpostal
|
||||||
|
```
|
||||||
|
|
||||||
|
Or as follows with [Homebrew](https://brew.sh/):
|
||||||
|
|
||||||
```
|
```
|
||||||
brew install curl autoconf automake libtool pkg-config
|
brew install curl autoconf automake libtool pkg-config
|
||||||
```
|
```
|
||||||
|
|
||||||
Then to install the C library:
|
Then to install the C library:
|
||||||
|
|
||||||
|
If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.
|
||||||
|
|
||||||
```
|
```
|
||||||
git clone https://github.com/openvenues/libpostal
|
git clone https://github.com/openvenues/libpostal
|
||||||
cd libpostal
|
cd libpostal
|
||||||
|
|
||||||
|
# skip if installing for the first time
|
||||||
|
make distclean
|
||||||
|
|
||||||
./bootstrap.sh
|
./bootstrap.sh
|
||||||
./configure --datadir=[...some dir with a few GB of space...]
|
|
||||||
|
# omit --datadir flag to install data in current directory
|
||||||
|
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
|
||||||
make -j4
|
make -j4
|
||||||
|
|
||||||
|
# For Intel/AMD processors and the default model
|
||||||
|
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
|
||||||
|
|
||||||
|
# For Apple / ARM cpus and the default model
|
||||||
|
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] --disable-sse2
|
||||||
|
|
||||||
|
# For the improved Senzing model:
|
||||||
|
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] MODEL=senzing
|
||||||
|
|
||||||
|
make -j8
|
||||||
sudo make install
|
sudo make install
|
||||||
|
|
||||||
# On Linux it's probably a good idea to run
|
# On Linux it's probably a good idea to run
|
||||||
@@ -400,23 +427,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
|
|||||||
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
|
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
|
||||||
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
|
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
|
||||||
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
|
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
|
||||||
|
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
|
||||||
|
- Rust: [rust-postal](https://github.com/pnordahl/rust-postal)
|
||||||
- Rust: [rustpostal](https://crates.io/crates/rustpostal)
|
- Rust: [rustpostal](https://crates.io/crates/rustpostal)
|
||||||
|
|
||||||
**Database extensions**
|
**Unofficial database extensions**
|
||||||
|
|
||||||
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
|
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
|
||||||
|
|
||||||
**Unofficial REST API**
|
**Unofficial servers**
|
||||||
|
|
||||||
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)
|
- Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
|
||||||
|
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
|
||||||
**Libpostal REST Docker**
|
- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)
|
||||||
|
|
||||||
- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
|
|
||||||
|
|
||||||
**Libpostal ZeroMQ Docker**
|
|
||||||
|
|
||||||
- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
|
|
||||||
|
|
||||||
|
|
||||||
Tests
|
Tests
|
||||||
@@ -491,7 +514,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent.
|
|||||||
for a wide variety of countries and languages, not just US/English.
|
for a wide variety of countries and languages, not just US/English.
|
||||||
The model is trained on over 1 billion addresses and address-like strings, using the
|
The model is trained on over 1 billion addresses and address-like strings, using the
|
||||||
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
|
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
|
||||||
tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
|
tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
|
||||||
are performed to make the training data resemble real messy geocoder input as closely as possible.
|
are performed to make the training data resemble real messy geocoder input as closely as possible.
|
||||||
|
|
||||||
- **Language classification**: multinomial logistic regression
|
- **Language classification**: multinomial logistic regression
|
||||||
@@ -513,7 +536,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.
|
|||||||
|
|
||||||
- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
|
- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
|
||||||
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
|
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
|
||||||
languages chracter by character instead of on whitespace.
|
languages character by character instead of on whitespace.
|
||||||
|
|
||||||
- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
|
- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
|
||||||
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
|
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
|
||||||
@@ -537,6 +560,7 @@ Non-goals
|
|||||||
|
|
||||||
- Verifying that a location is a valid address
|
- Verifying that a location is a valid address
|
||||||
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
|
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
|
||||||
|
- Extracting addresses from free text
|
||||||
|
|
||||||
Raison d'être
|
Raison d'être
|
||||||
-------------
|
-------------
|
||||||
@@ -642,7 +666,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
|
|||||||
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy
|
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy
|
||||||
- Efficient existing implementations for simple things like hashtables
|
- Efficient existing implementations for simple things like hashtables
|
||||||
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
|
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
|
||||||
- Data structrues take advantage of sparsity as much as possible
|
- Data structures take advantage of sparsity as much as possible
|
||||||
- Efficient double-array trie implementation for most string dictionaries
|
- Efficient double-array trie implementation for most string dictionaries
|
||||||
- Cross-platform as much as possible, particularly for *nix
|
- Cross-platform as much as possible, particularly for *nix
|
||||||
|
|
||||||
|
|||||||
53
configure.ac
53
configure.ac
@@ -84,57 +84,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
|
|||||||
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Architecture-specific options
|
# Checks for SSE2 build
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
# allow enabling hardware optimization on any system:
|
|
||||||
case "$host_cpu" in
|
|
||||||
arm*|aarch64*)
|
|
||||||
enable_arm_neon=yes
|
|
||||||
enable_intel_sse=no
|
|
||||||
AC_DEFINE([ARM_NEON], [1],
|
|
||||||
[Enable ARM_NEON optimizations])
|
|
||||||
;;
|
|
||||||
i?86|x86_64)
|
|
||||||
enable_intel_sse=yes
|
|
||||||
enable_arm_neon=no
|
|
||||||
AC_DEFINE([INTEL_SSE], [1],
|
|
||||||
[Enable Intel SSE optimizations])
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
AC_ARG_ENABLE([neon],
|
|
||||||
AS_HELP_STRING([[[--disable-neon]]],
|
|
||||||
[Disable ARM NEON hardware optimizations]),
|
|
||||||
[
|
|
||||||
enable_arm_neon=no
|
|
||||||
AC_DEFINE([ARM_NEON], [0],
|
|
||||||
[Disable ARM_NEON optimizations])
|
|
||||||
])
|
|
||||||
|
|
||||||
AC_ARG_ENABLE([sse2],
|
AC_ARG_ENABLE([sse2],
|
||||||
AS_HELP_STRING([[[--disable-sse2]]],
|
AS_HELP_STRING(
|
||||||
[Disable Intel SSE2 hardware optimizations]),
|
[--disable-sse2],
|
||||||
[
|
[disable SSE2 optimization routines]
|
||||||
enable_intel_sse=no
|
)
|
||||||
AC_DEFINE([INTEL_SSE], [0],
|
)
|
||||||
[Disable INTEL_SSE optimizations])
|
|
||||||
])
|
|
||||||
|
|
||||||
SIMDFLAGS=""
|
AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
|
||||||
|
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
|
||||||
AS_IF([test "x$enable_intel_sse" != "xno"], [
|
|
||||||
SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
|
|
||||||
])
|
])
|
||||||
|
|
||||||
AS_IF([test "x$enable_arm_neon" != "xno"], [
|
|
||||||
SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
|
|
||||||
])
|
|
||||||
|
|
||||||
CFLAGS="${SIMDFLAGS} ${CFLAGS}"
|
|
||||||
|
|
||||||
AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])
|
|
||||||
|
|
||||||
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
||||||
|
|
||||||
AC_ARG_ENABLE([data-download],
|
AC_ARG_ENABLE([data-download],
|
||||||
|
|||||||
@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
|
|||||||
[], [-lblas])])
|
[], [-lblas])])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# BLAS in OpenBLAS library?
|
||||||
|
if test $ax_cblas_ok = no; then
|
||||||
|
AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
|
||||||
|
fi
|
||||||
|
|
||||||
# Generic CBLAS library?
|
# Generic CBLAS library?
|
||||||
if test $ax_cblas_ok = no; then
|
if test $ax_cblas_ok = no; then
|
||||||
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
|
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Generic BLAS library?
|
||||||
|
if test $ax_cblas_ok = no; then
|
||||||
|
AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
|
||||||
|
fi
|
||||||
|
|
||||||
AC_SUBST(CBLAS_LIBS)
|
AC_SUBST(CBLAS_LIBS)
|
||||||
|
|
||||||
LIBS="$ax_cblas_save_LIBS"
|
LIBS="$ax_cblas_save_LIBS"
|
||||||
|
|||||||
@@ -63,10 +63,23 @@ numbers:
|
|||||||
|
|
||||||
|
|
||||||
house_numbers:
|
house_numbers:
|
||||||
|
gebaude: &gebaude
|
||||||
|
canonical: gebäude
|
||||||
|
abbreviated: geb
|
||||||
|
sample: true
|
||||||
|
canonical_probability: 0.5
|
||||||
|
abbreviated_probability: 0.5
|
||||||
|
sample_probability: 0.05
|
||||||
|
numeric:
|
||||||
|
direction: left
|
||||||
alphanumeric:
|
alphanumeric:
|
||||||
default: *nummer
|
default: *nummer
|
||||||
|
probability: 0.95
|
||||||
|
alternatives:
|
||||||
|
- alternative: *gebaude
|
||||||
|
probability: 0.05
|
||||||
|
|
||||||
alphanumeric_phrase_probability: 0.0001
|
alphanumeric_phrase_probability: 0.05
|
||||||
|
|
||||||
conscription_numbers:
|
conscription_numbers:
|
||||||
alphanumeric:
|
alphanumeric:
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ numbers:
|
|||||||
|
|
||||||
|
|
||||||
house_numbers:
|
house_numbers:
|
||||||
budnyok: &budnyok
|
budynok: &budynok
|
||||||
canonical: будинок
|
canonical: будинок
|
||||||
abbreviated: буд
|
abbreviated: буд
|
||||||
sample: true
|
sample: true
|
||||||
@@ -58,8 +58,8 @@ house_numbers:
|
|||||||
sample_probability: 0.1
|
sample_probability: 0.1
|
||||||
numeric:
|
numeric:
|
||||||
direction: left
|
direction: left
|
||||||
budnyok_latin: &budnyok_latin
|
budynok_latin: &budynok_latin
|
||||||
canonical: budnyok
|
canonical: budynok
|
||||||
abbreviated: bud
|
abbreviated: bud
|
||||||
sample: true
|
sample: true
|
||||||
canonical_probability: 0.6
|
canonical_probability: 0.6
|
||||||
@@ -88,10 +88,10 @@ house_numbers:
|
|||||||
direction: left
|
direction: left
|
||||||
|
|
||||||
alphanumeric:
|
alphanumeric:
|
||||||
default: *budnyok
|
default: *budynok
|
||||||
probability: 0.65
|
probability: 0.65
|
||||||
alternatives:
|
alternatives:
|
||||||
- alternative: *budnyok_latin
|
- alternative: *budynok_latin
|
||||||
probability: 0.05
|
probability: 0.05
|
||||||
- alternative: *dom
|
- alternative: *dom
|
||||||
probability: 0.25
|
probability: 0.25
|
||||||
|
|||||||
@@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
overrides:
|
overrides:
|
||||||
id:
|
id:
|
||||||
|
relation:
|
||||||
# Buenos Aires (state boundary coterminous with city)
|
# Buenos Aires (state boundary coterminous with city)
|
||||||
"3082668": null
|
"3082668": null
|
||||||
contained_by:
|
contained_by:
|
||||||
|
|||||||
@@ -132,6 +132,7 @@ falls|fls
|
|||||||
fare
|
fare
|
||||||
farm|frm
|
farm|frm
|
||||||
farms|frms
|
farms|frms
|
||||||
|
farm to market|fm|farm-to-market
|
||||||
fern
|
fern
|
||||||
ferry|fry|fy
|
ferry|fry|fy
|
||||||
field|fld|fd
|
field|fld|fd
|
||||||
|
|||||||
5
resources/dictionaries/hi/building_types.txt
Normal file
5
resources/dictionaries/hi/building_types.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
mandir|मन्दिर|मंदिर
|
||||||
|
station
|
||||||
|
police station
|
||||||
|
post office
|
||||||
|
office
|
||||||
4
resources/dictionaries/hi/directionals.txt
Normal file
4
resources/dictionaries/hi/directionals.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
dakshin|दक्षिण
|
||||||
|
uttar|उत्तर
|
||||||
|
poorva|poorav|पूर्व
|
||||||
|
paschim|पश्चिम
|
||||||
1
resources/dictionaries/hi/people.txt
Normal file
1
resources/dictionaries/hi/people.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
mahatma gandhi|mg|m g
|
||||||
24
resources/dictionaries/hi/personal_titles.txt
Normal file
24
resources/dictionaries/hi/personal_titles.txt
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
baba
|
||||||
|
babu
|
||||||
|
bhagat
|
||||||
|
guru
|
||||||
|
jagirdar
|
||||||
|
maharaja|maharaj
|
||||||
|
mahatma|महात्मा
|
||||||
|
pandit
|
||||||
|
raja
|
||||||
|
rajarshi
|
||||||
|
rajkumar
|
||||||
|
rajkumari
|
||||||
|
rani
|
||||||
|
rishi
|
||||||
|
sahib
|
||||||
|
sant
|
||||||
|
sardar
|
||||||
|
senapati
|
||||||
|
shah
|
||||||
|
shrimati|smt|srimathi|श्रीमती
|
||||||
|
shri|shree|sri|श्री
|
||||||
|
sushri
|
||||||
|
swami
|
||||||
|
ustad
|
||||||
3
resources/dictionaries/hi/qualifiers.txt
Normal file
3
resources/dictionaries/hi/qualifiers.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
nagar|नगर
|
||||||
|
colony
|
||||||
|
cantonment|cantt
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
bazaar|bazar
|
bazaar|bazar
|
||||||
marg
|
marg
|
||||||
nagar
|
flyover
|
||||||
|
रोड
|
||||||
@@ -1,2 +1,2 @@
|
|||||||
félemelet|felemelet
|
félemelet|felemelet|félem|1/2 em|1/2em
|
||||||
magasföldszint|magasfoldszint
|
magasföldszint|magasfoldszint|mgfszt|mgfsz|mfszt|mfsz
|
||||||
@@ -1 +1,2 @@
|
|||||||
szent|szt
|
szent|szt
|
||||||
|
idősebb|id
|
||||||
|
|||||||
@@ -1,21 +1,34 @@
|
|||||||
árok|arok
|
árok|arok
|
||||||
dűlő|dulo
|
dűlő|dulo|d.|d
|
||||||
fasor
|
fasor
|
||||||
|
fasora
|
||||||
|
főközlekedési út|főút|fout
|
||||||
|
határút|hatarut
|
||||||
|
kapu
|
||||||
kert
|
kert
|
||||||
körönd|korond|krnd
|
körönd|korond|krnd
|
||||||
|
körvasútsor|korvasutsor
|
||||||
körút|korut|krt
|
körút|korut|krt
|
||||||
köz|koz
|
köz|koz
|
||||||
|
lakótelep|lakotelep|ltp.|ltp
|
||||||
lejtő|lejto
|
lejtő|lejto
|
||||||
lépcső|lepcso
|
lépcső|lepcso
|
||||||
liget
|
liget
|
||||||
mező|mezo
|
mező|mezo
|
||||||
|
országút|orszagut
|
||||||
park
|
park
|
||||||
rakpart|rpt
|
parkja
|
||||||
sétány|setany
|
rakpart|rkpt|rkp|rpt
|
||||||
sor
|
sétány|setany|stny.|stny
|
||||||
sugárút|sugarut
|
sor|s.|s
|
||||||
|
sétány|setany|sét
|
||||||
|
sugárút|sugarut|sgrt.|sgrt|srt.|srt|sgt.|sgt
|
||||||
|
sziget
|
||||||
|
telep
|
||||||
tér|ter
|
tér|ter
|
||||||
tere
|
tere
|
||||||
utca|u
|
tanya|t.|t
|
||||||
út|ut
|
udvar
|
||||||
|
utca|u.|u
|
||||||
|
út|ut|u.|u
|
||||||
útja|utja
|
útja|utja
|
||||||
3
resources/dictionaries/ms/toponyms.txt
Normal file
3
resources/dictionaries/ms/toponyms.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
kuala lumpur|federal territory kuala lumpur|federal territory of kuala lumpur|wilayah persekutuan kuala lumpur|kl
|
||||||
|
labuan|federal territory labuan|federal territory of labuan|wilayah persekutuan labuan
|
||||||
|
putrajaya|federal territory putrajaya|federal territory of putrajaya|wilayah persekutuan putrajaya
|
||||||
@@ -1,12 +1,18 @@
|
|||||||
aleja|al
|
aleja|al
|
||||||
autostrada
|
autostrada
|
||||||
boczna
|
boczna
|
||||||
bulwar
|
bulwar|bulw
|
||||||
droga
|
droga
|
||||||
obwodnica
|
obwodnica
|
||||||
|
ogród
|
||||||
|
osiedle|os
|
||||||
|
park
|
||||||
plac|pl
|
plac|pl
|
||||||
rondo
|
rondo
|
||||||
rynek
|
rynek
|
||||||
|
skwer
|
||||||
szosa
|
szosa
|
||||||
ulica|ul
|
ulica|ul
|
||||||
|
wybrzeże|wyb
|
||||||
|
wyspa
|
||||||
zaulek
|
zaulek
|
||||||
@@ -10,10 +10,10 @@ calçada|calcada|cc
|
|||||||
calçadinha|caclcadinha|ccnh
|
calçadinha|caclcadinha|ccnh
|
||||||
câmara municipal|camara municipal|cm|c.m.|c. m.
|
câmara municipal|camara municipal|cm|c.m.|c. m.
|
||||||
caminho|cam|camno
|
caminho|cam|camno
|
||||||
direito|dto
|
direito|dto|dt
|
||||||
esquerdo|esq
|
esquerdo|esq
|
||||||
estrada|estr
|
estrada|estr
|
||||||
astrada marginal|estr marg
|
estrada marginal|estr marg
|
||||||
estrada municipal|em|e m|estr m
|
estrada municipal|em|e m|estr m
|
||||||
estrada nacional|en|e n|estr n
|
estrada nacional|en|e n|estr n
|
||||||
estrada regional|er|e r|estr r
|
estrada regional|er|e r|estr r
|
||||||
|
|||||||
6
resources/dictionaries/ro/building_types.txt
Normal file
6
resources/dictionaries/ro/building_types.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
anexa
|
||||||
|
bloc|blc|bl
|
||||||
|
casa
|
||||||
|
cladirea|cladire
|
||||||
|
complex
|
||||||
|
garaj
|
||||||
5
resources/dictionaries/ro/company_types.txt
Normal file
5
resources/dictionaries/ro/company_types.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
banca
|
||||||
|
organizatie neguvernamentala|ong
|
||||||
|
societate comerciala|sc
|
||||||
|
societate cu raspundere limitata|srl
|
||||||
|
societate pe actiuni|sa
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
&
|
&
|
||||||
colț|colt
|
colț|colt|colț cu|colt cu
|
||||||
între|intre
|
între|intre
|
||||||
la colțul de pe|la coltul de pe
|
la colțul de pe|la coltul de pe
|
||||||
și|si
|
și|si
|
||||||
@@ -1 +1 @@
|
|||||||
intrare
|
intrare|intrarea
|
||||||
|
|||||||
@@ -4,4 +4,4 @@ din
|
|||||||
in apropiere de
|
in apropiere de
|
||||||
în apropiere|in apropiere
|
în apropiere|in apropiere
|
||||||
în jurul aici|in jurul aici
|
în jurul aici|in jurul aici
|
||||||
lângă mine|langa mine
|
lângă mine|langa mine|lângă|langa
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
număr|numar|nr|nº|n°|#|№|no
|
număr|numar|nr|nº|n°|#|№|no|numarul|numărul
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ general|gen
|
|||||||
major|maj
|
major|maj
|
||||||
locotenent
|
locotenent
|
||||||
locotenent colonel
|
locotenent colonel
|
||||||
|
pictor
|
||||||
profesor|prof
|
profesor|prof
|
||||||
sergent
|
sergent
|
||||||
sublocotenent
|
sublocotenent
|
||||||
|
|||||||
3
resources/dictionaries/ro/place_names.txt
Normal file
3
resources/dictionaries/ro/place_names.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
cinema
|
||||||
|
cafenea
|
||||||
|
fabrica
|
||||||
@@ -1 +1,7 @@
|
|||||||
bloc|bl
|
bloc|bl
|
||||||
|
cartier|cartierul
|
||||||
|
comuna|comunā
|
||||||
|
kilometrul|kilometru|km
|
||||||
|
sat|satul
|
||||||
|
sector|sectorul|sect
|
||||||
|
zona
|
||||||
|
|||||||
@@ -1,2 +1,3 @@
|
|||||||
și|si|&
|
și|si|&
|
||||||
cel
|
cel
|
||||||
|
intre
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
aleea|ale|alea|al
|
aleea|ale|alea|al
|
||||||
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
|
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
|
||||||
calea|cal
|
calea|cale|cal
|
||||||
drumul
|
drumul|drum
|
||||||
fundătura|fundatura|fnd
|
fundătura|fundatura|fnd
|
||||||
fundacul|fdc
|
fundacul|fdc
|
||||||
intrarea|int|intr
|
intrarea|int|intr
|
||||||
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
|
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
|
||||||
strada|str
|
strada|str|st
|
||||||
stradela|str-la|sdla
|
stradela|str-la|sdla
|
||||||
șoseaua|soseaua|sos|șos
|
șoseaua|soseaua|sos|șos
|
||||||
splaiul|sp|spl
|
splaiul|sp|spl
|
||||||
|
|||||||
1
resources/dictionaries/ro/synonyms.txt
Normal file
1
resources/dictionaries/ro/synonyms.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
decembrie|dec
|
||||||
@@ -1,4 +1,8 @@
|
|||||||
apartament|ap|apt|apart
|
apartamentul|apartament|ap|apt|apart
|
||||||
birou
|
birou
|
||||||
|
cladire|cladirea|clădire|clădirea
|
||||||
|
corp|corpul
|
||||||
|
complex
|
||||||
|
interior|int
|
||||||
lotul
|
lotul
|
||||||
sală|sala
|
sală|sala
|
||||||
@@ -1,2 +1,4 @@
|
|||||||
вход
|
вход
|
||||||
vkhod
|
vkhod
|
||||||
|
подъезд
|
||||||
|
pod'ezd
|
||||||
|
|||||||
@@ -6,3 +6,5 @@ kvartal|kvart|kv|kv-l
|
|||||||
oblast|obl
|
oblast|obl
|
||||||
район|р-н
|
район|р-н
|
||||||
raion|r-n
|
raion|r-n
|
||||||
|
місто|міс|м
|
||||||
|
misto|mis|m
|
||||||
|
|||||||
93
resources/states/my.yaml
Normal file
93
resources/states/my.yaml
Normal file
@@ -0,0 +1,93 @@
|
|||||||
# Malaysian state and federal-territory names and abbreviations, each mapped
# to the canonical name in English (en) and Malay (ms).
# Every key must appear exactly once: duplicate mapping keys are invalid YAML
# and are resolved silently (last one wins) by lenient parsers.

# Federal territories, long-form names
"federal territory kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"federal territory of kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"wilayah persekutuan kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"federal territory labuan":
  en: Labuan
  ms: Labuan
"federal territory of labuan":
  en: Labuan
  ms: Labuan
"wilayah persekutuan labuan":
  en: Labuan
  ms: Labuan
"federal territory putrajaya":
  en: Putrajaya
  ms: Putrajaya
"federal territory of putrajaya":
  en: Putrajaya
  ms: Putrajaya
"wilayah persekutuan putrajaya":
  en: Putrajaya
  ms: Putrajaya

# Penang: English and Malay spellings
"pulau pinang":
  en: Penang
  ms: Pulau Pinang
"penang":
  en: Penang
  ms: Pulau Pinang

# Three-letter abbreviations
JHR:
  en: Johor
  ms: Johor
KDH:
  en: Kedah
  ms: Kedah
KTN:
  en: Kelantan
  ms: Kelantan
MLK:
  en: Melaka
  ms: Melaka
NSN:
  en: Negeri Sembilan
  ms: Negeri Sembilan
PHG:
  en: Pahang
  ms: Pahang
PRK:
  en: Perak
  ms: Perak
PLS:
  en: Perlis
  ms: Perlis
PNG:
  en: Penang
  ms: Pulau Pinang
SBH:
  en: Sabah
  ms: Sabah
SWK:
  en: Sarawak
  ms: Sarawak
SGR:
  en: Selangor
  ms: Selangor
TRG:
  en: Terengganu
  ms: Terengganu
KUL:
  en: Kuala Lumpur
  ms: Kuala Lumpur
LBN:
  en: Labuan
  ms: Labuan
PJY:
  en: Putrajaya
  ms: Putrajaya

# Two-letter abbreviations for the federal territories
KL:
  en: Kuala Lumpur
  ms: Kuala Lumpur
LB:
  en: Labuan
  ms: Labuan
PY:
  en: Putrajaya
  ms: Putrajaya
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import argparse
|
import argparse
|
||||||
|
import fnmatch
|
||||||
import logging
|
import logging
|
||||||
import operator
|
import operator
|
||||||
import os
|
import os
|
||||||
@@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components
|
|||||||
from geodata.osm.definitions import osm_definitions
|
from geodata.osm.definitions import osm_definitions
|
||||||
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
|
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
|
||||||
from geodata.polygons.index import *
|
from geodata.polygons.index import *
|
||||||
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder
|
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
|
||||||
from geodata.statistics.tf_idf import IDFIndex
|
from geodata.statistics.tf_idf import IDFIndex
|
||||||
|
|
||||||
|
|
||||||
@@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
|
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
|
||||||
tests for neighborhoods. The properties vary by source but each has
|
tests for neighborhoods. The properties vary by source but each has
|
||||||
source has least a "name" key which in practice is what we care about.
|
source has least a "name" key which in practice is what we care about.
|
||||||
|
|
||||||
|
Quattroshapes data is no longer accessible and has been replaced by
|
||||||
|
WhosOnFirst.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
PRIORITIES_FILENAME = 'priorities.json'
|
PRIORITIES_FILENAME = 'priorities.json'
|
||||||
@@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
source_priorities = {
|
source_priorities = {
|
||||||
'osm': 0, # Best names/polygons, same coordinate system
|
'osm': 0, # Best names/polygons, same coordinate system
|
||||||
'osm_cth': 1, # Prefer the OSM names if possible
|
'osm_cth': 1, # Prefer the OSM names if possible
|
||||||
'clickthathood': 2, # Better names/polygons than Quattroshapes
|
'clickthathood': 2, # Better names/polygons than WhosOnFirst
|
||||||
'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon
|
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
|
||||||
'quattroshapes': 4, # Good results in some countries/areas
|
'wof': 4, # Replacement of Quattroshapes
|
||||||
}
|
}
|
||||||
|
|
||||||
level_priorities = {
|
level_priorities = {
|
||||||
@@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
}
|
}
|
||||||
|
|
||||||
regex_replacements = [
|
regex_replacements = [
|
||||||
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes
|
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
|
||||||
(re.compile('^paris-(?=[\d])', re.I), ''),
|
(re.compile('^paris-(?=[\d])', re.I), ''),
|
||||||
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
|
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
|
||||||
]
|
]
|
||||||
@@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
|
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
|
||||||
'''
|
'''
|
||||||
Given an OSM file (planet or some other bounds) containing neighborhoods
|
Given an OSM file (planet or some other bounds) containing neighborhoods
|
||||||
as points (some suburbs have boundaries)
|
as points (some suburbs have boundaries)
|
||||||
@@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
|
|
||||||
logger = logging.getLogger('neighborhoods')
|
logger = logging.getLogger('neighborhoods')
|
||||||
|
|
||||||
qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
|
|
||||||
ensure_dir(qs_scratch_dir)
|
|
||||||
|
|
||||||
logger.info('Creating ClickThatHood neighborhoods')
|
logger.info('Creating ClickThatHood neighborhoods')
|
||||||
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
|
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
|
||||||
|
|
||||||
logger.info('Creating OSM neighborhoods')
|
logger.info('Creating OSM neighborhoods')
|
||||||
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
|
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
|
||||||
|
|
||||||
logger.info('Creating Quattroshapes neighborhoods')
|
logger.info('Creating WhosOnFirst neighborhoods')
|
||||||
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
|
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
|
||||||
|
|
||||||
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
|
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
|
||||||
|
|
||||||
@@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
|
|
||||||
char_scripts = get_chars_by_script()
|
char_scripts = get_chars_by_script()
|
||||||
|
|
||||||
for idx in (cth, qs, osmn):
|
for idx in (cth, wof, osmn):
|
||||||
for i in xrange(idx.i):
|
for i in xrange(idx.i):
|
||||||
props = idx.get_properties(i)
|
props = idx.get_properties(i)
|
||||||
name = props.get('name')
|
name = props.get('name')
|
||||||
@@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
index.index_polygon(poly.context)
|
index.index_polygon(poly.context)
|
||||||
index.add_polygon(poly.context, props)
|
index.add_polygon(poly.context, props)
|
||||||
|
|
||||||
qs.matched = [False] * qs.i
|
wof.matched = [False] * wof.i
|
||||||
cth.matched = [False] * cth.i
|
cth.matched = [False] * cth.i
|
||||||
|
|
||||||
logger.info('Matching OSM points to neighborhood polygons')
|
logger.info('Matching OSM points to neighborhood polygons')
|
||||||
# Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
|
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
|
||||||
num_polys = 0
|
num_polys = 0
|
||||||
for element_id, attrs, deps in parse_osm(filename):
|
for element_id, attrs, deps in parse_osm(filename):
|
||||||
try:
|
try:
|
||||||
@@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
for name_key in OSM_NAME_TAGS:
|
for name_key in OSM_NAME_TAGS:
|
||||||
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
|
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
|
||||||
|
|
||||||
for idx in (cth, qs):
|
for idx in (cth, wof):
|
||||||
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
|
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
|
||||||
|
|
||||||
if candidates:
|
if candidates:
|
||||||
max_sim = 0.0
|
max_sim = 0.0
|
||||||
arg_max = None
|
arg_max = None
|
||||||
|
|
||||||
normalized_qs_names = {}
|
normalized_wof_names = {}
|
||||||
|
|
||||||
for osm_name in osm_names:
|
for osm_name in osm_names:
|
||||||
|
|
||||||
@@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
|
|
||||||
for i in candidates:
|
for i in candidates:
|
||||||
props = idx.get_properties(i)
|
props = idx.get_properties(i)
|
||||||
name = normalized_qs_names.get(i)
|
name = normalized_wof_names.get(i)
|
||||||
if not name:
|
if not name:
|
||||||
name = props.get('name')
|
name = props.get('name')
|
||||||
if not name:
|
if not name:
|
||||||
continue
|
continue
|
||||||
for pattern, repl in cls.regex_replacements:
|
for pattern, repl in cls.regex_replacements:
|
||||||
name = pattern.sub(repl, name)
|
name = pattern.sub(repl, name)
|
||||||
normalized_qs_names[i] = name
|
normalized_wof_names[i] = name
|
||||||
|
|
||||||
if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
|
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not contains_ideographs:
|
if not contains_ideographs:
|
||||||
@@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
continue
|
continue
|
||||||
source = 'osm_cth'
|
source = 'osm_cth'
|
||||||
else:
|
else:
|
||||||
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
|
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
|
||||||
|
|
||||||
source = 'osm_quattro'
|
source = 'osm_quattro'
|
||||||
if level == 'neighborhood':
|
if level == 'neighborhood':
|
||||||
@@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
if num_polys % 1000 == 0 and num_polys > 0:
|
if num_polys % 1000 == 0 and num_polys > 0:
|
||||||
logger.info('did {} neighborhoods'.format(num_polys))
|
logger.info('did {} neighborhoods'.format(num_polys))
|
||||||
|
|
||||||
for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
|
for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')):
|
||||||
for i in xrange(idx.i):
|
for i in xrange(idx.i):
|
||||||
props = idx.get_properties(i)
|
props = idx.get_properties(i)
|
||||||
poly = idx.get_polygon(i)
|
poly = idx.get_polygon(i)
|
||||||
@@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
props['polygon_type'] = 'local_admin'
|
props['polygon_type'] = 'local_admin'
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
|
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
|
||||||
component = AddressFormatter.SUBURB
|
component = AddressFormatter.SUBURB
|
||||||
name = props.get('name')
|
name = props.get('name')
|
||||||
if not name:
|
if not name:
|
||||||
@@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
|
|||||||
return sorted(candidates, key=self.priority)
|
return sorted(candidates, key=self.priority)
|
||||||
|
|
||||||
|
|
||||||
class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder):
|
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
    '''
    Geohash polygon index populated from WhosOnFirst (WOF) GeoJSON records.

    Only records accepted by is_valid_neighbourhood are indexed. Each indexed
    polygon keeps the record's original WOF properties and additionally gets
    the normalised keys "name", "name_en", "qs_level" and "gn_id".
    '''
    persistent_polygons = False
    cache_size = None

    # Property keys read from each WOF GeoJSON record
    NAME = "wof:name"
    ASCII_NAME = "gn:asciiname"
    LEVEL = "wof:placetype"
    GEONAMES_ID = "gn:geonameid"
    SUPERSEDED = "wof:superseded_by"

    # Place types and geometry types accepted by is_valid_neighbourhood
    NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
    POLYGON_TYPES = {"Polygon", "MultiPolygon"}

    @classmethod
    def is_valid_neighbourhood(cls, geojson):
        '''
        Return True if a parsed WOF GeoJSON record should be indexed.

        A record qualifies when it has no truthy "superseded by" value, has
        truthy name, ASCII name and GeoNames id properties, has a place type
        in NEIGHBOURHOOD_TYPES and a geometry type in POLYGON_TYPES.

        Raises KeyError if the record lacks "properties" or "geometry".
        '''
        props = geojson["properties"]

        if props.get(cls.SUPERSEDED):
            return False

        # Use truthiness rather than the bitwise "&=" accumulation: "&" raises
        # TypeError for str values and tests the low bit for int values.
        required_fields = (cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID)
        if not all(props.get(field) for field in required_fields):
            return False

        return (props.get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and
                geojson["geometry"]["type"] in cls.POLYGON_TYPES)

    @classmethod
    def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
        '''
        Build an index from every valid "*.geojson" file found under wof_dir.

        wof_dir is walked recursively. output_dir and index_filename are
        passed through to the index constructor. Returns the populated index.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename)

        for root, dirnames, filenames in os.walk(wof_dir):
            for fname in fnmatch.filter(filenames, "*.geojson"):
                with open(os.path.join(root, fname)) as f:
                    geojson = json.load(f)

                if not cls.is_valid_neighbourhood(geojson):
                    continue

                wof_props = geojson["properties"]

                # Keep the raw WOF properties (callers in this module look up
                # cls.LEVEL) and add the normalised keys (callers also look up
                # "name"). Previously the normalised dict was built and then
                # discarded.
                # NOTE(review): confirm the merged dict is the intended shape
                # of the stored properties.
                properties = dict(wof_props)
                properties.update({
                    "name": safe_decode(wof_props.get(cls.NAME)),
                    "name_en": safe_decode(wof_props.get(cls.ASCII_NAME)),
                    "qs_level": safe_decode(wof_props.get(cls.LEVEL)),
                    "gn_id": safe_decode(wof_props.get(cls.GEONAMES_ID)),
                })

                geometry = geojson["geometry"]
                poly_type = geometry["type"]

                if poly_type == 'Polygon':
                    # Only the first coordinate ring is used
                    # (presumably the outer boundary; verify against to_polygon).
                    poly = cls.to_polygon(geometry["coordinates"][0])
                    index.index_polygon(poly)
                    poly = index.simplify_polygon(poly)
                    # The undefined "include_props" argument was removed; the
                    # MultiPolygon branch never passed it either.
                    index.add_polygon(poly, properties)
                elif poly_type == 'MultiPolygon':
                    polys = []
                    for coords in geometry["coordinates"]:
                        poly = cls.to_polygon(coords[0])
                        polys.append(poly)
                        index.index_polygon(poly)

                    multi_poly = index.simplify_polygon(MultiPolygon(polys))
                    index.add_polygon(multi_poly, properties)

        return index
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Handle argument parsing here
|
# Handle argument parsing here
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
parser.add_argument('-q', '--quattroshapes-dir',
|
parser.add_argument('-w', '--wof-dir',
|
||||||
help='Path to quattroshapes dir')
|
help='Path to WhosOnFirst dir')
|
||||||
|
|
||||||
parser.add_argument('-a', '--osm-admin-rtree-dir',
|
parser.add_argument('-a', '--osm-admin-rtree-dir',
|
||||||
help='Path to OSM admin rtree dir')
|
help='Path to OSM admin rtree dir')
|
||||||
@@ -567,16 +607,16 @@ if __name__ == '__main__':
|
|||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
|
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
|
||||||
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
|
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
|
||||||
args.osm_neighborhoods_file,
|
args.osm_neighborhoods_file,
|
||||||
args.quattroshapes_dir,
|
args.wof_dir,
|
||||||
args.country_rtree_dir,
|
args.country_rtree_dir,
|
||||||
args.osm_admin_rtree_dir,
|
args.osm_admin_rtree_dir,
|
||||||
args.osm_neighborhood_borders_file,
|
args.osm_neighborhood_borders_file,
|
||||||
args.out_dir
|
args.out_dir
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
parser.error('Must specify quattroshapes dir or osm admin borders file')
|
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
|
||||||
|
|
||||||
index.save()
|
index.save()
|
||||||
|
|||||||
@@ -226,7 +226,6 @@ class PolygonIndex(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def create_from_geojson_files(cls, inputs, output_dir,
|
def create_from_geojson_files(cls, inputs, output_dir,
|
||||||
index_filename=None,
|
index_filename=None,
|
||||||
polys_filename=DEFAULT_POLYS_FILENAME,
|
|
||||||
include_only_properties=None):
|
include_only_properties=None):
|
||||||
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
|
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
|
||||||
for input_file in inputs:
|
for input_file in inputs:
|
||||||
|
|||||||
27
scripts/geodata/whosonfirst/download_wof_admin_polygon.py
Normal file
27
scripts/geodata/whosonfirst/download_wof_admin_polygon.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
import os
|
||||||
|
import pycountry
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
this_dir = os.path.realpath(os.path.dirname(__file__))
# Resolve the directory two levels above this file, independent of the
# current working directory (previously this_dir was computed but unused and
# the path was taken relative to wherever the script was launched from).
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))


# Per-country admin repositories live directly under the organisation, i.e.
# <URL prefix><repo prefix><lowercase two-letter country code>.
WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"
|
||||||
|
|
||||||
|
|
||||||
|
def download_wof_data_admin(wof_dir):
    '''
    Clone the WhosOnFirst admin data repository of every pycountry country
    into wof_dir, skipping repositories that already exist there.

    Cloning is best-effort: the exit status of "git clone" is not checked, so
    a country without a repository does not stop the loop.
    '''
    for country_object in pycountry.countries:
        repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
        repo_location = os.path.join(wof_dir, repo_name)
        if not os.path.exists(repo_location):
            # Pass the destination explicitly so the clone lands where the
            # existence check above looks, not in the current directory.
            subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name, repo_location])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Requires exactly one positional argument: the directory to clone into.
    if len(sys.argv) < 2:
        sys.exit('Usage: python download_wof_admin_polygon.py wof_dir')

    download_wof_data_admin(sys.argv[1])
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
requests==2.20.0
|
requests==2.32.2
|
||||||
six==1.10.0
|
six==1.10.0
|
||||||
PyYAML==5.4
|
PyYAML==5.4
|
||||||
ujson==1.33
|
ujson==5.4.0
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
boto3==1.4.0
|
boto3==1.4.0
|
||||||
botocore==1.4.53
|
botocore==1.4.53
|
||||||
Fiona==1.6.3.post1
|
Fiona==1.10.0
|
||||||
PyYAML==5.4
|
PyYAML==5.4
|
||||||
Rtree==0.8.2
|
Rtree==0.8.2
|
||||||
Shapely==1.5.14
|
Shapely==1.5.14
|
||||||
@@ -13,14 +13,14 @@ distribute==0.7.3
|
|||||||
future==0.15.2
|
future==0.15.2
|
||||||
futures==3.0.5
|
futures==3.0.5
|
||||||
ftfy==4.2.0
|
ftfy==4.2.0
|
||||||
gevent==1.1.2
|
gevent==23.9.0
|
||||||
greenlet==0.4.10
|
greenlet==0.4.10
|
||||||
jmespath==0.9.0
|
jmespath==0.9.0
|
||||||
leveldb==0.193
|
leveldb==0.193
|
||||||
lxml==4.6.3
|
lxml==4.9.1
|
||||||
lru-dict==1.1.3
|
lru-dict==1.1.3
|
||||||
marisa-trie==0.7.2
|
marisa-trie==0.7.2
|
||||||
numpy==1.10.4
|
numpy==1.22.0
|
||||||
pycountry==1.20
|
pycountry==1.20
|
||||||
git+https://github.com/kmike/pymorphy2
|
git+https://github.com/kmike/pymorphy2
|
||||||
pymorphy2-dicts-ru==2.4.394633.4298366
|
pymorphy2-dicts-ru==2.4.394633.4298366
|
||||||
@@ -29,9 +29,9 @@ pyproj==1.9.5.1
|
|||||||
pystache==0.5.4
|
pystache==0.5.4
|
||||||
python-Levenshtein==0.12.0
|
python-Levenshtein==0.12.0
|
||||||
python-geohash==0.8.5
|
python-geohash==0.8.5
|
||||||
requests==2.20.0
|
requests==2.32.2
|
||||||
s3transfer==0.1.3
|
s3transfer==0.1.3
|
||||||
six==1.10.0
|
six==1.10.0
|
||||||
ujson==1.35
|
ujson==5.4.0
|
||||||
urlnorm==1.1.3
|
urlnorm==1.1.3
|
||||||
wsgiref==0.1.2
|
wsgiref==0.1.2
|
||||||
|
|||||||
@@ -40,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (context->flag & CRF_CONTEXT_MARGINALS) {
|
if (context->flag & CRF_CONTEXT_MARGINALS) {
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
context->exp_state = double_matrix_new_aligned(T, L, 16);
|
context->exp_state = double_matrix_new_aligned(T, L, 32);
|
||||||
if (context->exp_state == NULL) goto exit_context_created;
|
if (context->exp_state == NULL) goto exit_context_created;
|
||||||
double_matrix_zero(context->exp_state);
|
double_matrix_zero(context->exp_state);
|
||||||
#else
|
#else
|
||||||
@@ -52,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
|
|||||||
context->mexp_state = double_matrix_new_zeros(T, L);
|
context->mexp_state = double_matrix_new_zeros(T, L);
|
||||||
if (context->mexp_state == NULL) goto exit_context_created;
|
if (context->mexp_state == NULL) goto exit_context_created;
|
||||||
|
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
|
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
|
||||||
if (context->exp_state_trans == NULL) goto exit_context_created;
|
if (context->exp_state_trans == NULL) goto exit_context_created;
|
||||||
double_matrix_zero(context->exp_state_trans);
|
double_matrix_zero(context->exp_state_trans);
|
||||||
#else
|
#else
|
||||||
@@ -64,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
|
|||||||
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
|
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
|
||||||
if (context->mexp_state_trans == NULL) goto exit_context_created;
|
if (context->mexp_state_trans == NULL) goto exit_context_created;
|
||||||
|
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
context->exp_trans = double_matrix_new_aligned(L, L, 16);
|
context->exp_trans = double_matrix_new_aligned(L, L, 32);
|
||||||
if (context->exp_trans == NULL) goto exit_context_created;
|
if (context->exp_trans == NULL) goto exit_context_created;
|
||||||
double_matrix_zero(context->exp_trans);
|
double_matrix_zero(context->exp_trans);
|
||||||
#else
|
#else
|
||||||
@@ -130,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {
|
|||||||
|
|
||||||
if (self->flag & CRF_CONTEXT_MARGINALS &&
|
if (self->flag & CRF_CONTEXT_MARGINALS &&
|
||||||
(
|
(
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
!double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
|
!double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
|
||||||
#else
|
#else
|
||||||
!double_matrix_resize(self->exp_state, T, L) ||
|
!double_matrix_resize(self->exp_state, T, L) ||
|
||||||
#endif
|
#endif
|
||||||
!double_matrix_resize(self->mexp_state, T, L) ||
|
!double_matrix_resize(self->mexp_state, T, L) ||
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
|
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
|
||||||
#else
|
#else
|
||||||
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
|
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
|
||||||
#endif
|
#endif
|
||||||
@@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (self->exp_state != NULL) {
|
if (self->exp_state != NULL) {
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
double_matrix_destroy_aligned(self->exp_state);
|
double_matrix_destroy_aligned(self->exp_state);
|
||||||
#else
|
#else
|
||||||
double_matrix_destroy(self->exp_state);
|
double_matrix_destroy(self->exp_state);
|
||||||
@@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (self->exp_state_trans != NULL) {
|
if (self->exp_state_trans != NULL) {
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
double_matrix_destroy_aligned(self->exp_state_trans);
|
double_matrix_destroy_aligned(self->exp_state_trans);
|
||||||
#else
|
#else
|
||||||
double_matrix_destroy(self->exp_state_trans);
|
double_matrix_destroy(self->exp_state_trans);
|
||||||
@@ -216,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (self->exp_trans != NULL) {
|
if (self->exp_trans != NULL) {
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
double_matrix_destroy_aligned(self->exp_trans);
|
double_matrix_destroy_aligned(self->exp_trans);
|
||||||
#else
|
#else
|
||||||
double_matrix_destroy(self->exp_trans);
|
double_matrix_destroy(self->exp_trans);
|
||||||
|
|||||||
12
src/expand.c
12
src/expand.c
@@ -15,6 +15,14 @@
|
|||||||
#include "token_types.h"
|
#include "token_types.h"
|
||||||
#include "transliterate.h"
|
#include "transliterate.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_CONFIG_H
|
||||||
|
#include <config.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef HAVE_STRNDUP
|
||||||
|
#include "strndup.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#define DEFAULT_KEY_LEN 32
|
#define DEFAULT_KEY_LEN 32
|
||||||
|
|
||||||
@@ -1567,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
|
|||||||
|
|
||||||
size_t len = strlen(input);
|
size_t len = strlen(input);
|
||||||
|
|
||||||
language_classifier_response_t *lang_response = NULL;
|
libpostal_language_classifier_response_t *lang_response = NULL;
|
||||||
|
|
||||||
if (options.num_languages == 0) {
|
if (options.num_languages == 0) {
|
||||||
lang_response = classify_languages(input);
|
lang_response = classify_languages(input);
|
||||||
@@ -1627,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
|
|||||||
kh_destroy(str_set, unique_strings);
|
kh_destroy(str_set, unique_strings);
|
||||||
|
|
||||||
if (lang_response != NULL) {
|
if (lang_response != NULL) {
|
||||||
language_classifier_response_destroy(lang_response);
|
libpostal_language_classifier_response_destroy(lang_response);
|
||||||
}
|
}
|
||||||
|
|
||||||
char_array_destroy(temp_string);
|
char_array_destroy(temp_string);
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline uint32_t file_deserialize_uint32(unsigned char *buf) {
|
inline uint32_t file_deserialize_uint32(unsigned char *buf) {
|
||||||
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
|
return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool file_read_uint32(FILE *file, uint32_t *value) {
|
bool file_read_uint32(FILE *file, uint32_t *value) {
|
||||||
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {
|
|||||||
|
|
||||||
|
|
||||||
inline uint16_t file_deserialize_uint16(unsigned char *buf) {
|
inline uint16_t file_deserialize_uint16(unsigned char *buf) {
|
||||||
return (buf[0] << 8) | buf[1];
|
return ((uint16_t)buf[0] << 8) | buf[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
|
|||||||
return language_classifier;
|
return language_classifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
void language_classifier_response_destroy(language_classifier_response_t *self) {
|
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
|
||||||
if (self == NULL) return;
|
if (self == NULL) return;
|
||||||
if (self->languages != NULL) {
|
if (self->languages != NULL) {
|
||||||
free(self->languages);
|
free(self->languages);
|
||||||
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
|
|||||||
free(self);
|
free(self);
|
||||||
}
|
}
|
||||||
|
|
||||||
language_classifier_response_t *classify_languages(char *address) {
|
libpostal_language_classifier_response_t *classify_languages(char *address) {
|
||||||
language_classifier_t *classifier = get_language_classifier();
|
language_classifier_t *classifier = get_language_classifier();
|
||||||
|
|
||||||
if (classifier == NULL) {
|
if (classifier == NULL) {
|
||||||
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
|
|||||||
size_t n = classifier->num_labels;
|
size_t n = classifier->num_labels;
|
||||||
double_matrix_t *p_y = double_matrix_new_zeros(1, n);
|
double_matrix_t *p_y = double_matrix_new_zeros(1, n);
|
||||||
|
|
||||||
language_classifier_response_t *response = NULL;
|
libpostal_language_classifier_response_t *response = NULL;
|
||||||
bool model_exp = false;
|
bool model_exp = false;
|
||||||
if (classifier->weights_type == MATRIX_DENSE) {
|
if (classifier->weights_type == MATRIX_DENSE) {
|
||||||
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
|
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
|
||||||
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {
|
|||||||
|
|
||||||
free(indices);
|
free(indices);
|
||||||
|
|
||||||
response = malloc(sizeof(language_classifier_response_t));
|
response = malloc(sizeof(libpostal_language_classifier_response_t));
|
||||||
response->num_languages = num_languages;
|
response->num_languages = num_languages;
|
||||||
response->languages = languages;
|
response->languages = languages;
|
||||||
response->probs = probs;
|
response->probs = probs;
|
||||||
|
|||||||
@@ -6,6 +6,8 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#include "libpostal.h"
|
||||||
|
|
||||||
#include "collections.h"
|
#include "collections.h"
|
||||||
#include "language_features.h"
|
#include "language_features.h"
|
||||||
#include "logistic_regression.h"
|
#include "logistic_regression.h"
|
||||||
@@ -29,21 +31,14 @@ typedef struct language_classifier {
|
|||||||
} weights;
|
} weights;
|
||||||
} language_classifier_t;
|
} language_classifier_t;
|
||||||
|
|
||||||
|
|
||||||
typedef struct language_classifier_response {
|
|
||||||
size_t num_languages;
|
|
||||||
char **languages;
|
|
||||||
double *probs;
|
|
||||||
} language_classifier_response_t;
|
|
||||||
|
|
||||||
// General usage
|
// General usage
|
||||||
|
|
||||||
language_classifier_t *language_classifier_new(void);
|
language_classifier_t *language_classifier_new(void);
|
||||||
language_classifier_t *get_language_classifier(void);
|
language_classifier_t *get_language_classifier(void);
|
||||||
language_classifier_t *get_language_classifier_country(void);
|
language_classifier_t *get_language_classifier_country(void);
|
||||||
|
|
||||||
language_classifier_response_t *classify_languages(char *address);
|
libpostal_language_classifier_response_t *classify_languages(char *address);
|
||||||
void language_classifier_response_destroy(language_classifier_response_t *self);
|
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
|
||||||
|
|
||||||
void language_classifier_destroy(language_classifier_t *self);
|
void language_classifier_destroy(language_classifier_t *self);
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
language_classifier_response_t *response = classify_languages(address);
|
libpostal_language_classifier_response_t *response = classify_languages(address);
|
||||||
if (response == NULL) {
|
if (response == NULL) {
|
||||||
printf("Could not classify language\n");
|
printf("Could not classify language\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ double test_accuracy(char *filename) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
language_classifier_response_t *response = classify_languages(address);
|
libpostal_language_classifier_response_t *response = classify_languages(address);
|
||||||
if (response == NULL || response->num_languages == 0) {
|
if (response == NULL || response->num_languages == 0) {
|
||||||
printf("%s\tNULL\t%s\n", language, address);
|
printf("%s\tNULL\t%s\n", language, address);
|
||||||
continue;
|
continue;
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels
|
|||||||
|
|
||||||
|
|
||||||
char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
|
char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
|
||||||
language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
|
libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
|
||||||
if (lang_response == NULL) {
|
if (lang_response == NULL) {
|
||||||
*num_languages = 0;
|
*num_languages = 0;
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -297,19 +297,21 @@ bool libpostal_setup_datadir(char *datadir) {
|
|||||||
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
|
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool setup_succeed = true;
|
||||||
|
|
||||||
if (!transliteration_module_setup(transliteration_path)) {
|
if (!transliteration_module_setup(transliteration_path)) {
|
||||||
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
|
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
|
||||||
return false;
|
setup_succeed = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!numex_module_setup(numex_path)) {
|
if (setup_succeed && !numex_module_setup(numex_path)) {
|
||||||
log_error("Error loading numex module, dir=%s\n", numex_path);
|
log_error("Error loading numex module, dir=%s\n", numex_path);
|
||||||
return false;
|
setup_succeed = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!address_dictionary_module_setup(address_dictionary_path)) {
|
if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
|
||||||
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
|
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
|
||||||
return false;
|
setup_succeed = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transliteration_path != NULL) {
|
if (transliteration_path != NULL) {
|
||||||
@@ -324,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
|
|||||||
free(address_dictionary_path);
|
free(address_dictionary_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return setup_succeed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool libpostal_setup(void) {
|
bool libpostal_setup(void) {
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
|||||||
|
|
||||||
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
|
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
|
||||||
|
|
||||||
if [ $DATAMODEL == "senzing" ]; then
|
if [ "$DATAMODEL" = "senzing" ]; then
|
||||||
LIBPOSTAL_DATA_FILE_CHUNKS=1
|
LIBPOSTAL_DATA_FILE_CHUNKS=1
|
||||||
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
|
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
|
||||||
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
|
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ typedef enum {
|
|||||||
matrix->m = m; \
|
matrix->m = m; \
|
||||||
matrix->n = n; \
|
matrix->n = n; \
|
||||||
\
|
\
|
||||||
matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \
|
matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
|
||||||
if (matrix->values == NULL) { \
|
if (matrix->values == NULL) { \
|
||||||
free(matrix); \
|
free(matrix); \
|
||||||
return NULL; \
|
return NULL; \
|
||||||
@@ -86,7 +86,7 @@ typedef enum {
|
|||||||
if (self == NULL) return; \
|
if (self == NULL) return; \
|
||||||
\
|
\
|
||||||
if (self->values != NULL) { \
|
if (self->values != NULL) { \
|
||||||
_aligned_free(self->values); \
|
aligned_free(self->values); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
free(self); \
|
free(self); \
|
||||||
@@ -118,7 +118,7 @@ typedef enum {
|
|||||||
if (self == NULL) return false; \
|
if (self == NULL) return false; \
|
||||||
\
|
\
|
||||||
if (m * n > (self->m * self->n)) { \
|
if (m * n > (self->m * self->n)) { \
|
||||||
type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \
|
type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
|
||||||
if (ptr == NULL) { \
|
if (ptr == NULL) { \
|
||||||
return false; \
|
return false; \
|
||||||
} \
|
} \
|
||||||
|
|||||||
@@ -670,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
|
|||||||
|
|
||||||
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
||||||
|
|
||||||
language_classifier_response_t *lang_response = NULL;
|
libpostal_language_classifier_response_t *lang_response = NULL;
|
||||||
|
|
||||||
if (num_languages == 0) {
|
if (num_languages == 0) {
|
||||||
lang_response = place_languages(num_components, labels, values);
|
lang_response = place_languages(num_components, labels, values);
|
||||||
|
|||||||
@@ -5,6 +5,15 @@
|
|||||||
|
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_CONFIG_H
|
||||||
|
#include <config.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef HAVE_STRNDUP
|
||||||
|
#include "strndup.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
|
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
|
||||||
|
|
||||||
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"
|
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
|
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
|
||||||
if (num_components == 0 || values == NULL || labels == NULL) return NULL;
|
if (num_components == 0 || values == NULL || labels == NULL) return NULL;
|
||||||
|
|
||||||
language_classifier_response_t *lang_response = NULL;
|
libpostal_language_classifier_response_t *lang_response = NULL;
|
||||||
|
|
||||||
char *label;
|
char *label;
|
||||||
char *value;
|
char *value;
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ typedef struct place {
|
|||||||
char *website;
|
char *website;
|
||||||
} place_t;
|
} place_t;
|
||||||
|
|
||||||
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
|
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
|
||||||
|
|
||||||
place_t *place_new(void);
|
place_t *place_new(void);
|
||||||
|
|
||||||
|
|||||||
@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
|
uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
|
||||||
khash_t(int_set) *unique_columns = kh_init(int_set);
|
khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
|
||||||
uint32_array *ret = uint32_array_new();
|
uint32_array *ret = uint32_array_new();
|
||||||
|
|
||||||
if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
|
if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
|
||||||
kh_destroy(int_set, unique_columns);
|
kh_destroy(int_uint32, unique_columns);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
kh_destroy(int_set, unique_columns);
|
kh_destroy(int_uint32, unique_columns);
|
||||||
uint32_array_destroy(ret);
|
uint32_array_destroy(ret);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|||||||
47
src/vector.h
47
src/vector.h
@@ -7,43 +7,44 @@
|
|||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
|
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
|
static inline void *aligned_malloc(size_t size, size_t alignment) {
|
||||||
|
return _aligned_malloc(size, alignment);
|
||||||
|
}
|
||||||
|
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
|
||||||
|
return _aligned_realloc(p, new_size, alignment);
|
||||||
|
}
|
||||||
|
static inline void aligned_free(void *p) {
|
||||||
|
_aligned_free(p);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
static inline void *_aligned_malloc(size_t size, size_t alignment)
|
static inline void *aligned_malloc(size_t size, size_t alignment)
|
||||||
{
|
{
|
||||||
void *p;
|
void *p;
|
||||||
int ret = posix_memalign(&p, alignment, size);
|
int ret = posix_memalign(&p, alignment, size);
|
||||||
return (ret == 0) ? p : NULL;
|
return (ret == 0) ? p : NULL;
|
||||||
}
|
}
|
||||||
static inline void *_aligned_realloc(void *p, size_t size, size_t alignment)
|
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
|
||||||
{
|
{
|
||||||
if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
|
if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (size == 0) {
|
if (p == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *rp = realloc(p, size);
|
void *p1 = aligned_malloc(new_size, alignment);
|
||||||
|
if (p1 == NULL) {
|
||||||
/* If realloc result is not already at an aligned boundary,
|
free(p);
|
||||||
_aligned_malloc a new block and copy the contents of the realloc'd
|
return NULL;
|
||||||
pointer to the aligned block, free the realloc'd pointer and return
|
|
||||||
the aligned pointer.
|
|
||||||
*/
|
|
||||||
if ( ((size_t)rp & (alignment - 1)) != 0) {
|
|
||||||
void *p1 = _aligned_malloc(size, alignment);
|
|
||||||
if (p1 != NULL) {
|
|
||||||
memcpy(p1, rp, size);
|
|
||||||
}
|
|
||||||
free(rp);
|
|
||||||
rp = p1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return rp;
|
memcpy(p1, p, old_size);
|
||||||
|
free(p);
|
||||||
|
return p1;
|
||||||
}
|
}
|
||||||
static inline void _aligned_free(void *p)
|
static inline void aligned_free(void *p)
|
||||||
{
|
{
|
||||||
free(p);
|
free(p);
|
||||||
}
|
}
|
||||||
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
|
|||||||
name *array = malloc(sizeof(name)); \
|
name *array = malloc(sizeof(name)); \
|
||||||
if (array == NULL) return NULL; \
|
if (array == NULL) return NULL; \
|
||||||
array->n = array->m = 0; \
|
array->n = array->m = 0; \
|
||||||
array->a = _aligned_malloc(size * sizeof(type), alignment); \
|
array->a = aligned_malloc(size * sizeof(type), alignment); \
|
||||||
if (array->a == NULL) return NULL; \
|
if (array->a == NULL) return NULL; \
|
||||||
array->m = size; \
|
array->m = size; \
|
||||||
return array; \
|
return array; \
|
||||||
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
|
|||||||
} \
|
} \
|
||||||
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
|
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
|
||||||
if (size <= array->m) return true; \
|
if (size <= array->m) return true; \
|
||||||
type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \
|
type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
|
||||||
if (ptr == NULL) return false; \
|
if (ptr == NULL) return false; \
|
||||||
array->a = ptr; \
|
array->a = ptr; \
|
||||||
array->m = size; \
|
array->m = size; \
|
||||||
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
|
|||||||
} \
|
} \
|
||||||
static inline void name##_destroy_aligned(name *array) { \
|
static inline void name##_destroy_aligned(name *array) { \
|
||||||
if (array == NULL) return; \
|
if (array == NULL) return; \
|
||||||
if (array->a != NULL) _aligned_free(array->a); \
|
if (array->a != NULL) aligned_free(array->a); \
|
||||||
free(array); \
|
free(array); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
|
|||||||
free_func(array->a[i]); \
|
free_func(array->a[i]); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
_aligned_free(array->a); \
|
aligned_free(array->a); \
|
||||||
free(array); \
|
free(array); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,10 +8,8 @@
|
|||||||
|
|
||||||
#define ks_lt_index(a, b) ((a).value < (b).value)
|
#define ks_lt_index(a, b) ((a).value < (b).value)
|
||||||
|
|
||||||
#if defined(INTEL_SSE)
|
#if defined(USE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#elif defined(ARM_NEON)
|
|
||||||
#include "sse2neon.h"
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -340,7 +338,7 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(INTEL_SSE) || defined(ARM_NEON)
|
#if defined(USE_SSE)
|
||||||
/*
|
/*
|
||||||
From https://github.com/herumi/fmath/blob/master/fastexp.cpp
|
From https://github.com/herumi/fmath/blob/master/fastexp.cpp
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ CFLAGS_O2 = $(CFLAGS_BASE) -O2
|
|||||||
CFLAGS_O3 = $(CFLAGS_BASE) -O3
|
CFLAGS_O3 = $(CFLAGS_BASE) -O3
|
||||||
DEFAULT_INCLUDES = -I.. -I/usr/local/include
|
DEFAULT_INCLUDES = -I.. -I/usr/local/include
|
||||||
|
|
||||||
CFLAGS = $(SIMDFLAGS) $(CFLAGS_BASE)
|
CFLAGS = $(CFLAGS_BASE)
|
||||||
|
|
||||||
TESTS = test_libpostal
|
TESTS = test_libpostal
|
||||||
noinst_PROGRAMS = test_libpostal
|
noinst_PROGRAMS = test_libpostal
|
||||||
|
|||||||
@@ -73,57 +73,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
|
|||||||
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Architecture-specific options
|
# Checks for SSE2 build
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
# allow enabling hardware optimization on any system:
|
|
||||||
case "$host_cpu" in
|
|
||||||
arm*|aarch64*)
|
|
||||||
enable_arm_neon=yes
|
|
||||||
enable_intel_sse=no
|
|
||||||
AC_DEFINE([ARM_NEON], [1],
|
|
||||||
[Enable ARM_NEON optimizations])
|
|
||||||
;;
|
|
||||||
i?86|x86_64)
|
|
||||||
enable_intel_sse=yes
|
|
||||||
enable_arm_neon=no
|
|
||||||
AC_DEFINE([INTEL_SSE], [1],
|
|
||||||
[Enable Intel SSE optimizations])
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
AC_ARG_ENABLE([neon],
|
|
||||||
AS_HELP_STRING([[[--disable-neon]]],
|
|
||||||
[Disable ARM NEON hardware optimizations]),
|
|
||||||
[
|
|
||||||
enable_arm_neon=no
|
|
||||||
AC_DEFINE([ARM_NEON], [0],
|
|
||||||
[Disable ARM_NEON optimizations])
|
|
||||||
])
|
|
||||||
|
|
||||||
AC_ARG_ENABLE([sse2],
|
AC_ARG_ENABLE([sse2],
|
||||||
AS_HELP_STRING([[[--disable-sse2]]],
|
AS_HELP_STRING(
|
||||||
[Disable Intel SSE2 hardware optimizations]),
|
[--disable-sse2],
|
||||||
[
|
[disable SSE2 optimization routines]
|
||||||
enable_intel_sse=no
|
)
|
||||||
AC_DEFINE([INTEL_SSE], [0],
|
)
|
||||||
[Disable INTEL_SSE optimizations])
|
|
||||||
])
|
|
||||||
|
|
||||||
SIMDFLAGS=""
|
AS_IF([test "x$enable_sse2" != "xno"], [
|
||||||
|
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
|
||||||
AS_IF([test "x$enable_intel_sse" != "xno"], [
|
|
||||||
SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
|
|
||||||
])
|
])
|
||||||
|
|
||||||
AS_IF([test "x$enable_arm_neon" != "xno"], [
|
|
||||||
SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
|
|
||||||
])
|
|
||||||
|
|
||||||
CFLAGS="${SIMDFLAGS} ${CFLAGS}"
|
|
||||||
|
|
||||||
AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])
|
|
||||||
|
|
||||||
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
AC_CHECK_HEADER(cblas.h, [AX_CBLAS])
|
||||||
|
|
||||||
AC_ARG_ENABLE([data-download],
|
AC_ARG_ENABLE([data-download],
|
||||||
|
|||||||
Reference in New Issue
Block a user