Merge branch 'master' into patch-1
36  .github/workflows/test.yml  (vendored, new file)
@@ -0,0 +1,36 @@
name: Test

on:
push:
branches: [master]
pull_request:
branches: [master]
workflow_dispatch:

jobs:
build_and_test:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Install Dependencies Linux
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt-get update -y
sudo apt-get install curl autoconf automake libtool pkg-config
- name: Install Dependencies MacOS
if: runner.os == 'macOS'
run: |
brew update
brew install curl autoconf automake libtool pkg-config
- name: Build
env:
LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data
run: |
./bootstrap.sh
./configure --datadir=$LIBPOSTAL_DATA_DIR
make
- name: Test
run: make check
83  .travis.yml  (deleted)
@@ -1,83 +0,0 @@
language: c
branches:
only:
- master
env:
global:
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
- secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
- secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
- secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
- GH_REF=github.com/openvenues/libpostal
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
- TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER
- SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz
- LIBPOSTAL_DATA_DIR=$(pwd)/data
- LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz
compiler:
- clang
- gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- pkg-config
before_script:
- ./bootstrap.sh
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
install:
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
script:
- ./configure --datadir=$LIBPOSTAL_DATA_DIR
- make -j4
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
- make check

after_success:
- |
if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then
if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
export PATH=$PATH:env/bin/;
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
cp src/*_data.c _travis/src
echo "$TAG_VERSION" > _travis/versions/base_data
cd _travis
git config user.name "$GIT_COMMITTER_NAME";
git config user.email "$GIT_COMMITTER_EMAIL";
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
git push --quiet origin master;

tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS
fi
git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER";
git push --tags --quiet origin master;
fi;

before_deploy:
- make dist

deploy:
- provider: releases
file:
- "$SRC_TARBALL_FILENAME"
on:
tags: true
branch: master
skip_cleanup: true
- provider: releases
file:
- "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME"
on:
tags: true
branch: master
condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )"
skip_cleanup: true
@@ -1,6 +1,6 @@
## Submitting Issues

When submitting issues to libpostal, please repeect these guildelines:
When submitting issues to libpostal, please respect these guidelines:

- Be constructive. Try to help solve the problem.
- Always search for existing issues before submitting one.

@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.

---
#### Here's how I'm using libpostal
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? -->
<!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->

---
#### Here's what I did

60  README.md
@@ -1,6 +1,6 @@
# libpostal: international street address NLP

[](https://travis-ci.org/openvenues/libpostal)
[](https://github.com/openvenues/libpostal/actions)
[](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[](#sponsors)
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:

**On Ubuntu/Debian**
```
sudo apt-get install curl autoconf automake libtool pkg-config
sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
```

**On CentOS/RHEL**
@@ -106,19 +106,46 @@ sudo apt-get install curl autoconf automake libtool pkg-config
sudo yum install curl autoconf automake libtool pkgconfig
```

**On Mac OSX**
**On macOS**

Install with one command via [MacPorts](https://www.macports.org/):
```
port install libpostal
```

Or as follows with [Homebrew](https://brew.sh/):

```
brew install curl autoconf automake libtool pkg-config
```

Then to install the C library:

If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.

```
git clone https://github.com/openvenues/libpostal
cd libpostal

# skip if installing for the first time
make distclean

./bootstrap.sh
./configure --datadir=[...some dir with a few GB of space...]

# omit --datadir flag to install data in current directory
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
make -j4

# For Intel/AMD processors and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]

# For Apple / ARM cpus and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] --disable-sse2

# For the improved Senzing model:
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] MODEL=senzing

make -j8
sudo make install

# On Linux it's probably a good idea to run
@@ -400,23 +427,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
- Rust: [rust-postal](https://github.com/pnordahl/rust-postal)
- Rust: [rustpostal](https://crates.io/crates/rustpostal)

**Database extensions**
**Unofficial database extensions**

- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)

**Unofficial REST API**
**Unofficial servers**

- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)

**Libpostal REST Docker**

- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)

**Libpostal ZeroMQ Docker**

- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
- Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)


Tests
@@ -491,7 +514,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent.
for a wide variety of countries and languages, not just US/English.
The model is trained on over 1 billion addresses and address-like strings, using the
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
are performed to make the training data resemble real messy geocoder input as closely as possible.

- **Language classification**: multinomial logistic regression
@@ -513,7 +536,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.

- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
languages chracter by character instead of on whitespace.
languages character by character instead of on whitespace.

- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
@@ -537,6 +560,7 @@ Non-goals

- Verifying that a location is a valid address
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
- Extracting addresses from free text

Raison d'être
-------------
@@ -642,7 +666,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy
- Efficient existing implementations for simple things like hashtables
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
- Data structrues take advantage of sparsity as much as possible
- Data structures take advantage of sparsity as much as possible
- Efficient double-array trie implementation for most string dictionaries
- Cross-platform as much as possible, particularly for *nix

53  configure.ac
@@ -84,57 +84,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])

# ------------------------------------------------------------------
# Architecture-specific options
# Checks for SSE2 build
# ------------------------------------------------------------------

# allow enabling hardware optimization on any system:
case "$host_cpu" in
arm*|aarch64*)
enable_arm_neon=yes
enable_intel_sse=no
AC_DEFINE([ARM_NEON], [1],
[Enable ARM_NEON optimizations])
;;
i?86|x86_64)
enable_intel_sse=yes
enable_arm_neon=no
AC_DEFINE([INTEL_SSE], [1],
[Enable Intel SSE optimizations])
;;
esac

AC_ARG_ENABLE([neon],
AS_HELP_STRING([[[--disable-neon]]],
[Disable ARM NEON hardware optimizations]),
[
enable_arm_neon=no
AC_DEFINE([ARM_NEON], [0],
[Disable ARM_NEON optimizations])
])

AC_ARG_ENABLE([sse2],
AS_HELP_STRING([[[--disable-sse2]]],
[Disable Intel SSE2 hardware optimizations]),
[
enable_intel_sse=no
AC_DEFINE([INTEL_SSE], [0],
[Disable INTEL_SSE optimizations])
])
AS_HELP_STRING(
[--disable-sse2],
[disable SSE2 optimization routines]
)
)

SIMDFLAGS=""

AS_IF([test "x$enable_intel_sse" != "xno"], [
SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])

AS_IF([test "x$enable_arm_neon" != "xno"], [
SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
])

CFLAGS="${SIMDFLAGS} ${CFLAGS}"

AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])

AC_CHECK_HEADER(cblas.h, [AX_CBLAS])

AC_ARG_ENABLE([data-download],

@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
[], [-lblas])])
fi

# BLAS in OpenBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
fi

# Generic CBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
fi

# Generic BLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
fi

AC_SUBST(CBLAS_LIBS)

LIBS="$ax_cblas_save_LIBS"

@@ -63,10 +63,23 @@ numbers:


house_numbers:
gebaude: &gebaude
canonical: gebäude
abbreviated: geb
sample: true
canonical_probability: 0.5
abbreviated_probability: 0.5
sample_probability: 0.05
numeric:
direction: left
alphanumeric:
default: *nummer
probability: 0.95
alternatives:
- alternative: *gebaude
probability: 0.05

alphanumeric_phrase_probability: 0.0001
alphanumeric_phrase_probability: 0.05

conscription_numbers:
alphanumeric:

@@ -49,7 +49,7 @@ numbers:


house_numbers:
budnyok: &budnyok
budynok: &budynok
canonical: будинок
abbreviated: буд
sample: true
@@ -58,8 +58,8 @@ house_numbers:
sample_probability: 0.1
numeric:
direction: left
budnyok_latin: &budnyok_latin
canonical: budnyok
budynok_latin: &budynok_latin
canonical: budynok
abbreviated: bud
sample: true
canonical_probability: 0.6
@@ -88,10 +88,10 @@ house_numbers:
direction: left

alphanumeric:
default: *budnyok
default: *budynok
probability: 0.65
alternatives:
- alternative: *budnyok_latin
- alternative: *budynok_latin
probability: 0.05
- alternative: *dom
probability: 0.25

@@ -11,6 +11,7 @@

overrides:
id:
relation:
# Buenos Aires (state boundary coterminous with city)
"3082668": null
contained_by:

@@ -132,6 +132,7 @@ falls|fls
fare
farm|frm
farms|frms
farm to market|fm|farm-to-market
fern
ferry|fry|fy
field|fld|fd

5  resources/dictionaries/hi/building_types.txt  (new file)
@@ -0,0 +1,5 @@
mandir|मन्दिर|मंदिर
station
police station
post office
office
4  resources/dictionaries/hi/directionals.txt  (new file)
@@ -0,0 +1,4 @@
dakshin|दक्षिण
uttar|उत्तर
poorva|poorav|पूर्व
paschim|पश्चिम
1  resources/dictionaries/hi/people.txt  (new file)
@@ -0,0 +1 @@
mahatma gandhi|mg|m g
24  resources/dictionaries/hi/personal_titles.txt  (new file)
@@ -0,0 +1,24 @@
baba
babu
bhagat
guru
jagirdar
maharaja|maharaj
mahatma|महात्मा
pandit
raja
rajarshi
rajkumar
rajkumari
rani
rishi
sahib
sant
sardar
senapati
shah
shrimati|smt|srimathi|श्रीमती
shri|shree|sri|श्री
sushri
swami
ustad
3  resources/dictionaries/hi/qualifiers.txt  (new file)
@@ -0,0 +1,3 @@
nagar|नगर
colony
cantonment|cantt
@@ -1,3 +1,4 @@
bazaar|bazar
marg
nagar
flyover
रोड
@@ -1,2 +1,2 @@
félemelet|felemelet
magasföldszint|magasfoldszint
félemelet|felemelet|félem|1/2 em|1/2em
magasföldszint|magasfoldszint|mgfszt|mgfsz|mfszt|mfsz
@@ -1 +1,2 @@
szent|szt
idősebb|id

@@ -1,21 +1,34 @@
árok|arok
dűlő|dulo
dűlő|dulo|d.|d
fasor
fasora
főközlekedési út|főút|fout
határút|hatarut
kapu
kert
körönd|korond|krnd
körvasútsor|korvasutsor
körút|korut|krt
köz|koz
lakótelep|lakotelep|ltp.|ltp
lejtő|lejto
lépcső|lepcso
liget
mező|mezo
országút|orszagut
park
rakpart|rpt
sétány|setany
sor
sugárút|sugarut
parkja
rakpart|rkpt|rkp|rpt
sétány|setany|stny.|stny
sor|s.|s
sétány|setany|sét
sugárút|sugarut|sgrt.|sgrt|srt.|srt|sgt.|sgt
sziget
telep
tér|ter
tere
utca|u
út|ut
tanya|t.|t
udvar
utca|u.|u
út|ut|u.|u
útja|utja
3  resources/dictionaries/ms/toponyms.txt  (new file)
@@ -0,0 +1,3 @@
kuala lumpur|federal territory kuala lumpur|federal territory of kuala lumpur|wilayah persekutuan kuala lumpur|kl
labuan|federal territory labuan|federal territory of labuan|wilayah persekutuan labuan
putrajaya|federal territory putrajaya|federal territory of putrajaya|wilayah persekutuan putrajaya
@@ -1,12 +1,18 @@
aleja|al
autostrada
boczna
bulwar
bulwar|bulw
droga
obwodnica
ogród
osiedle|os
park
plac|pl
rondo
rynek
skwer
szosa
ulica|ul
wybrzeże|wyb
wyspa
zaulek
@@ -10,10 +10,10 @@ calçada|calcada|cc
calçadinha|caclcadinha|ccnh
câmara municipal|camara municipal|cm|c.m.|c. m.
caminho|cam|camno
direito|dto
direito|dto|dt
esquerdo|esq
estrada|estr
astrada marginal|estr marg
estrada marginal|estr marg
estrada municipal|em|e m|estr m
estrada nacional|en|e n|estr n
estrada regional|er|e r|estr r

6  resources/dictionaries/ro/building_types.txt  (new file)
@@ -0,0 +1,6 @@
anexa
bloc|blc|bl
casa
cladirea|cladire
complex
garaj
5  resources/dictionaries/ro/company_types.txt  (new file)
@@ -0,0 +1,5 @@
banca
organizatie neguvernamentala|ong
societate comerciala|sc
societate cu raspundere limitata|srl
societate pe actiuni|sa
@@ -1,5 +1,5 @@
&
colț|colt
colț|colt|colț cu|colt cu
între|intre
la colțul de pe|la coltul de pe
și|si
@@ -1 +1 @@
intrare
intrare|intrarea

@@ -4,4 +4,4 @@ din
in apropiere de
în apropiere|in apropiere
în jurul aici|in jurul aici
lângă mine|langa mine
lângă mine|langa mine|lângă|langa

@@ -1 +1 @@
număr|numar|nr|nº|n°|#|№|no
număr|numar|nr|nº|n°|#|№|no|numarul|numărul

@@ -8,6 +8,7 @@ general|gen
major|maj
locotenent
locotenent colonel
pictor
profesor|prof
sergent
sublocotenent

3  resources/dictionaries/ro/place_names.txt  (new file)
@@ -0,0 +1,3 @@
cinema
cafenea
fabrica
@@ -1 +1,7 @@
bloc|bl
cartier|cartierul
comuna|comunā
kilometrul|kilometru|km
sat|satul
sector|sectorul|sect
zona

@@ -1,2 +1,3 @@
și|si|&
cel
intre

@@ -1,12 +1,12 @@
aleea|ale|alea|al
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
calea|cal
drumul
calea|cale|cal
drumul|drum
fundătura|fundatura|fnd
fundacul|fdc
intrarea|int|intr
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
strada|str
strada|str|st
stradela|str-la|sdla
șoseaua|soseaua|sos|șos
splaiul|sp|spl

1  resources/dictionaries/ro/synonyms.txt  (new file)
@@ -0,0 +1 @@
decembrie|dec
@@ -1,4 +1,8 @@
apartament|ap|apt|apart
apartamentul|apartament|ap|apt|apart
birou
cladire|cladirea|clădire|clădirea
corp|corpul
complex
interior|int
lotul
sală|sala
@@ -1,2 +1,4 @@
вход
vkhod
подъезд
pod'ezd

@@ -6,3 +6,5 @@ kvartal|kvart|kv|kv-l
oblast|obl
район|р-н
raion|r-n
місто|міс|м
misto|mis|m

93  resources/states/my.yaml  (new file)
@@ -0,0 +1,93 @@
"KL":
en: Kuala Lumpur
ms: Kuala Lumpur
"federal territory kuala lumpur":
en: Kuala Lumpur
ms: Kuala Lumpur
"federal territory of kuala lumpur":
en: Kuala Lumpur
ms: Kuala Lumpur
"wilayah persekutuan kuala lumpur":
en: Kuala Lumpur
ms: Kuala Lumpur
"federal territory labuan":
en: Labuan
ms: Labuan
"federal territory of labuan":
en: Labuan
ms: Labuan
"wilayah persekutuan labuan":
en: Labuan
ms: Labuan
"federal territory putrajaya":
en: Putrajaya
ms: Putrajaya
"federal territory of putrajaya":
en: Putrajaya
ms: Putrajaya
"wilayah persekutuan putrajaya":
en: Putrajaya
ms: Putrajaya
"pulau pinang":
en: Penang
ms: Pulau Pinang
"penang":
en: Penang
ms: Pulau Pinang
JHR:
en: Johor
ms: Johor
KDH:
en: Kedah
ms: Kedah
KTN:
en: Kelantan
ms: Kelantan
MLK:
en: Melaka
ms: Melaka
NSN:
en: Negeri Sembilan
ms: Negeri Sembilan
PHG:
en: Pahang
ms: Pahang
PRK:
en: Perak
ms: Perak
PLS:
en: Perlis
ms: Perlis
PNG:
en: Penang
ms: Pulau Pinang
SBH:
en: Sabah
ms: Sabah
SWK:
en: Sarawak
ms: Sarawak
SGR:
en: Selangor
ms: Selangor
TRG:
en: Terengganu
ms: Terengganu
KUL:
en: Kuala Lumpur
ms: Kuala Lumpur
LBN:
en: Labuan
ms: Labuan
PJY:
en: Putrajaya
ms: Putrajaya
KL:
en: Kuala Lumpur
ms: Kuala Lumpur
LB:
en: Labuan
ms: Labuan
PY:
en: Putrajaya
ms: Putrajaya
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import logging
import operator
import os
@@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex


@@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each has
source has least a "name" key which in practice is what we care about.

Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
'''

PRIORITIES_FILENAME = 'priorities.json'
@@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than Quattroshapes
'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon
'quattroshapes': 4, # Good results in some countries/areas
'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'wof': 4, # Replacement of Quattroshapes
}

level_priorities = {
@@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
}

regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
]
@@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc

@classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
@@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):

logger = logging.getLogger('neighborhoods')

qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir)

logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()

logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)

logger.info('Creating Quattroshapes neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
logger.info('Creating WhosOnFirst neighborhoods')
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))

country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

@@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):

char_scripts = get_chars_by_script()

for idx in (cth, qs, osmn):
for idx in (cth, wof, osmn):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
@@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)

qs.matched = [False] * qs.i
wof.matched = [False] * wof.i
cth.matched = [False] * cth.i

logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0
for element_id, attrs, deps in parse_osm(filename):
try:
@@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])

for idx in (cth, qs):
for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)

if candidates:
max_sim = 0.0
arg_max = None

normalized_qs_names = {}
normalized_wof_names = {}

for osm_name in osm_names:

@@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):

for i in candidates:
props = idx.get_properties(i)
name = normalized_qs_names.get(i)
name = normalized_wof_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_qs_names[i] = name
normalized_wof_names[i] = name

if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue

if not contains_ideographs:
@@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
continue
source = 'osm_cth'
else:
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)

source = 'osm_quattro'
if level == 'neighborhood':
@@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))

for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
@@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['polygon_type'] = 'local_admin'
else:
continue
elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB
name = props.get('name')
if not name:
@@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return sorted(candidates, key=self.priority)


class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder):
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = None

NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"

NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}

@classmethod
def create_neighborhoods_index(cls, quattroshapes_dir,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME)
neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename],
output_dir, index_filename=index_filename,
polys_filename=polys_filename)
def is_valid_neighbourhood(cls, geojson):
validity = not geojson["properties"].get(cls.SUPERSEDED)
for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
validity &= geojson["properties"].get(field)
return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES

@classmethod
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
index = cls(save_dir=output_dir, index_filename=index_filename)

for root, dirnames, filenames in os.walk(wof_dir):
for fname in fnmatch.filter(filenames, "*.geojson"):
with open(os.path.join(root, fname)) as f:
geojson = json.load(f)
if cls.is_valid_neighbourhood(geojson):
properties = {
"name": safe_decode(geojson["properties"].get(cls.NAME)),
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
}

poly_type = geojson['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(geojson['properties']), include_only_properties=include_props)
elif poly_type == 'MultiPolygon':
polys = []
for coords in geojson['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)

multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(geojson['properties']))

return index


if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()

parser.add_argument('-q', '--quattroshapes-dir',
help='Path to quattroshapes dir')
parser.add_argument('-w', '--wof-dir',
help='Path to WhosOnFirst dir')

parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
@@ -567,16 +607,16 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)

args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file,
args.quattroshapes_dir,
args.wof_dir,
args.country_rtree_dir,
args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir
)
else:
parser.error('Must specify quattroshapes dir or osm admin borders file')
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')

index.save()

@@ -226,7 +226,6 @@ class PolygonIndex(object):
@classmethod
def create_from_geojson_files(cls, inputs, output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
include_only_properties=None):
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
for input_file in inputs:

27  scripts/geodata/whosonfirst/download_wof_admin_polygon.py  (new file)
@@ -0,0 +1,27 @@
import os
import pycountry
import subprocess
import sys


this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))


WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"


def download_wof_data_admin(wof_dir):
for country_object in pycountry.countries:
repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
repo_location = os.path.join(wof_dir, repo_name)
if not os.path.exists(repo_location):
subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name])


if __name__ == '__main__':
if len(sys.argv) < 2:
sys.exit('Usage: python download_whosonfirst_data.py wof_dir')

download_wof_data_admin(sys.argv[1])
@@ -1,4 +1,4 @@
requests==2.20.0
requests==2.32.2
six==1.10.0
PyYAML==5.4
ujson==1.33
ujson==5.4.0
@@ -1,6 +1,6 @@
boto3==1.4.0
botocore==1.4.53
Fiona==1.6.3.post1
Fiona==1.10.0
PyYAML==5.4
Rtree==0.8.2
Shapely==1.5.14
@@ -13,14 +13,14 @@ distribute==0.7.3
future==0.15.2
futures==3.0.5
ftfy==4.2.0
gevent==1.1.2
gevent==23.9.0
greenlet==0.4.10
jmespath==0.9.0
leveldb==0.193
lxml==4.6.3
lxml==4.9.1
lru-dict==1.1.3
marisa-trie==0.7.2
numpy==1.10.4
numpy==1.22.0
pycountry==1.20
git+https://github.com/kmike/pymorphy2
pymorphy2-dicts-ru==2.4.394633.4298366
@@ -29,9 +29,9 @@ pyproj==1.9.5.1
pystache==0.5.4
python-Levenshtein==0.12.0
python-geohash==0.8.5
requests==2.20.0
requests==2.32.2
s3transfer==0.1.3
six==1.10.0
ujson==1.35
ujson==5.4.0
urlnorm==1.1.3
wsgiref==0.1.2

@@ -40,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
}

if (context->flag & CRF_CONTEXT_MARGINALS) {
#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_state = double_matrix_new_aligned(T, L, 16);
#if defined(USE_SSE)
context->exp_state = double_matrix_new_aligned(T, L, 32);
if (context->exp_state == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state);
#else
@@ -52,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state = double_matrix_new_zeros(T, L);
if (context->mexp_state == NULL) goto exit_context_created;

#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
#if defined(USE_SSE)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
if (context->exp_state_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state_trans);
#else
@@ -64,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
if (context->mexp_state_trans == NULL) goto exit_context_created;

#if defined(INTEL_SSE) || defined(ARM_NEON)
context->exp_trans = double_matrix_new_aligned(L, L, 16);
#if defined(USE_SSE)
context->exp_trans = double_matrix_new_aligned(L, L, 32);
if (context->exp_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_trans);
#else
@@ -130,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {

if (self->flag & CRF_CONTEXT_MARGINALS &&
(
#if defined(INTEL_SSE) || defined(ARM_NEON)
!double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
#else
!double_matrix_resize(self->exp_state, T, L) ||
#endif
!double_matrix_resize(self->mexp_state, T, L) ||
#if defined(INTEL_SSE) || defined(ARM_NEON)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
#else
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
#endif
@@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
}

if (self->exp_state != NULL) {
#if defined(INTEL_SSE) || defined(ARM_NEON)
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state);
#else
double_matrix_destroy(self->exp_state);
@@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
}

if (self->exp_state_trans != NULL) {
#if defined(INTEL_SSE) || defined(ARM_NEON)
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state_trans);
#else
double_matrix_destroy(self->exp_state_trans);
@@ -216,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
}

if (self->exp_trans != NULL) {
#if defined(INTEL_SSE) || defined(ARM_NEON)
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_trans);
#else
double_matrix_destroy(self->exp_trans);

12  src/expand.c
@@ -15,6 +15,14 @@
#include "token_types.h"
#include "transliterate.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif


#define DEFAULT_KEY_LEN 32

@@ -1567,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt

size_t len = strlen(input);

language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;

if (options.num_languages == 0) {
lang_response = classify_languages(input);
@@ -1627,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
kh_destroy(str_set, unique_strings);

if (lang_response != NULL) {
language_classifier_response_destroy(lang_response);
libpostal_language_classifier_response_destroy(lang_response);
}

char_array_destroy(temp_string);

@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
}

inline uint32_t file_deserialize_uint32(unsigned char *buf) {
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
}

bool file_read_uint32(FILE *file, uint32_t *value) {
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {


inline uint16_t file_deserialize_uint16(unsigned char *buf) {
return (buf[0] << 8) | buf[1];
return ((uint16_t)buf[0] << 8) | buf[1];
}

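The casts added to `file_deserialize_uint32` above are not cosmetic: `buf[0]` is an `unsigned char` that is promoted to (signed) `int` before the shift, so a leading byte of 0x80 or higher shifted left by 24 overflows into the sign bit, which is undefined behavior. A minimal standalone sketch of the same big-endian decode (the `decode_be32` helper is illustrative, not part of libpostal):

```
#include <stdint.h>
#include <stdio.h>

/* Each byte is widened to uint32_t so the shift happens in an unsigned
   32-bit type rather than in (signed) int, mirroring the fix above. */
static uint32_t decode_be32(const unsigned char *buf) {
    return ((uint32_t)buf[0] << 24) |
           ((uint32_t)buf[1] << 16) |
           ((uint32_t)buf[2] << 8)  |
           (uint32_t)buf[3];
}

int main(void) {
    unsigned char buf[4] = {0xBB, 0xBB, 0xBB, 0xBB};
    /* Prints bbbbbbbb; with plain int shifts, 0xBB << 24 would overflow int. */
    printf("%08x\n", decode_be32(buf));
    return 0;
}
```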
@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
return language_classifier;
}

void language_classifier_response_destroy(language_classifier_response_t *self) {
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
if (self == NULL) return;
if (self->languages != NULL) {
free(self->languages);
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
free(self);
}

language_classifier_response_t *classify_languages(char *address) {
libpostal_language_classifier_response_t *classify_languages(char *address) {
language_classifier_t *classifier = get_language_classifier();

if (classifier == NULL) {
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
size_t n = classifier->num_labels;
double_matrix_t *p_y = double_matrix_new_zeros(1, n);

language_classifier_response_t *response = NULL;
libpostal_language_classifier_response_t *response = NULL;
bool model_exp = false;
if (classifier->weights_type == MATRIX_DENSE) {
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {

free(indices);

response = malloc(sizeof(language_classifier_response_t));
response = malloc(sizeof(libpostal_language_classifier_response_t));
response->num_languages = num_languages;
response->languages = languages;
response->probs = probs;

@@ -6,6 +6,8 @@
#include <stdint.h>
#include <stdbool.h>

#include "libpostal.h"

#include "collections.h"
#include "language_features.h"
#include "logistic_regression.h"
@@ -29,21 +31,14 @@ typedef struct language_classifier {
} weights;
} language_classifier_t;


typedef struct language_classifier_response {
size_t num_languages;
char **languages;
double *probs;
} language_classifier_response_t;

// General usage

language_classifier_t *language_classifier_new(void);
language_classifier_t *get_language_classifier(void);
language_classifier_t *get_language_classifier_country(void);

language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(language_classifier_response_t *self);
libpostal_language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);

void language_classifier_destroy(language_classifier_t *self);

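For context, a minimal caller sketch against the header above (assuming it is `src/language_classifier.h` and that the classifier model has already been loaded through the usual setup calls; error handling trimmed):

```
#include <stdio.h>
#include "language_classifier.h"

/* Classify one address string, print the candidate languages with their
   probabilities, then free the response with the destroy function above. */
void print_languages(char *address) {
    libpostal_language_classifier_response_t *response = classify_languages(address);
    if (response == NULL) return;

    for (size_t i = 0; i < response->num_languages; i++) {
        printf("%s\t%f\n", response->languages[i], response->probs[i]);
    }

    language_classifier_response_destroy(response);
}
```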
@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
}


language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL) {
printf("Could not classify language\n");
exit(EXIT_FAILURE);

@@ -34,7 +34,7 @@ double test_accuracy(char *filename) {
continue;
}

language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL || response->num_languages == 0) {
printf("%s\tNULL\t%s\n", language, address);
continue;

@@ -119,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels


char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
if (lang_response == NULL) {
*num_languages = 0;
return NULL;
@@ -297,19 +297,21 @@ bool libpostal_setup_datadir(char *datadir) {
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
}

bool setup_succeed = true;

if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false;
setup_succeed = false;
}

if (!numex_module_setup(numex_path)) {
if (setup_succeed && !numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path);
return false;
setup_succeed = false;
}

if (!address_dictionary_module_setup(address_dictionary_path)) {
if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false;
setup_succeed = false;
}

if (transliteration_path != NULL) {
@@ -324,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
free(address_dictionary_path);
}

return true;
return setup_succeed;
}

bool libpostal_setup(void) {

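With this change `libpostal_setup_datadir()` cleans up its paths and reports the combined result instead of bailing out after the first failed module, so callers should still treat a `false` return as fatal. A minimal usage sketch against the public API (the data directory path is a placeholder):

```
#include <stdio.h>
#include <stdlib.h>
#include <libpostal/libpostal.h>

int main(void) {
    /* Placeholder: point this at the directory passed to --datadir. */
    char *datadir = "/path/to/libpostal/data";

    if (!libpostal_setup_datadir(datadir)) {
        fprintf(stderr, "libpostal setup failed for datadir=%s\n", datadir);
        return EXIT_FAILURE;
    }

    /* ... expand or parse addresses here ... */

    libpostal_teardown();
    return EXIT_SUCCESS;
}
```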
@@ -36,7 +36,7 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"

if [ $DATAMODEL == "senzing" ]; then
if [ "$DATAMODEL" = "senzing" ]; then
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

@@ -62,7 +62,7 @@ typedef enum {
matrix->m = m; \
matrix->n = n; \
\
matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \
matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
if (matrix->values == NULL) { \
free(matrix); \
return NULL; \
@@ -86,7 +86,7 @@ typedef enum {
if (self == NULL) return; \
\
if (self->values != NULL) { \
_aligned_free(self->values); \
aligned_free(self->values); \
} \
\
free(self); \
@@ -118,7 +118,7 @@ typedef enum {
if (self == NULL) return false; \
\
if (m * n > (self->m * self->n)) { \
type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \
type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
if (ptr == NULL) { \
return false; \
} \

@@ -670,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,

libpostal_normalize_options_t normalize_options = libpostal_get_default_options();

language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;

if (num_languages == 0) {
lang_response = place_languages(num_components, labels, values);

@@ -5,6 +5,15 @@

#include "log/log.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif


#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB

#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"

@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
);
}

language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
if (num_components == 0 || values == NULL || labels == NULL) return NULL;

language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;

char *label;
char *value;

@@ -32,7 +32,7 @@ typedef struct place {
char *website;
} place_t;

language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);

place_t *place_new(void);


@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
}

uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
khash_t(int_set) *unique_columns = kh_init(int_set);
khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
uint32_array *ret = uint32_array_new();

if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
return ret;
}

kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
uint32_array_destroy(ret);
return NULL;
}

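The last hunk above swaps a khash set (`int_set`) for an int-to-uint32 map (`int_uint32`), so the code can record a value per column id rather than mere membership. A small self-contained klib/khash sketch of the map flavour (the type name `col_map` and the include path are illustrative, not libpostal's):

```
#include <stdint.h>
#include <stdio.h>
#include "khash.h"   /* klib header; vendored in libpostal under src/klib */

/* Illustrative khash map from int keys to uint32_t values. */
KHASH_MAP_INIT_INT(col_map, uint32_t)

int main(void) {
    int absent;
    khash_t(col_map) *h = kh_init(col_map);

    /* Insert column id 42 with value 7; kh_put reports whether the key was new. */
    khiter_t k = kh_put(col_map, h, 42, &absent);
    kh_value(h, k) = 7;

    k = kh_get(col_map, h, 42);
    if (k != kh_end(h)) {
        printf("42 -> %u\n", kh_value(h, k));
    }

    kh_destroy(col_map, h);
    return 0;
}
```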
47  src/vector.h
@@ -7,43 +7,44 @@

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#include <malloc.h>
static inline void *aligned_malloc(size_t size, size_t alignment) {
return _aligned_malloc(size, alignment);
}
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
return _aligned_realloc(p, new_size, alignment);
}
static inline void aligned_free(void *p) {
_aligned_free(p);
}
#else
#include <stdlib.h>
static inline void *_aligned_malloc(size_t size, size_t alignment)
static inline void *aligned_malloc(size_t size, size_t alignment)
{
void *p;
int ret = posix_memalign(&p, alignment, size);
return (ret == 0) ? p : NULL;
}
static inline void *_aligned_realloc(void *p, size_t size, size_t alignment)
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
{
if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
return NULL;
}

if (size == 0) {
if (p == NULL) {
return NULL;
}

void *rp = realloc(p, size);

/* If realloc result is not already at an aligned boundary,
_aligned_malloc a new block and copy the contents of the realloc'd
pointer to the aligned block, free the realloc'd pointer and return
the aligned pointer.
*/
if ( ((size_t)rp & (alignment - 1)) != 0) {
void *p1 = _aligned_malloc(size, alignment);
if (p1 != NULL) {
memcpy(p1, rp, size);
}
free(rp);
rp = p1;
void *p1 = aligned_malloc(new_size, alignment);
if (p1 == NULL) {
free(p);
return NULL;
}

return rp;
memcpy(p1, p, old_size);
free(p);
return p1;
}
static inline void _aligned_free(void *p)
static inline void aligned_free(void *p)
{
free(p);
}
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
name *array = malloc(sizeof(name)); \
if (array == NULL) return NULL; \
array->n = array->m = 0; \
array->a = _aligned_malloc(size * sizeof(type), alignment); \
array->a = aligned_malloc(size * sizeof(type), alignment); \
if (array->a == NULL) return NULL; \
array->m = size; \
return array; \
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
} \
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
if (size <= array->m) return true; \
type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \
type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
if (ptr == NULL) return false; \
array->a = ptr; \
array->m = size; \
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
} \
static inline void name##_destroy_aligned(name *array) { \
if (array == NULL) return; \
if (array->a != NULL) _aligned_free(array->a); \
if (array->a != NULL) aligned_free(array->a); \
free(array); \
}

@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
free_func(array->a[i]); \
} \
} \
_aligned_free(array->a); \
aligned_free(array->a); \
free(array); \
}

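On the portable (non-MSVC) branch above, `aligned_resize` cannot rely on `realloc` preserving alignment, so it allocates a fresh aligned block, copies `old_size` bytes, and frees the original, including when the new allocation fails. A small usage sketch under those assumptions (not libpostal code; assumes the `aligned_malloc`/`aligned_resize`/`aligned_free` helpers from vector.h above are in scope):

```
#include <stddef.h>
#include <string.h>

static int grow_example(void) {
    size_t n = 4;
    double *a = aligned_malloc(n * sizeof(double), 32);
    if (a == NULL) return -1;
    memset(a, 0, n * sizeof(double));

    /* Grow to 8 doubles: contents are copied into a new 32-byte-aligned block,
       and on this branch the old block is freed even when allocation fails. */
    double *b = aligned_resize(a, n * sizeof(double), 2 * n * sizeof(double), 32);
    if (b == NULL) return -1;

    aligned_free(b);
    return 0;
}
```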
@@ -8,10 +8,8 @@

#define ks_lt_index(a, b) ((a).value < (b).value)

#if defined(INTEL_SSE)
#if defined(USE_SSE)
#include <emmintrin.h>
#elif defined(ARM_NEON)
#include "sse2neon.h"
#endif

/*
@@ -340,7 +338,7 @@


#if defined(INTEL_SSE) || defined(ARM_NEON)
#if defined(USE_SSE)
/*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp

@@ -5,7 +5,7 @@ CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
DEFAULT_INCLUDES = -I.. -I/usr/local/include

CFLAGS = $(SIMDFLAGS) $(CFLAGS_BASE)
CFLAGS = $(CFLAGS_BASE)

TESTS = test_libpostal
noinst_PROGRAMS = test_libpostal

@@ -73,57 +73,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])

# ------------------------------------------------------------------
# Architecture-specific options
# Checks for SSE2 build
# ------------------------------------------------------------------

# allow enabling hardware optimization on any system:
case "$host_cpu" in
arm*|aarch64*)
enable_arm_neon=yes
enable_intel_sse=no
AC_DEFINE([ARM_NEON], [1],
[Enable ARM_NEON optimizations])
;;
i?86|x86_64)
enable_intel_sse=yes
enable_arm_neon=no
AC_DEFINE([INTEL_SSE], [1],
[Enable Intel SSE optimizations])
;;
esac

AC_ARG_ENABLE([neon],
AS_HELP_STRING([[[--disable-neon]]],
[Disable ARM NEON hardware optimizations]),
[
enable_arm_neon=no
AC_DEFINE([ARM_NEON], [0],
[Disable ARM_NEON optimizations])
])

AC_ARG_ENABLE([sse2],
AS_HELP_STRING([[[--disable-sse2]]],
[Disable Intel SSE2 hardware optimizations]),
[
enable_intel_sse=no
AC_DEFINE([INTEL_SSE], [0],
[Disable INTEL_SSE optimizations])
])
AS_HELP_STRING(
[--disable-sse2],
[disable SSE2 optimization routines]
)
)

SIMDFLAGS=""

AS_IF([test "x$enable_intel_sse" != "xno"], [
SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE"
AS_IF([test "x$enable_sse2" != "xno"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])

AS_IF([test "x$enable_arm_neon" != "xno"], [
SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON"
])

CFLAGS="${SIMDFLAGS} ${CFLAGS}"

AC_SUBST([SIMDFLAGS], [$SIMDFLAGS])

AC_CHECK_HEADER(cblas.h, [AX_CBLAS])

AC_ARG_ENABLE([data-download],
