diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..ea9dca07 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,37 @@ +name: Test + +on: + push: + branches: [master] + pull_request: + branches: [master] + workflow_dispatch: + +jobs: + build_and_test: + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - name: Install Dependencies Linux + if: runner.os == 'Linux' + run: | + sudo apt-get update -y + sudo apt-get install -y curl autoconf automake libtool pkg-config + - name: Install Dependencies MacOS + if: runner.os == 'macOS' + run: | + brew update + brew install curl autoconf automake libtool pkg-config + - name: Build + env: + # env values are literal (no shell expansion), so use the github context + LIBPOSTAL_DATA_DIR: ${{ github.workspace }}/data + run: | + ./bootstrap.sh + ./configure --datadir="$LIBPOSTAL_DATA_DIR" + make + - name: Test + run: make check diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1f3dbacf..00000000 --- a/.travis.yml +++ /dev/null @@ -1,83 +0,0 @@ -language: c -branches: - only: - - master -env: - global: - - secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4=" - - secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE=" - - secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU=" - - secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI=" - - secure: 
"OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU=" - - GH_REF=github.com/openvenues/libpostal - - DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l) - - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l) - - TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l) - - TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER - - SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz - - LIBPOSTAL_DATA_DIR=$(pwd)/data - - LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz -compiler: - - clang - - gcc -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-4.8 - - pkg-config -before_script: - - ./bootstrap.sh - - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi; - - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi; - - if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi; -install: - - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi -script: - - ./configure --datadir=$LIBPOSTAL_DATA_DIR - - make -j4 - - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; - - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; - - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; - - make check - -after_success: - - | - if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then - if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || 
$TRANSLIT_CHANGED -ne 0 ) ]]; then - export PATH=$PATH:env/bin/; - git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1 - cp src/*_data.c _travis/src - echo "$TAG_VERSION" > _travis/versions/base_data - cd _travis - git config user.name "$GIT_COMMITTER_NAME"; - git config user.email "$GIT_COMMITTER_EMAIL"; - git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER"; - git push --quiet origin master; - - tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS - fi - git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER"; - git push --tags --quiet origin master; - fi; - -before_deploy: - - make dist - -deploy: - - provider: releases - file: - - "$SRC_TARBALL_FILENAME" - on: - tags: true - branch: master - skip_cleanup: true - - provider: releases - file: - - "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME" - on: - tags: true - branch: master - condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )" - skip_cleanup: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 26bd9bdb..7e5d2804 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ ## Submitting Issues -When submitting issues to libpostal, please repeect these guildelines: +When submitting issues to libpostal, please respect these guidelines: - Be constructive. Try to help solve the problem. - Always search for existing issues before submitting one. diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index be48a933..70f8476e 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved. 
--- #### Here's how I'm using libpostal - + --- #### Here's what I did diff --git a/README.md b/README.md index d8e2cb9c..9016138c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # libpostal: international street address NLP -[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) +[![Build Status](https://github.com/openvenues/libpostal/actions/workflows/test.yml/badge.svg)](https://github.com/openvenues/libpostal/actions) [![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) [![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors) @@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites: **On Ubuntu/Debian** ``` -sudo apt-get install curl autoconf automake libtool pkg-config +sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config ``` **On CentOS/RHEL** @@ -106,19 +106,46 @@ sudo apt-get install curl autoconf automake libtool pkg-config sudo yum install curl autoconf automake libtool pkgconfig ``` -**On Mac OSX** +**On macOS** + +Install with one command via [MacPorts](https://www.macports.org/): +``` +port install libpostal +``` + +Or as follows with [Homebrew](https://brew.sh/): + ``` brew install curl autoconf automake libtool pkg-config ``` Then to install the C library: +If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed. + ``` git clone https://github.com/openvenues/libpostal cd libpostal + +# skip if installing for the first time +make distclean + ./bootstrap.sh -./configure --datadir=[...some dir with a few GB of space...] 
+ +# omit --datadir flag to install data in current directory +./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] make -j4 + +# For Intel/AMD processors and the default model +./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] + +# For Apple / ARM cpus and the default model +./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] --disable-sse2 + +# For the improved Senzing model: +./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] MODEL=senzing + +make -j8 sudo make install # On Linux it's probably a good idea to run @@ -400,23 +427,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo - LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal) - Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal) - Elixir: [Expostal](https://github.com/SweetIQ/expostal) +- Haskell: [haskell-postal](http://github.com/netom/haskell-postal) +- Rust: [rust-postal](https://github.com/pnordahl/rust-postal) - Rust: [rustpostal](https://crates.io/crates/rustpostal) -**Database extensions** +**Unofficial database extensions** - PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal) -**Unofficial REST API** +**Unofficial servers** -- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest) - -**Libpostal REST Docker** - -- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker) - -**Libpostal ZeroMQ Docker** - -- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker) +- Libpostal REST Go Docker: 
[libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker) +- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi) +- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker) Tests @@ -491,7 +514,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent. for a wide variety of countries and languages, not just US/English. The model is trained on over 1 billion addresses and address-like strings, using the templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted, -tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py) +tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py) are performed to make the training data resemble real messy geocoder input as closely as possible. - **Language classification**: multinomial logistic regression @@ -513,7 +536,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc. - **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec, implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian -languages chracter by character instead of on whitespace. +languages character by character instead of on whitespace. - **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form, strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration. 
@@ -537,6 +560,7 @@ Non-goals - Verifying that a location is a valid address - Actually geocoding addresses to a lat/lon (that requires a database/search index) +- Extracting addresses from free text Raison d'être ------------- @@ -642,7 +666,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions: - Confines almost all mallocs to *name*_new and all frees to *name*_destroy - Efficient existing implementations for simple things like hashtables - Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible -- Data structrues take advantage of sparsity as much as possible +- Data structures take advantage of sparsity as much as possible - Efficient double-array trie implementation for most string dictionaries - Cross-platform as much as possible, particularly for *nix diff --git a/configure.ac b/configure.ac index ed997e32..dc77930e 100644 --- a/configure.ac +++ b/configure.ac @@ -84,57 +84,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])]) # ------------------------------------------------------------------ -# Architecture-specific options +# Checks for SSE2 build # ------------------------------------------------------------------ -# allow enabling hardware optimization on any system: -case "$host_cpu" in - arm*|aarch64*) - enable_arm_neon=yes - enable_intel_sse=no - AC_DEFINE([ARM_NEON], [1], - [Enable ARM_NEON optimizations]) - ;; - i?86|x86_64) - enable_intel_sse=yes - enable_arm_neon=no - AC_DEFINE([INTEL_SSE], [1], - [Enable Intel SSE optimizations]) - ;; -esac - -AC_ARG_ENABLE([neon], - AS_HELP_STRING([[[--disable-neon]]], - [Disable ARM NEON hardware optimizations]), - [ - enable_arm_neon=no - AC_DEFINE([ARM_NEON], [0], - [Disable ARM_NEON optimizations]) - ]) - AC_ARG_ENABLE([sse2], - AS_HELP_STRING([[[--disable-sse2]]], - [Disable Intel SSE2 hardware optimizations]), - [ - 
enable_intel_sse=no - AC_DEFINE([INTEL_SSE], [0], - [Disable INTEL_SSE optimizations]) - ]) + AS_HELP_STRING( + [--disable-sse2], + [disable SSE2 optimization routines] + ) + ) -SIMDFLAGS="" - -AS_IF([test "x$enable_intel_sse" != "xno"], [ - SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE" +AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64" && test "x$(uname -m)" != "xaarch64"], [ + CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" ]) -AS_IF([test "x$enable_arm_neon" != "xno"], [ - SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON" -]) - -CFLAGS="${SIMDFLAGS} ${CFLAGS}" - -AC_SUBST([SIMDFLAGS], [$SIMDFLAGS]) - AC_CHECK_HEADER(cblas.h, [AX_CBLAS]) AC_ARG_ENABLE([data-download], diff --git a/m4/ax_cblas.m4 b/m4/ax_cblas.m4 index 0c87c29f..da89cab9 100644 --- a/m4/ax_cblas.m4 +++ b/m4/ax_cblas.m4 @@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then [], [-lblas])]) fi +# BLAS in OpenBLAS library? +if test $ax_cblas_ok = no; then + AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"]) +fi + # Generic CBLAS library? if test $ax_cblas_ok = no; then AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"]) fi +# Generic BLAS library? 
+if test $ax_cblas_ok = no; then + AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"]) +fi + AC_SUBST(CBLAS_LIBS) LIBS="$ax_cblas_save_LIBS" diff --git a/resources/addresses/de.yaml b/resources/addresses/de.yaml index 40078b51..143e49a5 100644 --- a/resources/addresses/de.yaml +++ b/resources/addresses/de.yaml @@ -63,10 +63,23 @@ numbers: house_numbers: + gebaude: &gebaude + canonical: gebäude + abbreviated: geb + sample: true + canonical_probability: 0.5 + abbreviated_probability: 0.5 + sample_probability: 0.05 + numeric: + direction: left alphanumeric: default: *nummer + probability: 0.95 + alternatives: + - alternative: *gebaude + probability: 0.05 - alphanumeric_phrase_probability: 0.0001 + alphanumeric_phrase_probability: 0.05 conscription_numbers: alphanumeric: diff --git a/resources/addresses/uk.yaml b/resources/addresses/uk.yaml index 35cca157..a00bb526 100644 --- a/resources/addresses/uk.yaml +++ b/resources/addresses/uk.yaml @@ -49,7 +49,7 @@ numbers: house_numbers: - budnyok: &budnyok + budynok: &budynok canonical: будинок abbreviated: буд sample: true @@ -58,8 +58,8 @@ house_numbers: sample_probability: 0.1 numeric: direction: left - budnyok_latin: &budnyok_latin - canonical: budnyok + budynok_latin: &budynok_latin + canonical: budynok abbreviated: bud sample: true canonical_probability: 0.6 @@ -88,10 +88,10 @@ house_numbers: direction: left alphanumeric: - default: *budnyok + default: *budynok probability: 0.65 alternatives: - - alternative: *budnyok_latin + - alternative: *budynok_latin probability: 0.05 - alternative: *dom probability: 0.25 diff --git a/resources/boundaries/osm/ar.yaml b/resources/boundaries/osm/ar.yaml index 88f0002b..3f7181d7 100644 --- a/resources/boundaries/osm/ar.yaml +++ b/resources/boundaries/osm/ar.yaml @@ -11,8 +11,9 @@ overrides: id: - # Buenos Aires (state boundary coterminous with city) - "3082668": null + relation: + # Buenos Aires (state boundary coterminous with city) + "3082668": null contained_by: 
relation: # Buenos Aires diff --git a/resources/boundaries/osm/tw.yaml b/resources/boundaries/osm/tw.yaml index 11827eb9..99f1b868 100644 --- a/resources/boundaries/osm/tw.yaml +++ b/resources/boundaries/osm/tw.yaml @@ -11,9 +11,9 @@ "9": "suburb" overrides: - id: - relation: - # Taiwan Province - "3777248": "state" - # Fujian Province - "3777250": "state" + id: + relation: + # Taiwan Province + "3777248": "state" + # Fujian Province + "3777250": "state" diff --git a/resources/dictionaries/en/street_types.txt b/resources/dictionaries/en/street_types.txt index 8d75c5a1..e8359347 100644 --- a/resources/dictionaries/en/street_types.txt +++ b/resources/dictionaries/en/street_types.txt @@ -132,6 +132,7 @@ falls|fls fare farm|frm farms|frms +farm to market|fm|farm-to-market fern ferry|fry|fy field|fld|fd @@ -407,4 +408,4 @@ well|wl wells|wls wharf|whrf|whf wynd|wyn -yard|yd|yrd \ No newline at end of file +yard|yd|yrd diff --git a/resources/dictionaries/hi/building_types.txt b/resources/dictionaries/hi/building_types.txt new file mode 100644 index 00000000..180e3606 --- /dev/null +++ b/resources/dictionaries/hi/building_types.txt @@ -0,0 +1,5 @@ +mandir|मन्दिर|मंदिर +station +police station +post office +office \ No newline at end of file diff --git a/resources/dictionaries/hi/directionals.txt b/resources/dictionaries/hi/directionals.txt new file mode 100644 index 00000000..4dfff200 --- /dev/null +++ b/resources/dictionaries/hi/directionals.txt @@ -0,0 +1,4 @@ +dakshin|दक्षिण +uttar|उत्तर +poorva|poorav|पूर्व +paschim|पश्चिम \ No newline at end of file diff --git a/resources/dictionaries/hi/people.txt b/resources/dictionaries/hi/people.txt new file mode 100644 index 00000000..b79d681f --- /dev/null +++ b/resources/dictionaries/hi/people.txt @@ -0,0 +1 @@ +mahatma gandhi|mg|m g \ No newline at end of file diff --git a/resources/dictionaries/hi/personal_titles.txt b/resources/dictionaries/hi/personal_titles.txt new file mode 100644 index 00000000..6f60d0cd --- /dev/null 
+++ b/resources/dictionaries/hi/personal_titles.txt @@ -0,0 +1,24 @@ +baba +babu +bhagat +guru +jagirdar +maharaja|maharaj +mahatma|महात्मा +pandit +raja +rajarshi +rajkumar +rajkumari +rani +rishi +sahib +sant +sardar +senapati +shah +shrimati|smt|srimathi|श्रीमती +shri|shree|sri|श्री +sushri +swami +ustad \ No newline at end of file diff --git a/resources/dictionaries/hi/qualifiers.txt b/resources/dictionaries/hi/qualifiers.txt new file mode 100644 index 00000000..ba5b38f7 --- /dev/null +++ b/resources/dictionaries/hi/qualifiers.txt @@ -0,0 +1,3 @@ +nagar|नगर +colony +cantonment|cantt \ No newline at end of file diff --git a/resources/dictionaries/hi/street_types.txt b/resources/dictionaries/hi/street_types.txt index fde3a96b..b5cb5528 100644 --- a/resources/dictionaries/hi/street_types.txt +++ b/resources/dictionaries/hi/street_types.txt @@ -1,3 +1,4 @@ bazaar|bazar marg -nagar \ No newline at end of file +flyover +रोड \ No newline at end of file diff --git a/resources/dictionaries/hu/ambiguous_expansions.txt b/resources/dictionaries/hu/ambiguous_expansions.txt index 88190f16..13eecc6a 100644 --- a/resources/dictionaries/hu/ambiguous_expansions.txt +++ b/resources/dictionaries/hu/ambiguous_expansions.txt @@ -3,4 +3,4 @@ d e k n -u \ No newline at end of file +u diff --git a/resources/dictionaries/hu/level_types_mezzanine.txt b/resources/dictionaries/hu/level_types_mezzanine.txt index e436dcd6..2fcd8250 100644 --- a/resources/dictionaries/hu/level_types_mezzanine.txt +++ b/resources/dictionaries/hu/level_types_mezzanine.txt @@ -1,2 +1,2 @@ -félemelet|felemelet -magasföldszint|magasfoldszint \ No newline at end of file +félemelet|felemelet|félem|1/2 em|1/2em +magasföldszint|magasfoldszint|mgfszt|mgfsz|mfszt|mfsz \ No newline at end of file diff --git a/resources/dictionaries/hu/personal_titles.txt b/resources/dictionaries/hu/personal_titles.txt index e8af420c..5186864a 100644 --- a/resources/dictionaries/hu/personal_titles.txt +++ 
b/resources/dictionaries/hu/personal_titles.txt @@ -1 +1,2 @@ -szent|szt \ No newline at end of file +szent|szt +idősebb|id diff --git a/resources/dictionaries/hu/street_types.txt b/resources/dictionaries/hu/street_types.txt index 84863afe..c0548be6 100644 --- a/resources/dictionaries/hu/street_types.txt +++ b/resources/dictionaries/hu/street_types.txt @@ -1,21 +1,34 @@ árok|arok -dűlő|dulo +dűlő|dulo|d.|d fasor +fasora +főközlekedési út|főút|fout +határút|hatarut +kapu kert körönd|korond|krnd +körvasútsor|korvasutsor körút|korut|krt köz|koz +lakótelep|lakotelep|ltp.|ltp lejtő|lejto lépcső|lepcso liget mező|mezo +országút|orszagut park -rakpart|rpt -sétány|setany -sor -sugárút|sugarut +parkja +rakpart|rkpt|rkp|rpt +sétány|setany|stny.|stny +sor|s.|s +sétány|setany|sét +sugárút|sugarut|sgrt.|sgrt|srt.|srt|sgt.|sgt +sziget +telep tér|ter tere -utca|u -út|ut -útja|utja \ No newline at end of file +tanya|t.|t +udvar +utca|u.|u +út|ut|u.|u +útja|utja diff --git a/resources/dictionaries/ms/toponyms.txt b/resources/dictionaries/ms/toponyms.txt new file mode 100644 index 00000000..046f27bf --- /dev/null +++ b/resources/dictionaries/ms/toponyms.txt @@ -0,0 +1,3 @@ +kuala lumpur|federal territory kuala lumpur|federal territory of kuala lumpur|wilayah persekutuan kuala lumpur|kl +labuan|federal territory labuan|federal territory of labuan|wilayah persekutuan labuan +putrajaya|federal territory putrajaya|federal territory of putrajaya|wilayah persekutuan putrajaya diff --git a/resources/dictionaries/pl/street_types.txt b/resources/dictionaries/pl/street_types.txt index a3241ff5..f83ebba3 100644 --- a/resources/dictionaries/pl/street_types.txt +++ b/resources/dictionaries/pl/street_types.txt @@ -1,12 +1,18 @@ aleja|al autostrada boczna -bulwar +bulwar|bulw droga obwodnica +ogród +osiedle|os +park plac|pl rondo rynek +skwer szosa ulica|ul -zaulek \ No newline at end of file +wybrzeże|wyb +wyspa +zaulek diff --git a/resources/dictionaries/pt/street_types.txt 
b/resources/dictionaries/pt/street_types.txt index b661dd05..6b2756e5 100644 --- a/resources/dictionaries/pt/street_types.txt +++ b/resources/dictionaries/pt/street_types.txt @@ -10,10 +10,10 @@ calçada|calcada|cc calçadinha|caclcadinha|ccnh câmara municipal|camara municipal|cm|c.m.|c. m. caminho|cam|camno -direito|dto +direito|dto|dt esquerdo|esq estrada|estr -astrada marginal|estr marg +estrada marginal|estr marg estrada municipal|em|e m|estr m estrada nacional|en|e n|estr n estrada regional|er|e r|estr r @@ -50,4 +50,4 @@ viaduto|vd|vdto viela|ve vila|vl volta -zona|zn \ No newline at end of file +zona|zn diff --git a/resources/dictionaries/ro/building_types.txt b/resources/dictionaries/ro/building_types.txt new file mode 100644 index 00000000..28ab10eb --- /dev/null +++ b/resources/dictionaries/ro/building_types.txt @@ -0,0 +1,6 @@ +anexa +bloc|blc|bl +casa +cladirea|cladire +complex +garaj diff --git a/resources/dictionaries/ro/company_types.txt b/resources/dictionaries/ro/company_types.txt new file mode 100644 index 00000000..09dc7797 --- /dev/null +++ b/resources/dictionaries/ro/company_types.txt @@ -0,0 +1,5 @@ +banca +organizatie neguvernamentala|ong +societate comerciala|sc +societate cu raspundere limitata|srl +societate pe actiuni|sa diff --git a/resources/dictionaries/ro/cross_streets.txt b/resources/dictionaries/ro/cross_streets.txt index d141a2ba..386b33fc 100644 --- a/resources/dictionaries/ro/cross_streets.txt +++ b/resources/dictionaries/ro/cross_streets.txt @@ -1,5 +1,5 @@ & -colț|colt +colț|colt|colț cu|colt cu între|intre la colțul de pe|la coltul de pe -și|si \ No newline at end of file +și|si diff --git a/resources/dictionaries/ro/entrances.txt b/resources/dictionaries/ro/entrances.txt index 8703e027..b7acf9bd 100644 --- a/resources/dictionaries/ro/entrances.txt +++ b/resources/dictionaries/ro/entrances.txt @@ -1 +1 @@ -intrare \ No newline at end of file +intrare|intrarea diff --git a/resources/dictionaries/ro/near.txt 
b/resources/dictionaries/ro/near.txt index 00eac643..c8962fba 100644 --- a/resources/dictionaries/ro/near.txt +++ b/resources/dictionaries/ro/near.txt @@ -4,4 +4,4 @@ din in apropiere de în apropiere|in apropiere în jurul aici|in jurul aici -lângă mine|langa mine \ No newline at end of file +lângă mine|langa mine|lângă|langa diff --git a/resources/dictionaries/ro/number.txt b/resources/dictionaries/ro/number.txt index 1e3fad48..33bc26e4 100644 --- a/resources/dictionaries/ro/number.txt +++ b/resources/dictionaries/ro/number.txt @@ -1 +1 @@ -număr|numar|nr|nº|n°|#|№|no \ No newline at end of file +număr|numar|nr|nº|n°|#|№|no|numarul|numărul diff --git a/resources/dictionaries/ro/personal_titles.txt b/resources/dictionaries/ro/personal_titles.txt index 2634183b..a78988f6 100644 --- a/resources/dictionaries/ro/personal_titles.txt +++ b/resources/dictionaries/ro/personal_titles.txt @@ -8,7 +8,8 @@ general|gen major|maj locotenent locotenent colonel +pictor profesor|prof sergent sublocotenent -vice amiral \ No newline at end of file +vice amiral diff --git a/resources/dictionaries/ro/place_names.txt b/resources/dictionaries/ro/place_names.txt new file mode 100644 index 00000000..f51987d7 --- /dev/null +++ b/resources/dictionaries/ro/place_names.txt @@ -0,0 +1,3 @@ +cinema +cafenea +fabrica diff --git a/resources/dictionaries/ro/qualifiers.txt b/resources/dictionaries/ro/qualifiers.txt index 6d021f70..016a6114 100644 --- a/resources/dictionaries/ro/qualifiers.txt +++ b/resources/dictionaries/ro/qualifiers.txt @@ -1 +1,7 @@ -bloc|bl \ No newline at end of file +bloc|bl +cartier|cartierul +comuna|comună +kilometrul|kilometru|km +sat|satul +sector|sectorul|sect +zona diff --git a/resources/dictionaries/ro/stopwords.txt b/resources/dictionaries/ro/stopwords.txt index 80195e69..406760a0 100644 --- a/resources/dictionaries/ro/stopwords.txt +++ b/resources/dictionaries/ro/stopwords.txt @@ -1,2 +1,3 @@ și|si|& -cel \ No newline at end of file +cel +intre diff --git 
a/resources/dictionaries/ro/street_types.txt b/resources/dictionaries/ro/street_types.txt index 46758ffa..ca6985d1 100644 --- a/resources/dictionaries/ro/street_types.txt +++ b/resources/dictionaries/ro/street_types.txt @@ -1,13 +1,13 @@ aleea|ale|alea|al bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard -calea|cal -drumul +calea|cale|cal +drumul|drum fundătura|fundatura|fnd fundacul|fdc intrarea|int|intr piaţa|piata|piață|pta|pţa|p-ta|p-ţa -strada|str +strada|str|st stradela|str-la|sdla șoseaua|soseaua|sos|șos splaiul|sp|spl -vârful|varful|virful|vîrful|varf|vf \ No newline at end of file +vârful|varful|virful|vîrful|varf|vf diff --git a/resources/dictionaries/ro/synonyms.txt b/resources/dictionaries/ro/synonyms.txt new file mode 100644 index 00000000..62880a91 --- /dev/null +++ b/resources/dictionaries/ro/synonyms.txt @@ -0,0 +1 @@ +decembrie|dec diff --git a/resources/dictionaries/ro/unit_types_numbered.txt b/resources/dictionaries/ro/unit_types_numbered.txt index d0c72db1..1727c3c6 100644 --- a/resources/dictionaries/ro/unit_types_numbered.txt +++ b/resources/dictionaries/ro/unit_types_numbered.txt @@ -1,4 +1,8 @@ -apartament|ap|apt|apart +apartamentul|apartament|ap|apt|apart birou +cladire|cladirea|clădire|clădirea +corp|corpul +complex +interior|int lotul -sală|sala \ No newline at end of file +sală|sala diff --git a/resources/dictionaries/ru/entrances.txt b/resources/dictionaries/ru/entrances.txt index 7a864ade..8cdf7834 100644 --- a/resources/dictionaries/ru/entrances.txt +++ b/resources/dictionaries/ru/entrances.txt @@ -1,2 +1,4 @@ вход -vkhod \ No newline at end of file +vkhod +подъезд +pod'ezd diff --git a/resources/dictionaries/uk/qualifiers.txt b/resources/dictionaries/uk/qualifiers.txt index 6bf2d5df..595aa9df 100644 --- a/resources/dictionaries/uk/qualifiers.txt +++ b/resources/dictionaries/uk/qualifiers.txt @@ -6,3 +6,5 @@ kvartal|kvart|kv|kv-l oblast|obl район|р-н raion|r-n +місто|міс|м +misto|mis|m diff --git 
a/resources/states/my.yaml b/resources/states/my.yaml new file mode 100644 index 00000000..436a0bcb --- /dev/null +++ b/resources/states/my.yaml @@ -0,0 +1,90 @@ +"federal territory kuala lumpur": + en: Kuala Lumpur + ms: Kuala Lumpur +"federal territory of kuala lumpur": + en: Kuala Lumpur + ms: Kuala Lumpur +"wilayah persekutuan kuala lumpur": + en: Kuala Lumpur + ms: Kuala Lumpur +"federal territory labuan": + en: Labuan + ms: Labuan +"federal territory of labuan": + en: Labuan + ms: Labuan +"wilayah persekutuan labuan": + en: Labuan + ms: Labuan +"federal territory putrajaya": + en: Putrajaya + ms: Putrajaya +"federal territory of putrajaya": + en: Putrajaya + ms: Putrajaya +"wilayah persekutuan putrajaya": + en: Putrajaya + ms: Putrajaya +"pulau pinang": + en: Penang + ms: Pulau Pinang +"penang": + en: Penang + ms: Pulau Pinang +JHR: + en: Johor + ms: Johor +KDH: + en: Kedah + ms: Kedah +KTN: + en: Kelantan + ms: Kelantan +MLK: + en: Melaka + ms: Melaka +NSN: + en: Negeri Sembilan + ms: Negeri Sembilan +PHG: + en: Pahang + ms: Pahang +PRK: + en: Perak + ms: Perak +PLS: + en: Perlis + ms: Perlis +PNG: + en: Penang + ms: Pulau Pinang +SBH: + en: Sabah + ms: Sabah +SWK: + en: Sarawak + ms: Sarawak +SGR: + en: Selangor + ms: Selangor +TRG: + en: Terengganu + ms: Terengganu +KUL: + en: Kuala Lumpur + ms: Kuala Lumpur +LBN: + en: Labuan + ms: Labuan +PJY: + en: Putrajaya + ms: Putrajaya +KL: + en: Kuala Lumpur + ms: Kuala Lumpur +LB: + en: Labuan + ms: Labuan +PY: + en: Putrajaya + ms: Putrajaya \ No newline at end of file diff --git a/scripts/geodata/neighborhoods/reverse_geocode.py b/scripts/geodata/neighborhoods/reverse_geocode.py index 4c699813..ec8083a1 100644 --- a/scripts/geodata/neighborhoods/reverse_geocode.py +++ b/scripts/geodata/neighborhoods/reverse_geocode.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import argparse +import fnmatch import logging import operator import os @@ -24,7 +25,7 @@ from 
geodata.osm.components import osm_address_components from geodata.osm.definitions import osm_definitions from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.polygons.index import * -from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder +from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder from geodata.statistics.tf_idf import IDFIndex @@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): (ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon tests for neighborhoods. The properties vary by source but each has source has least a "name" key which in practice is what we care about. + + Quattroshapes data is no longer accessible and has been replaced by + WhosOnFirst. ''' PRIORITIES_FILENAME = 'priorities.json' @@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): source_priorities = { 'osm': 0, # Best names/polygons, same coordinate system 'osm_cth': 1, # Prefer the OSM names if possible - 'clickthathood': 2, # Better names/polygons than Quattroshapes - 'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon - 'quattroshapes': 4, # Good results in some countries/areas + 'clickthathood': 2, # Better names/polygons than WhosOnFirst + 'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon + 'wof': 4, # Replacement of Quattroshapes } level_priorities = { @@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): } regex_replacements = [ - # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes + # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes (re.compile('^paris-(?=[\d])', re.I), ''), (re.compile('^prague(?= [\d]+$)', re.I), 'Praha'), ] @@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return doc @classmethod - def 
create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): + def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): ''' Given an OSM file (planet or some other bounds) containing neighborhoods as points (some suburbs have boundaries) @@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): logger = logging.getLogger('neighborhoods') - qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods') - ensure_dir(qs_scratch_dir) - logger.info('Creating ClickThatHood neighborhoods') cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() logger.info('Creating OSM neighborhoods') osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file) - logger.info('Creating Quattroshapes neighborhoods') - qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) + logger.info('Creating WhosOnFirst neighborhoods') + wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods")) country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir) @@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): char_scripts = get_chars_by_script() - for idx in (cth, qs, osmn): + for idx in (cth, wof, osmn): for i in xrange(idx.i): props = idx.get_properties(i) name = props.get('name') @@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): index.index_polygon(poly.context) index.add_polygon(poly.context, props) - qs.matched = [False] * qs.i + wof.matched = [False] * wof.i cth.matched = [False] * cth.i logger.info('Matching OSM points to neighborhood polygons') - # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons + # Parse OSM and match neighborhood/suburb points to 
ClickThatHood/WhosOnFirst polygons num_polys = 0 for element_id, attrs, deps in parse_osm(filename): try: @@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): for name_key in OSM_NAME_TAGS: osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))]) - for idx in (cth, qs): + for idx in (cth, wof): candidates = idx.get_candidate_polygons(lat, lon, return_all=True) if candidates: max_sim = 0.0 arg_max = None - normalized_qs_names = {} + normalized_wof_names = {} for osm_name in osm_names: @@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): for i in candidates: props = idx.get_properties(i) - name = normalized_qs_names.get(i) + name = normalized_wof_names.get(i) if not name: name = props.get('name') if not name: continue for pattern, repl in cls.regex_replacements: name = pattern.sub(repl, name) - normalized_qs_names[i] = name + normalized_wof_names[i] = name - if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood': + if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighbourhood': continue if not contains_ideographs: @@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): continue source = 'osm_cth' else: - level = props.get(QuattroshapesReverseGeocoder.LEVEL, None) + level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) - source = 'osm_quattro' + source = 'osm_wof' # must be a key of source_priorities - if level == 'neighborhood': + if level == 'neighbourhood': # WOF placetypes use the British spelling @@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): if num_polys % 1000 == 0 and num_polys > 0: logger.info('did {} neighborhoods'.format(num_polys)) - for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')): + for idx, source in ((cth, 'clickthathood'), (wof, 'wof')): for i in xrange(idx.i): props = idx.get_properties(i) poly = idx.get_polygon(i) @@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): props['polygon_type'] =
'local_admin' else: continue - elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood': + elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighbourhood': component = AddressFormatter.SUBURB name = props.get('name') if not name: @@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return sorted(candidates, key=self.priority) -class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder): +class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex): persistent_polygons = False cache_size = None + NAME = "wof:name" + ASCII_NAME = "gn:asciiname" + LEVEL = "wof:placetype" + GEONAMES_ID = "gn:geonameid" + SUPERSEDED = "wof:superseded_by" + + NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"} + POLYGON_TYPES = {"Polygon", "MultiPolygon"} + @classmethod - def create_neighborhoods_index(cls, quattroshapes_dir, - output_dir, - index_filename=None, - polys_filename=DEFAULT_POLYS_FILENAME): - local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME) - neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME) - return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename], - output_dir, index_filename=index_filename, - polys_filename=polys_filename) + def is_valid_neighbourhood(cls, geojson): + validity = not geojson["properties"].get(cls.SUPERSEDED) + for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}: + validity = validity and bool(geojson["properties"].get(field)) # "&=" fails on str/None and tests the low bit of ints + return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES + + @classmethod + def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None): + index = cls(save_dir=output_dir, index_filename=index_filename) + + for root, dirnames, filenames in os.walk(wof_dir): + for fname in fnmatch.filter(filenames, "*.geojson"): + with
open(os.path.join(root, fname)) as f: + geojson = json.load(f) + if cls.is_valid_neighbourhood(geojson): + properties = { + "name": safe_decode(geojson["properties"].get(cls.NAME)), + "name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)), + "qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)), + "gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID)) + } + + poly_type = geojson['geometry']['type'] + if poly_type == 'Polygon': + poly = cls.to_polygon(geojson['geometry']['coordinates'][0]) + index.index_polygon(poly) + poly = index.simplify_polygon(poly) + index.add_polygon(poly, dict(geojson['properties'], **properties)) # raw WOF keys plus the normalized 'name' etc. read by the matcher + elif poly_type == 'MultiPolygon': + polys = [] + for coords in geojson['geometry']['coordinates']: + poly = cls.to_polygon(coords[0]) + polys.append(poly) + index.index_polygon(poly) + + multi_poly = index.simplify_polygon(MultiPolygon(polys)) + index.add_polygon(multi_poly, dict(geojson['properties'], **properties)) + + return index if __name__ == '__main__': # Handle argument parsing here parser = argparse.ArgumentParser() - parser.add_argument('-q', '--quattroshapes-dir', - help='Path to quattroshapes dir') + parser.add_argument('-w', '--wof-dir', + help='Path to WhosOnFirst dir') parser.add_argument('-a', '--osm-admin-rtree-dir', help='Path to OSM admin rtree dir') @@ -567,16 +607,16 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) args = parser.parse_args() - if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: - index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( + if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: + index = NeighborhoodReverseGeocoder.create_from_osm_and_wof( args.osm_neighborhoods_file, - args.quattroshapes_dir, + args.wof_dir, args.country_rtree_dir,
args.osm_admin_rtree_dir, args.osm_neighborhood_borders_file, args.out_dir ) else: - parser.error('Must specify quattroshapes dir or osm admin borders file') + parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file') index.save() diff --git a/scripts/geodata/polygons/index.py b/scripts/geodata/polygons/index.py index 59010800..41521f81 100644 --- a/scripts/geodata/polygons/index.py +++ b/scripts/geodata/polygons/index.py @@ -226,7 +226,6 @@ class PolygonIndex(object): @classmethod def create_from_geojson_files(cls, inputs, output_dir, index_filename=None, - polys_filename=DEFAULT_POLYS_FILENAME, include_only_properties=None): index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME) for input_file in inputs: diff --git a/scripts/geodata/whosonfirst/download_wof_admin_polygon.py b/scripts/geodata/whosonfirst/download_wof_admin_polygon.py new file mode 100644 index 00000000..51d029d1 --- /dev/null +++ b/scripts/geodata/whosonfirst/download_wof_admin_polygon.py @@ -0,0 +1,27 @@ +import os +import pycountry +import subprocess +import sys + + +this_dir = os.path.realpath(os.path.dirname(__file__)) +sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) + + +WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/" +WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-" + + +def download_wof_data_admin(wof_dir): + for country_object in pycountry.countries: + repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower() + repo_location = os.path.join(wof_dir, repo_name) + if not os.path.exists(repo_location): + subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name, repo_location]) # clone into wof_dir so the exists() check above sees it next run + + +if __name__ == '__main__': + if len(sys.argv) < 2: + sys.exit('Usage: python download_wof_admin_polygon.py wof_dir') + + download_wof_data_admin(sys.argv[1]) diff --git a/scripts/requirements-simple.txt b/scripts/requirements-simple.txt index
2c307904..0d9a3764 100644 --- a/scripts/requirements-simple.txt +++ b/scripts/requirements-simple.txt @@ -1,4 +1,4 @@ -requests==2.20.0 +requests==2.32.2 six==1.10.0 PyYAML==5.4 -ujson==1.33 \ No newline at end of file +ujson==5.4.0 \ No newline at end of file diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 9c2cd513..fbf5d3e6 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,6 +1,6 @@ boto3==1.4.0 botocore==1.4.53 -Fiona==1.6.3.post1 +Fiona==1.10.0 PyYAML==5.4 Rtree==0.8.2 Shapely==1.5.14 @@ -13,14 +13,14 @@ distribute==0.7.3 future==0.15.2 futures==3.0.5 ftfy==4.2.0 -gevent==1.1.2 +gevent==23.9.0 greenlet==0.4.10 jmespath==0.9.0 leveldb==0.193 -lxml==4.6.3 +lxml==4.9.1 lru-dict==1.1.3 marisa-trie==0.7.2 -numpy==1.10.4 +numpy==1.22.0 pycountry==1.20 git+https://github.com/kmike/pymorphy2 pymorphy2-dicts-ru==2.4.394633.4298366 @@ -29,9 +29,9 @@ pyproj==1.9.5.1 pystache==0.5.4 python-Levenshtein==0.12.0 python-geohash==0.8.5 -requests==2.20.0 +requests==2.32.2 s3transfer==0.1.3 six==1.10.0 -ujson==1.35 +ujson==5.4.0 urlnorm==1.1.3 wsgiref==0.1.2 diff --git a/src/crf_context.c b/src/crf_context.c index 0f399a1a..8e1a759e 100644 --- a/src/crf_context.c +++ b/src/crf_context.c @@ -40,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { } if (context->flag & CRF_CONTEXT_MARGINALS) { -#if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_state = double_matrix_new_aligned(T, L, 16); +#if defined(USE_SSE) + context->exp_state = double_matrix_new_aligned(T, L, 32); if (context->exp_state == NULL) goto exit_context_created; double_matrix_zero(context->exp_state); #else @@ -52,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { context->mexp_state = double_matrix_new_zeros(T, L); if (context->mexp_state == NULL) goto exit_context_created; -#if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16); +#if defined(USE_SSE) + 
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32); if (context->exp_state_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_state_trans); #else @@ -64,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) { context->mexp_state_trans = double_matrix_new_zeros(T, L * L); if (context->mexp_state_trans == NULL) goto exit_context_created; -#if defined(INTEL_SSE) || defined(ARM_NEON) - context->exp_trans = double_matrix_new_aligned(L, L, 16); +#if defined(USE_SSE) + context->exp_trans = double_matrix_new_aligned(L, L, 32); if (context->exp_trans == NULL) goto exit_context_created; double_matrix_zero(context->exp_trans); #else @@ -130,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) { if (self->flag & CRF_CONTEXT_MARGINALS && ( -#if defined(INTEL_SSE) || defined(ARM_NEON) - !double_matrix_resize_aligned(self->exp_state, T, L, 16) || +#if defined(USE_SSE) + !double_matrix_resize_aligned(self->exp_state, T, L, 32) || #else !double_matrix_resize(self->exp_state, T, L) || #endif !double_matrix_resize(self->mexp_state, T, L) || -#if defined(INTEL_SSE) || defined(ARM_NEON) - !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) || +#if defined(USE_SSE) + !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) || #else !double_matrix_resize(self->exp_state_trans, T, L * L) || #endif @@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) { } if (self->exp_state != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_state); #else double_matrix_destroy(self->exp_state); @@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) { } if (self->exp_state_trans != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_state_trans); #else double_matrix_destroy(self->exp_state_trans); @@ -216,7 +216,7 @@ void 
crf_context_destroy(crf_context_t *self) { } if (self->exp_trans != NULL) { -#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) double_matrix_destroy_aligned(self->exp_trans); #else double_matrix_destroy(self->exp_trans); diff --git a/src/expand.c b/src/expand.c index 898c17d1..90ade598 100644 --- a/src/expand.c +++ b/src/expand.c @@ -15,6 +15,14 @@ #include "token_types.h" #include "transliterate.h" +#ifdef HAVE_CONFIG_H +#include +#endif + +#ifndef HAVE_STRNDUP +#include "strndup.h" +#endif + #define DEFAULT_KEY_LEN 32 @@ -1567,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt size_t len = strlen(input); - language_classifier_response_t *lang_response = NULL; + libpostal_language_classifier_response_t *lang_response = NULL; if (options.num_languages == 0) { lang_response = classify_languages(input); @@ -1627,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt kh_destroy(str_set, unique_strings); if (lang_response != NULL) { - language_classifier_response_destroy(lang_response); + libpostal_language_classifier_response_destroy(lang_response); } char_array_destroy(temp_string); diff --git a/src/file_utils.c b/src/file_utils.c index f25e5ee6..5fc2dfbe 100644 --- a/src/file_utils.c +++ b/src/file_utils.c @@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) { } inline uint32_t file_deserialize_uint32(unsigned char *buf) { - return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; + return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3]; } bool file_read_uint32(FILE *file, uint32_t *value) { @@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) { inline uint16_t file_deserialize_uint16(unsigned char *buf) { - return (buf[0] << 8) | buf[1]; + return ((uint16_t)buf[0] << 8) | buf[1]; } diff --git a/src/language_classifier.c b/src/language_classifier.c index 25273d92..d107d558 100644 
--- a/src/language_classifier.c +++ b/src/language_classifier.c @@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) { return language_classifier; } -void language_classifier_response_destroy(language_classifier_response_t *self) { +void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) { if (self == NULL) return; if (self->languages != NULL) { free(self->languages); @@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self) free(self); } -language_classifier_response_t *classify_languages(char *address) { +libpostal_language_classifier_response_t *classify_languages(char *address) { language_classifier_t *classifier = get_language_classifier(); if (classifier == NULL) { @@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) { size_t n = classifier->num_labels; double_matrix_t *p_y = double_matrix_new_zeros(1, n); - language_classifier_response_t *response = NULL; + libpostal_language_classifier_response_t *response = NULL; bool model_exp = false; if (classifier->weights_type == MATRIX_DENSE) { model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y); @@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) { free(indices); - response = malloc(sizeof(language_classifier_response_t)); + response = malloc(sizeof(libpostal_language_classifier_response_t)); response->num_languages = num_languages; response->languages = languages; response->probs = probs; diff --git a/src/language_classifier.h b/src/language_classifier.h index c5402b39..2a638e9c 100644 --- a/src/language_classifier.h +++ b/src/language_classifier.h @@ -6,6 +6,8 @@ #include #include +#include "libpostal.h" + #include "collections.h" #include "language_features.h" #include "logistic_regression.h" @@ -29,21 +31,14 @@ typedef struct language_classifier { } weights; } language_classifier_t; - -typedef struct 
language_classifier_response { - size_t num_languages; - char **languages; - double *probs; -} language_classifier_response_t; - // General usage language_classifier_t *language_classifier_new(void); language_classifier_t *get_language_classifier(void); language_classifier_t *get_language_classifier_country(void); -language_classifier_response_t *classify_languages(char *address); -void language_classifier_response_destroy(language_classifier_response_t *self); +libpostal_language_classifier_response_t *classify_languages(char *address); +void language_classifier_response_destroy(libpostal_language_classifier_response_t *self); void language_classifier_destroy(language_classifier_t *self); @@ -58,4 +53,4 @@ bool language_classifier_module_setup(char *dir); void language_classifier_module_teardown(void); -#endif \ No newline at end of file +#endif diff --git a/src/language_classifier_cli.c b/src/language_classifier_cli.c index e67be84a..d612132b 100644 --- a/src/language_classifier_cli.c +++ b/src/language_classifier_cli.c @@ -29,7 +29,7 @@ int main(int argc, char **argv) { } - language_classifier_response_t *response = classify_languages(address); + libpostal_language_classifier_response_t *response = classify_languages(address); if (response == NULL) { printf("Could not classify language\n"); exit(EXIT_FAILURE); diff --git a/src/language_classifier_test.c b/src/language_classifier_test.c index b795be99..262020ee 100644 --- a/src/language_classifier_test.c +++ b/src/language_classifier_test.c @@ -34,7 +34,7 @@ double test_accuracy(char *filename) { continue; } - language_classifier_response_t *response = classify_languages(address); + libpostal_language_classifier_response_t *response = classify_languages(address); if (response == NULL || response->num_languages == 0) { printf("%s\tNULL\t%s\n", language, address); continue; diff --git a/src/libpostal.c b/src/libpostal.c index 066a3015..e0234c32 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -119,7 +119,7 @@ 
char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { - language_classifier_response_t *lang_response = place_languages(num_components, labels, values); + libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values); if (lang_response == NULL) { *num_languages = 0; return NULL; @@ -296,20 +296,22 @@ bool libpostal_setup_datadir(char *datadir) { numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE); address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE); } + + bool setup_succeed = true; if (!transliteration_module_setup(transliteration_path)) { log_error("Error loading transliteration module, dir=%s\n", transliteration_path); - return false; + setup_succeed = false; } - if (!numex_module_setup(numex_path)) { + if (setup_succeed && !numex_module_setup(numex_path)) { log_error("Error loading numex module, dir=%s\n", numex_path); - return false; + setup_succeed = false; } - if (!address_dictionary_module_setup(address_dictionary_path)) { + if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) { log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path); - return false; + setup_succeed = false; } if (transliteration_path != NULL) { @@ -324,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) { free(address_dictionary_path); } - return true; + return setup_succeed; } bool libpostal_setup(void) { diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 8c18270f..a749a623 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -36,7 +36,7 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download" -if [ $DATAMODEL == "senzing" ]; then +if [ "$DATAMODEL" = "senzing" ]; 
then LIBPOSTAL_DATA_FILE_CHUNKS=1 LIBPOSTAL_PARSER_MODEL_CHUNKS=1 LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1 diff --git a/src/matrix.h b/src/matrix.h index f6a31f1a..05b43db2 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -33,7 +33,7 @@ typedef enum { } name##_t; \ \ static name##_t *name##_new(size_t m, size_t n) { \ - name##_t *matrix = malloc(sizeof(name##_t)); \ + name##_t *matrix = malloc(sizeof(name##_t)); \ \ if (matrix == NULL) { \ return NULL; \ @@ -62,7 +62,7 @@ typedef enum { matrix->m = m; \ matrix->n = n; \ \ - matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \ + matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \ if (matrix->values == NULL) { \ free(matrix); \ return NULL; \ @@ -86,7 +86,7 @@ typedef enum { if (self == NULL) return; \ \ if (self->values != NULL) { \ - _aligned_free(self->values); \ + aligned_free(self->values); \ } \ \ free(self); \ @@ -118,7 +118,7 @@ typedef enum { if (self == NULL) return false; \ \ if (m * n > (self->m * self->n)) { \ - type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \ + type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \ if (ptr == NULL) { \ return false; \ } \ diff --git a/src/near_dupe.c b/src/near_dupe.c index 06a89ac4..f28b2140 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -670,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - language_classifier_response_t *lang_response = NULL; + libpostal_language_classifier_response_t *lang_response = NULL; if (num_languages == 0) { lang_response = place_languages(num_components, labels, values); diff --git a/src/numex.c b/src/numex.c index 6edaca1a..c20efd34 100644 --- a/src/numex.c +++ b/src/numex.c @@ -5,6 +5,15 @@ #include "log/log.h" +#ifdef HAVE_CONFIG_H +#include +#endif + +#ifndef HAVE_STRNDUP +#include "strndup.h" 
+#endif + + #define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB #define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n" diff --git a/src/place.c b/src/place.c index 549f1f48..f5f05037 100644 --- a/src/place.c +++ b/src/place.c @@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) { ); } -language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { +libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { if (num_components == 0 || values == NULL || labels == NULL) return NULL; - language_classifier_response_t *lang_response = NULL; + libpostal_language_classifier_response_t *lang_response = NULL; char *label; char *value; diff --git a/src/place.h b/src/place.h index 88920582..22d28b62 100644 --- a/src/place.h +++ b/src/place.h @@ -32,7 +32,7 @@ typedef struct place { char *website; } place_t; -language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); +libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); place_t *place_new(void); @@ -40,4 +40,4 @@ place_t *place_from_components(size_t num_components, char **labels, char **valu void place_destroy(place_t *place); -#endif \ No newline at end of file +#endif diff --git a/src/sparse_matrix_utils.c b/src/sparse_matrix_utils.c index 53fcaf97..70be9d14 100644 --- a/src/sparse_matrix_utils.c +++ b/src/sparse_matrix_utils.c @@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas } uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) { - khash_t(int_set) *unique_columns = kh_init(int_set); + khash_t(int_uint32) *unique_columns = kh_init(int_uint32); uint32_array *ret = uint32_array_new(); if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) { - kh_destroy(int_set, unique_columns); + 
kh_destroy(int_uint32, unique_columns); return ret; } - kh_destroy(int_set, unique_columns); + kh_destroy(int_uint32, unique_columns); uint32_array_destroy(ret); return NULL; } diff --git a/src/vector.h b/src/vector.h index 78a0fad4..52b8b8d0 100644 --- a/src/vector.h +++ b/src/vector.h @@ -7,43 +7,44 @@ #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) #include +static inline void *aligned_malloc(size_t size, size_t alignment) { + return _aligned_malloc(size, alignment); +} +static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) { + return _aligned_realloc(p, new_size, alignment); +} +static inline void aligned_free(void *p) { + _aligned_free(p); +} #else #include -static inline void *_aligned_malloc(size_t size, size_t alignment) +static inline void *aligned_malloc(size_t size, size_t alignment) { void *p; int ret = posix_memalign(&p, alignment, size); return (ret == 0) ? p : NULL; } -static inline void *_aligned_realloc(void *p, size_t size, size_t alignment) +static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) { if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) { return NULL; } - if (size == 0) { + if (p == NULL) { return NULL; } - void *rp = realloc(p, size); - - /* If realloc result is not already at an aligned boundary, - _aligned_malloc a new block and copy the contents of the realloc'd - pointer to the aligned block, free the realloc'd pointer and return - the aligned pointer. 
- */ - if ( ((size_t)rp & (alignment - 1)) != 0) { - void *p1 = _aligned_malloc(size, alignment); - if (p1 != NULL) { - memcpy(p1, rp, size); - } - free(rp); - rp = p1; + void *p1 = aligned_malloc(new_size, alignment); + if (p1 == NULL) { + /* leave p untouched on failure (realloc semantics): callers keep using and later free it */ + return NULL; } - return rp; + memcpy(p1, p, old_size); + free(p); + return p1; } -static inline void _aligned_free(void *p) +static inline void aligned_free(void *p) { free(p); } @@ -79,7 +80,7 @@ static inline void _aligned_free(void *p) name *array = malloc(sizeof(name)); \ if (array == NULL) return NULL; \ array->n = array->m = 0; \ - array->a = _aligned_malloc(size * sizeof(type), alignment); \ + array->a = aligned_malloc(size * sizeof(type), alignment); \ - if (array->a == NULL) return NULL; \ + if (array->a == NULL) { free(array); return NULL; } \ array->m = size; \ return array; \ @@ -94,7 +95,7 @@ static inline void _aligned_free(void *p) } \ static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \ if (size <= array->m) return true; \ - type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \ + type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \ if (ptr == NULL) return false; \ array->a = ptr; \ array->m = size; \ @@ -160,7 +161,7 @@ static inline void _aligned_free(void *p) } \ static inline void name##_destroy_aligned(name *array) { \ if (array == NULL) return; \ - if (array->a != NULL) _aligned_free(array->a); \ + if (array->a != NULL) aligned_free(array->a); \ free(array); \ } @@ -182,7 +183,7 @@ static inline void _aligned_free(void *p) free_func(array->a[i]); \ } \ } \ - _aligned_free(array->a); \ + aligned_free(array->a); \ free(array); \ } diff --git a/src/vector_math.h b/src/vector_math.h index 7dbdb049..eff90466 100644 --- a/src/vector_math.h +++ b/src/vector_math.h @@ -8,10 +8,8 @@ #define ks_lt_index(a, b) ((a).value < (b).value) -#if defined(INTEL_SSE) +#if defined(USE_SSE) #include -#elif defined(ARM_NEON) -#include "sse2neon.h" #endif /* @@ -340,7 +338,7 @@
-#if defined(INTEL_SSE) || defined(ARM_NEON) +#if defined(USE_SSE) /* From https://github.com/herumi/fmath/blob/master/fastexp.cpp diff --git a/test/Makefile.am b/test/Makefile.am index 5289e3c2..f2e911f2 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -5,7 +5,7 @@ CFLAGS_O2 = $(CFLAGS_BASE) -O2 CFLAGS_O3 = $(CFLAGS_BASE) -O3 DEFAULT_INCLUDES = -I.. -I/usr/local/include -CFLAGS = $(SIMDFLAGS) $(CFLAGS_BASE) +CFLAGS = $(CFLAGS_BASE) TESTS = test_libpostal noinst_PROGRAMS = test_libpostal diff --git a/windows/configure.ac b/windows/configure.ac index d19cd967..24e73fec 100644 --- a/windows/configure.ac +++ b/windows/configure.ac @@ -73,57 +73,20 @@ AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf availabl AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])]) # ------------------------------------------------------------------ -# Architecture-specific options +# Checks for SSE2 build # ------------------------------------------------------------------ -# allow enabling hardware optimization on any system: -case "$host_cpu" in - arm*|aarch64*) - enable_arm_neon=yes - enable_intel_sse=no - AC_DEFINE([ARM_NEON], [1], - [Enable ARM_NEON optimizations]) - ;; - i?86|x86_64) - enable_intel_sse=yes - enable_arm_neon=no - AC_DEFINE([INTEL_SSE], [1], - [Enable Intel SSE optimizations]) - ;; -esac - -AC_ARG_ENABLE([neon], - AS_HELP_STRING([[[--disable-neon]]], - [Disable ARM NEON hardware optimizations]), - [ - enable_arm_neon=no - AC_DEFINE([ARM_NEON], [0], - [Disable ARM_NEON optimizations]) - ]) - AC_ARG_ENABLE([sse2], - AS_HELP_STRING([[[--disable-sse2]]], - [Disable Intel SSE2 hardware optimizations]), - [ - enable_intel_sse=no - AC_DEFINE([INTEL_SSE], [0], - [Disable INTEL_SSE optimizations]) - ]) + AS_HELP_STRING( + [--disable-sse2], + [disable SSE2 optimization routines] + ) + ) -SIMDFLAGS="" - -AS_IF([test "x$enable_intel_sse" != "xno"], [ - SIMDFLAGS="-mfpmath=sse -msse2 -DINTEL_SSE" +AS_IF([test 
"x$enable_sse2" != "xno"], [ + CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" ]) -AS_IF([test "x$enable_arm_neon" != "xno"], [ - SIMDFLAGS="-march=armv8-a+fp+simd+crypto+crc -DARM_NEON" -]) - -CFLAGS="${SIMDFLAGS} ${CFLAGS}" - -AC_SUBST([SIMDFLAGS], [$SIMDFLAGS]) - AC_CHECK_HEADER(cblas.h, [AX_CBLAS]) AC_ARG_ENABLE([data-download],