Merge branch 'master' into slightly_update_mac_install_instructions

This commit is contained in:
Al B
2025-02-08 12:26:17 -05:00
committed by GitHub
62 changed files with 9322 additions and 273 deletions

36
.github/workflows/test.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
# CI workflow: build libpostal from source and run the test suite on
# Linux and macOS for every push/PR against master.
name: Test

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

jobs:
  build_and_test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - name: Install Dependencies Linux
        # Use runner.os consistently for both platform guards.
        if: runner.os == 'Linux'
        run: |
          sudo apt-get update -y
          # -y: non-interactive install; without it apt-get prompts and
          # aborts on a CI runner that has no tty.
          sudo apt-get install -y curl autoconf automake libtool pkg-config
      - name: Install Dependencies MacOS
        if: runner.os == 'macOS'
        run: |
          brew update
          brew install curl autoconf automake libtool pkg-config
      - name: Build
        env:
          # Must use the Actions expression syntax: a literal
          # ${GITHUB_WORKSPACE} in an env: value is never expanded, and
          # configure --datadir would receive the unexpanded string.
          LIBPOSTAL_DATA_DIR: ${{ github.workspace }}/data
        run: |
          ./bootstrap.sh
          ./configure --datadir=$LIBPOSTAL_DATA_DIR
          make
      - name: Test
        run: make check

View File

@@ -1,83 +0,0 @@
language: c
branches:
only:
- master
env:
global:
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
- secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
- secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
- secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
- GH_REF=github.com/openvenues/libpostal
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
- TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER
- SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz
- LIBPOSTAL_DATA_DIR=$(pwd)/data
- LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz
compiler:
- clang
- gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- pkg-config
before_script:
- ./bootstrap.sh
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
install:
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
script:
- ./configure --datadir=$LIBPOSTAL_DATA_DIR
- make -j4
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
- make check
after_success:
- |
if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then
if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
export PATH=$PATH:env/bin/;
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
cp src/*_data.c _travis/src
echo "$TAG_VERSION" > _travis/versions/base_data
cd _travis
git config user.name "$GIT_COMMITTER_NAME";
git config user.email "$GIT_COMMITTER_EMAIL";
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
git push --quiet origin master;
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS
fi
git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER";
git push --tags --quiet origin master;
fi;
before_deploy:
- make dist
deploy:
- provider: releases
file:
- "$SRC_TARBALL_FILENAME"
on:
tags: true
branch: master
skip_cleanup: true
- provider: releases
file:
- "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME"
on:
tags: true
branch: master
condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )"
skip_cleanup: true

View File

@@ -1,6 +1,6 @@
## Submitting Issues
When submitting issues to libpostal, please repeect these guildelines:
When submitting issues to libpostal, please respect these guidelines:
- Be constructive. Try to help solve the problem.
- Always search for existing issues before submitting one.

View File

@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.
---
#### Here's how I'm using libpostal
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? -->
<!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->
---
#### Here's what I did

View File

@@ -1,6 +1,6 @@
# libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal)
[![Build Status](https://github.com/openvenues/libpostal/actions/workflows/test.yml/badge.svg)](https://github.com/openvenues/libpostal/actions)
[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:
**On Ubuntu/Debian**
```
sudo apt-get install curl autoconf automake libtool pkg-config
sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
```
**On CentOS/RHEL**
@@ -106,13 +106,23 @@ sudo apt-get install curl autoconf automake libtool pkg-config
sudo yum install curl autoconf automake libtool pkgconfig
```
**On Mac OSX**
**On macOS**
Install with one command via [MacPorts](https://www.macports.org/):
```
port install libpostal
```
Or as follows with [Homebrew](https://brew.sh/):
```
brew install curl autoconf automake libtool pkg-config
```
Then to install the C library:
If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.
```
git clone https://github.com/openvenues/libpostal
cd libpostal
@@ -123,9 +133,19 @@ make distclean
./bootstrap.sh
# omit --datadir flag to install data in current directory
./configure --datadir=[...some dir with a few GB of space...]
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
make -j4
# For Intel/AMD processors and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
# For Apple / ARM cpus and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] --disable-sse2
# For the improved Senzing model:
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] MODEL=senzing
make -j8
sudo make install
# On Linux it's probably a good idea to run
@@ -182,6 +202,24 @@ If you require a .lib import library to link this to your application. You can g
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
```
Installation with an alternative data model
-------------------------------------------
An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling.
To enable this add `MODEL=senzing` to the configure line during installation:
```
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
```
The data for this model comes from [OpenAddresses](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records) — a total of about 1.2 billion records from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddresses is good but not perfect, so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses, whenever these conditions were encountered.
Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, minimally 50 per country. Hard-to-parse addresses were obtained from the Senzing support team and customers, and from the libpostal GitHub page, and added to this set. The Senzing model achieved 4.3% better parsing results than the default model on this test set.
The size of this model is about 2.2GB compared to 1.8GB for the default model, so keep that in mind if storage space is important.
Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data
Examples of parsing
-------------------
@@ -389,23 +427,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
- Rust: [rust-postal](https://github.com/pnordahl/rust-postal)
- Rust: [rustpostal](https://crates.io/crates/rustpostal)
**Database extensions**
**Unofficial database extensions**
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
**Unofficial REST API**
**Unofficial servers**
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)
**Libpostal REST Docker**
- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
**Libpostal ZeroMQ Docker**
- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
- Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)
Tests
@@ -480,7 +514,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent.
for a wide variety of countries and languages, not just US/English.
The model is trained on over 1 billion addresses and address-like strings, using the
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
are performed to make the training data resemble real messy geocoder input as closely as possible.
- **Language classification**: multinomial logistic regression
@@ -502,7 +536,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.
- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
languages chracter by character instead of on whitespace.
languages character by character instead of on whitespace.
- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
@@ -526,6 +560,7 @@ Non-goals
- Verifying that a location is a valid address
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
- Extracting addresses from free text
Raison d'être
-------------
@@ -631,7 +666,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy
- Efficient existing implementations for simple things like hashtables
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
- Data structrues take advantage of sparsity as much as possible
- Data structures take advantage of sparsity as much as possible
- Efficient double-array trie implementation for most string dictionaries
- Cross-platform as much as possible, particularly for *nix

View File

@@ -60,6 +60,17 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
# Senzing data
AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])
SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)
AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])
AC_CONFIG_FILES([Makefile
libpostal.pc
src/Makefile
@@ -75,6 +86,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],
@@ -82,7 +94,7 @@ AC_ARG_ENABLE([sse2],
)
)
AS_IF([test "x$enable_sse2" != "xno"], [
AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])
@@ -96,6 +108,9 @@ AC_ARG_ENABLE([data-download],
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
esac], [DOWNLOAD_DATA=true])
AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
AS_VAR_IF([MODEL], [], [],
[AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])

View File

@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
[], [-lblas])])
fi
# BLAS in OpenBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
fi
# Generic CBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
fi
# Generic BLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
fi
AC_SUBST(CBLAS_LIBS)
LIBS="$ax_cblas_save_LIBS"

View File

@@ -63,10 +63,23 @@ numbers:
house_numbers:
gebaude: &gebaude
canonical: gebäude
abbreviated: geb
sample: true
canonical_probability: 0.5
abbreviated_probability: 0.5
sample_probability: 0.05
numeric:
direction: left
alphanumeric:
default: *nummer
probability: 0.95
alternatives:
- alternative: *gebaude
probability: 0.05
alphanumeric_phrase_probability: 0.0001
alphanumeric_phrase_probability: 0.05
conscription_numbers:
alphanumeric:

View File

@@ -49,7 +49,7 @@ numbers:
house_numbers:
budnyok: &budnyok
budynok: &budynok
canonical: будинок
abbreviated: буд
sample: true
@@ -58,8 +58,8 @@ house_numbers:
sample_probability: 0.1
numeric:
direction: left
budnyok_latin: &budnyok_latin
canonical: budnyok
budynok_latin: &budynok_latin
canonical: budynok
abbreviated: bud
sample: true
canonical_probability: 0.6
@@ -88,10 +88,10 @@ house_numbers:
direction: left
alphanumeric:
default: *budnyok
default: *budynok
probability: 0.65
alternatives:
- alternative: *budnyok_latin
- alternative: *budynok_latin
probability: 0.05
- alternative: *dom
probability: 0.25

View File

@@ -11,8 +11,9 @@
overrides:
id:
# Buenos Aires (state boundary coterminous with city)
"3082668": null
relation:
# Buenos Aires (state boundary coterminous with city)
"3082668": null
contained_by:
relation:
# Buenos Aires

View File

@@ -10,10 +10,10 @@
"8": "city"
"9": "suburb"
overrides:
id:
relation:
# Taiwan Province
"3777248": "state"
# Fujian Province
"3777250": "state"
overrides:
id:
relation:
# Taiwan Province
"3777248": "state"
# Fujian Province
"3777250": "state"

View File

@@ -132,6 +132,7 @@ falls|fls
fare
farm|frm
farms|frms
farm to market|fm|farm-to-market
fern
ferry|fry|fy
field|fld|fd
@@ -407,4 +408,4 @@ well|wl
wells|wls
wharf|whrf|whf
wynd|wyn
yard|yd|yrd
yard|yd|yrd

View File

@@ -64,7 +64,7 @@ rhode island|ri
saskatchewan|sk
south carolina|sc
south dakota|sd
southern australia|sa
south australia|sa
tasmania|tas
tennessee|tn
texas|tx

View File

@@ -3,4 +3,4 @@ d
e
k
n
u
u

View File

@@ -1,2 +1,2 @@
félemelet|felemelet
magasföldszint|magasfoldszint
félemelet|felemelet|félem|1/2 em|1/2em
magasföldszint|magasfoldszint|mgfszt|mgfsz|mfszt|mfsz

View File

@@ -1 +1,2 @@
szent|szt
szent|szt
idősebb|id

View File

@@ -1,21 +1,34 @@
árok|arok
dűlő|dulo
dűlő|dulo|d.|d
fasor
fasora
főközlekedési út|főút|fout
határút|hatarut
kapu
kert
körönd|korond|krnd
körvasútsor|korvasutsor
körút|korut|krt
köz|koz
lakótelep|lakotelep|ltp.|ltp
lejtő|lejto
lépcső|lepcso
liget
mező|mezo
országút|orszagut
park
rakpart|rpt
sétány|setany
sor
sugárút|sugarut
parkja
rakpart|rkpt|rkp|rpt
sétány|setany|stny.|stny
sor|s.|s
sétány|setany|sét
sugárút|sugarut|sgrt.|sgrt|srt.|srt|sgt.|sgt
sziget
telep
tér|ter
tere
utca|u
út|ut
útja|utja
tanya|t.|t
udvar
utca|u.|u
út|ut|u.|u
útja|utja

View File

@@ -1,12 +1,18 @@
aleja|al
autostrada
boczna
bulwar
bulwar|bulw
droga
obwodnica
ogród
osiedle|os
park
plac|pl
rondo
rynek
skwer
szosa
ulica|ul
zaulek
wybrzeże|wyb
wyspa
zaulek

View File

@@ -10,10 +10,10 @@ calçada|calcada|cc
calçadinha|caclcadinha|ccnh
câmara municipal|camara municipal|cm|c.m.|c. m.
caminho|cam|camno
direito|dto
direito|dto|dt
esquerdo|esq
estrada|estr
astrada marginal|estr marg
estrada marginal|estr marg
estrada municipal|em|e m|estr m
estrada nacional|en|e n|estr n
estrada regional|er|e r|estr r
@@ -50,4 +50,4 @@ viaduto|vd|vdto
viela|ve
vila|vl
volta
zona|zn
zona|zn

View File

@@ -0,0 +1,6 @@
anexa
bloc|blc|bl
casa
cladirea|cladire
complex
garaj

View File

@@ -0,0 +1,5 @@
banca
organizatie neguvernamentala|ong
societate comerciala|sc
societate cu raspundere limitata|srl
societate pe actiuni|sa

View File

@@ -1,5 +1,5 @@
&
colț|colt
colț|colt|colț cu|colt cu
între|intre
la colțul de pe|la coltul de pe
și|si
și|si

View File

@@ -1 +1 @@
intrare
intrare|intrarea

View File

@@ -4,4 +4,4 @@ din
in apropiere de
în apropiere|in apropiere
în jurul aici|in jurul aici
lângă mine|langa mine
lângă mine|langa mine|lângă|langa

View File

@@ -1 +1 @@
număr|numar|nr|nº|n°|#|№|no
număr|numar|nr|nº|n°|#|№|no|numarul|numărul

View File

@@ -8,7 +8,8 @@ general|gen
major|maj
locotenent
locotenent colonel
pictor
profesor|prof
sergent
sublocotenent
vice amiral
vice amiral

View File

@@ -0,0 +1,3 @@
cinema
cafenea
fabrica

View File

@@ -1 +1,7 @@
bloc|bl
bloc|bl
cartier|cartierul
comuna|comunā
kilometrul|kilometru|km
sat|satul
sector|sectorul|sect
zona

View File

@@ -1,2 +1,3 @@
și|si|&
cel
cel
intre

View File

@@ -1,13 +1,13 @@
aleea|ale|alea|al
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
calea|cal
drumul
calea|cale|cal
drumul|drum
fundătura|fundatura|fnd
fundacul|fdc
intrarea|int|intr
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
strada|str
strada|str|st
stradela|str-la|sdla
șoseaua|soseaua|sos|șos
splaiul|sp|spl
vârful|varful|virful|vîrful|varf|vf
vârful|varful|virful|vîrful|varf|vf

View File

@@ -0,0 +1 @@
decembrie|dec

View File

@@ -1,4 +1,8 @@
apartament|ap|apt|apart
apartamentul|apartament|ap|apt|apart
birou
cladire|cladirea|clădire|clădirea
corp|corpul
complex
interior|int
lotul
sală|sala
sală|sala

View File

@@ -1,2 +1,4 @@
вход
vkhod
vkhod
подъезд
pod'ezd

View File

@@ -6,3 +6,5 @@ kvartal|kvart|kv|kv-l
oblast|obl
район|р
raion|r-n
місто|міс|м
misto|mis|m

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import logging
import operator
import os
@@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex
@@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each
source has at least a "name" key which in practice is what we care about.
Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
'''
PRIORITIES_FILENAME = 'priorities.json'
@@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than Quattroshapes
'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon
'quattroshapes': 4, # Good results in some countries/areas
'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'wof': 4, # Replacement of Quattroshapes
}
level_priorities = {
@@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
]
@@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc
@classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
@@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
logger = logging.getLogger('neighborhoods')
qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir)
logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating Quattroshapes neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
logger.info('Creating WhosOnFirst neighborhoods')
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
@@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
char_scripts = get_chars_by_script()
for idx in (cth, qs, osmn):
for idx in (cth, wof, osmn):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
@@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
qs.matched = [False] * qs.i
wof.matched = [False] * wof.i
cth.matched = [False] * cth.i
logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0
for element_id, attrs, deps in parse_osm(filename):
try:
@@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (cth, qs):
for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates:
max_sim = 0.0
arg_max = None
normalized_qs_names = {}
normalized_wof_names = {}
for osm_name in osm_names:
@@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for i in candidates:
props = idx.get_properties(i)
name = normalized_qs_names.get(i)
name = normalized_wof_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_qs_names[i] = name
normalized_wof_names[i] = name
if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue
if not contains_ideographs:
@@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
continue
source = 'osm_cth'
else:
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
source = 'osm_quattro'
if level == 'neighborhood':
@@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))
for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
@@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['polygon_type'] = 'local_admin'
else:
continue
elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB
name = props.get('name')
if not name:
@@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return sorted(candidates, key=self.priority)
class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder):
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = None
NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
@classmethod
def create_neighborhoods_index(cls, quattroshapes_dir,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME)
neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename],
output_dir, index_filename=index_filename,
polys_filename=polys_filename)
def is_valid_neighbourhood(cls, geojson):
validity = not geojson["properties"].get(cls.SUPERSEDED)
for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
validity &= geojson["properties"].get(field)
return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
@classmethod
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
index = cls(save_dir=output_dir, index_filename=index_filename)
for root, dirnames, filenames in os.walk(wof_dir):
for fname in fnmatch.filter(filenames, "*.geojson"):
with open(os.path.join(root, fname)) as f:
geojson = json.load(f)
if cls.is_valid_neighbourhood(geojson):
properties = {
"name": safe_decode(geojson["properties"].get(cls.NAME)),
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
}
poly_type = geojson['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(geojson['properties']), include_only_properties=include_props)
elif poly_type == 'MultiPolygon':
polys = []
for coords in geojson['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)
multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(geojson['properties']))
return index
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-q', '--quattroshapes-dir',
help='Path to quattroshapes dir')
parser.add_argument('-w', '--wof-dir',
help='Path to WhosOnFirst dir')
parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
@@ -567,16 +607,16 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file,
args.quattroshapes_dir,
args.wof_dir,
args.country_rtree_dir,
args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir
)
else:
parser.error('Must specify quattroshapes dir or osm admin borders file')
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
index.save()

View File

@@ -226,7 +226,6 @@ class PolygonIndex(object):
@classmethod
def create_from_geojson_files(cls, inputs, output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
include_only_properties=None):
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
for input_file in inputs:

View File

@@ -0,0 +1,27 @@
import os
import pycountry
import subprocess
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"
def download_wof_data_admin(wof_dir):
for country_object in pycountry.countries:
repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
repo_location = os.path.join(wof_dir, repo_name)
if not os.path.exists(repo_location):
subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name])
if __name__ == '__main__':
if len(sys.argv) < 2:
sys.exit('Usage: python download_whosonfirst_data.py wof_dir')
download_wof_data_admin(sys.argv[1])

View File

@@ -42,7 +42,8 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
address_expansion_t expansion = expansions[i];
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
if (string_contains(canonical, " ")) {
bool is_possible_acronym = string_contains(canonical, " ") || (phrase.len == 1 && address_expansion_in_dictionary(expansion, DICTIONARY_DIRECTIONAL));
if (is_possible_acronym) {
for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
existing_acronyms[j] = 1;
}

View File

@@ -40,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
}
if (context->flag & CRF_CONTEXT_MARGINALS) {
#ifdef USE_SSE
context->exp_state = double_matrix_new_aligned(T, L, 16);
#if defined(USE_SSE)
context->exp_state = double_matrix_new_aligned(T, L, 32);
if (context->exp_state == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state);
#else
@@ -52,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state = double_matrix_new_zeros(T, L);
if (context->mexp_state == NULL) goto exit_context_created;
#ifdef USE_SSE
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
#if defined(USE_SSE)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
if (context->exp_state_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state_trans);
#else
@@ -64,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
if (context->mexp_state_trans == NULL) goto exit_context_created;
#ifdef USE_SSE
context->exp_trans = double_matrix_new_aligned(L, L, 16);
#if defined(USE_SSE)
context->exp_trans = double_matrix_new_aligned(L, L, 32);
if (context->exp_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_trans);
#else
@@ -130,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {
if (self->flag & CRF_CONTEXT_MARGINALS &&
(
#ifdef USE_SSE
!double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
#else
!double_matrix_resize(self->exp_state, T, L) ||
#endif
!double_matrix_resize(self->mexp_state, T, L) ||
#ifdef USE_SSE
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
#else
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
#endif
@@ -184,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state);
#else
double_matrix_destroy(self->exp_state);
@@ -200,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state_trans);
#else
double_matrix_destroy(self->exp_state_trans);
@@ -216,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_trans);
#else
double_matrix_destroy(self->exp_trans);

View File

@@ -15,6 +15,14 @@
#include "token_types.h"
#include "transliterate.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define DEFAULT_KEY_LEN 32
@@ -1567,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
size_t len = strlen(input);
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
if (options.num_languages == 0) {
lang_response = classify_languages(input);
@@ -1627,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
kh_destroy(str_set, unique_strings);
if (lang_response != NULL) {
language_classifier_response_destroy(lang_response);
libpostal_language_classifier_response_destroy(lang_response);
}
char_array_destroy(temp_string);

View File

@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
}
inline uint32_t file_deserialize_uint32(unsigned char *buf) {
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
}
bool file_read_uint32(FILE *file, uint32_t *value) {
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {
inline uint16_t file_deserialize_uint16(unsigned char *buf) {
return (buf[0] << 8) | buf[1];
return ((uint16_t)buf[0] << 8) | buf[1];
}

View File

@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
return language_classifier;
}
void language_classifier_response_destroy(language_classifier_response_t *self) {
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
if (self == NULL) return;
if (self->languages != NULL) {
free(self->languages);
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
free(self);
}
language_classifier_response_t *classify_languages(char *address) {
libpostal_language_classifier_response_t *classify_languages(char *address) {
language_classifier_t *classifier = get_language_classifier();
if (classifier == NULL) {
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
size_t n = classifier->num_labels;
double_matrix_t *p_y = double_matrix_new_zeros(1, n);
language_classifier_response_t *response = NULL;
libpostal_language_classifier_response_t *response = NULL;
bool model_exp = false;
if (classifier->weights_type == MATRIX_DENSE) {
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {
free(indices);
response = malloc(sizeof(language_classifier_response_t));
response = malloc(sizeof(libpostal_language_classifier_response_t));
response->num_languages = num_languages;
response->languages = languages;
response->probs = probs;

View File

@@ -6,6 +6,8 @@
#include <stdint.h>
#include <stdbool.h>
#include "libpostal.h"
#include "collections.h"
#include "language_features.h"
#include "logistic_regression.h"
@@ -29,21 +31,14 @@ typedef struct language_classifier {
} weights;
} language_classifier_t;
typedef struct language_classifier_response {
size_t num_languages;
char **languages;
double *probs;
} language_classifier_response_t;
// General usage
language_classifier_t *language_classifier_new(void);
language_classifier_t *get_language_classifier(void);
language_classifier_t *get_language_classifier_country(void);
language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(language_classifier_response_t *self);
libpostal_language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
void language_classifier_destroy(language_classifier_t *self);
@@ -58,4 +53,4 @@ bool language_classifier_module_setup(char *dir);
void language_classifier_module_teardown(void);
#endif
#endif

View File

@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
}
language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL) {
printf("Could not classify language\n");
exit(EXIT_FAILURE);

View File

@@ -34,7 +34,7 @@ double test_accuracy(char *filename) {
continue;
}
language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL || response->num_languages == 0) {
printf("%s\tNULL\t%s\n", language, address);
continue;

View File

@@ -85,6 +85,17 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(
return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
}
char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes) {
cstring_array *strings = name_word_hashes(name, normalize_options);
if (strings == NULL) {
*num_hashes = 0;
return NULL;
}
*num_hashes = cstring_array_num_strings(strings);
return cstring_array_to_strings(strings);
}
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
cstring_array *strings = near_dupe_hashes(num_components, labels, values, options);
if (strings == NULL) {
@@ -108,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels
char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
if (lang_response == NULL) {
*num_languages = 0;
return NULL;
@@ -285,20 +296,22 @@ bool libpostal_setup_datadir(char *datadir) {
numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
}
bool setup_succeed = true;
if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false;
setup_succeed = false;
}
if (!numex_module_setup(numex_path)) {
if (setup_succeed && !numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path);
return false;
setup_succeed = false;
}
if (!address_dictionary_module_setup(address_dictionary_path)) {
if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false;
setup_succeed = false;
}
if (transliteration_path != NULL) {
@@ -313,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
free(address_dictionary_path);
}
return true;
return setup_succeed;
}
bool libpostal_setup(void) {

View File

@@ -204,8 +204,8 @@ typedef struct libpostal_near_dupe_hash_options {
bool address_only_keys;
} libpostal_near_dupe_hash_options_t;
LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);

View File

@@ -14,6 +14,8 @@ LIBPOSTAL_DATA_DIR=$3
MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB))
DATAMODEL="@MODEL@"
# Not loving this approach but there appears to be no way to query the size
# of a release asset without using the Github API
LIBPOSTAL_DATA_FILE_CHUNKS=1
@@ -34,6 +36,20 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
if [ "$DATAMODEL" = "senzing" ]; then
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"
LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
fi
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

View File

@@ -6,7 +6,9 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "collections.h"
#include "file_utils.h"
@@ -31,7 +33,7 @@ typedef enum {
} name##_t; \
\
static name##_t *name##_new(size_t m, size_t n) { \
name##_t *matrix = malloc(sizeof(name##_t)); \
name##_t *matrix = malloc(sizeof(name##_t)); \
\
if (matrix == NULL) { \
return NULL; \
@@ -60,7 +62,7 @@ typedef enum {
matrix->m = m; \
matrix->n = n; \
\
matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \
matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
if (matrix->values == NULL) { \
free(matrix); \
return NULL; \
@@ -84,7 +86,7 @@ typedef enum {
if (self == NULL) return; \
\
if (self->values != NULL) { \
_aligned_free(self->values); \
aligned_free(self->values); \
} \
\
free(self); \
@@ -116,7 +118,7 @@ typedef enum {
if (self == NULL) return false; \
\
if (m * n > (self->m * self->n)) { \
type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \
type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
if (ptr == NULL) { \
return false; \
} \

View File

@@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
log_debug("token_str = %s\n", token_str);
add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams);
add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
// For non-Latin words (Arabic, Cyrllic, etc.) just add the word
// For ideograms, we do two-character shingles, so only add the first character if the string has one token
} else if (!ideogram || j > 0 || num_tokens == 1) {
@@ -669,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
if (num_languages == 0) {
lang_response = place_languages(num_components, labels, values);

View File

@@ -8,6 +8,7 @@
#include "libpostal.h"
#include "string_utils.h"
cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options);
cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options);
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages);

View File

@@ -5,6 +5,15 @@
#include "log/log.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"

View File

@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
);
}
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
if (num_components == 0 || values == NULL || labels == NULL) return NULL;
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
char *label;
char *value;

View File

@@ -32,7 +32,7 @@ typedef struct place {
char *website;
} place_t;
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
place_t *place_new(void);
@@ -40,4 +40,4 @@ place_t *place_from_components(size_t num_components, char **labels, char **valu
void place_destroy(place_t *place);
#endif
#endif

View File

@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
}
uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
khash_t(int_set) *unique_columns = kh_init(int_set);
khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
uint32_array *ret = uint32_array_new();
if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
return ret;
}
kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
uint32_array_destroy(ret);
return NULL;
}

8853
src/sse2neon.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -7,43 +7,44 @@
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#include <malloc.h>
static inline void *aligned_malloc(size_t size, size_t alignment) {
return _aligned_malloc(size, alignment);
}
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
return _aligned_realloc(p, new_size, alignment);
}
static inline void aligned_free(void *p) {
_aligned_free(p);
}
#else
#include <stdlib.h>
static inline void *_aligned_malloc(size_t size, size_t alignment)
static inline void *aligned_malloc(size_t size, size_t alignment)
{
void *p;
int ret = posix_memalign(&p, alignment, size);
return (ret == 0) ? p : NULL;
}
static inline void *_aligned_realloc(void *p, size_t size, size_t alignment)
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
{
if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
return NULL;
}
if (size == 0) {
if (p == NULL) {
return NULL;
}
void *rp = realloc(p, size);
/* If realloc result is not already at an aligned boundary,
_aligned_malloc a new block and copy the contents of the realloc'd
pointer to the aligned block, free the realloc'd pointer and return
the aligned pointer.
*/
if ( ((size_t)rp & (alignment - 1)) != 0) {
void *p1 = _aligned_malloc(size, alignment);
if (p1 != NULL) {
memcpy(p1, rp, size);
}
free(rp);
rp = p1;
void *p1 = aligned_malloc(new_size, alignment);
if (p1 == NULL) {
free(p);
return NULL;
}
return rp;
memcpy(p1, p, old_size);
free(p);
return p1;
}
static inline void _aligned_free(void *p)
static inline void aligned_free(void *p)
{
free(p);
}
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
name *array = malloc(sizeof(name)); \
if (array == NULL) return NULL; \
array->n = array->m = 0; \
array->a = _aligned_malloc(size * sizeof(type), alignment); \
array->a = aligned_malloc(size * sizeof(type), alignment); \
if (array->a == NULL) return NULL; \
array->m = size; \
return array; \
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
} \
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
if (size <= array->m) return true; \
type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \
type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
if (ptr == NULL) return false; \
array->a = ptr; \
array->m = size; \
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
} \
static inline void name##_destroy_aligned(name *array) { \
if (array == NULL) return; \
if (array->a != NULL) _aligned_free(array->a); \
if (array->a != NULL) aligned_free(array->a); \
free(array); \
}
@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
free_func(array->a[i]); \
} \
} \
_aligned_free(array->a); \
aligned_free(array->a); \
free(array); \
}

View File

@@ -8,7 +8,7 @@
#define ks_lt_index(a, b) ((a).value < (b).value)
#ifdef USE_SSE
#if defined(USE_SSE)
#include <emmintrin.h>
#endif
@@ -338,7 +338,7 @@
#ifdef USE_SSE
#if defined(USE_SSE)
/*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp
@@ -524,4 +524,4 @@ static inline void remez9_0_log2_sse(double *values, size_t num)
#endif
#endif

View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -0,0 +1 @@
v1.0.0

1
versions/senzing/parser Normal file
View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -75,6 +75,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],