Merge branch 'master' into rust-bindings-mention

This commit is contained in:
Al B
2025-02-08 12:13:03 -05:00
committed by GitHub
61 changed files with 9249 additions and 238 deletions

36
.github/workflows/test.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
# CI workflow: builds libpostal with autotools and runs `make check`
# on both Linux and macOS runners.
name: Test
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
  # Allows the workflow to be started manually.
  workflow_dispatch:
jobs:
  build_and_test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: Install Dependencies Linux
        # Same `runner.os` test as the macOS step below, so the two
        # conditions stay symmetric if the matrix labels change.
        if: runner.os == 'Linux'
        run: |
          sudo apt-get update -y
          # -y: the runner has no TTY, so a confirmation prompt would abort.
          sudo apt-get install -y curl autoconf automake libtool pkg-config
      - name: Install Dependencies MacOS
        if: runner.os == 'macOS'
        run: |
          brew update
          brew install curl autoconf automake libtool pkg-config
      - name: Build
        env:
          # `env:` values are not shell-expanded, so `${GITHUB_WORKSPACE}`
          # would be stored literally; use the expression context instead.
          LIBPOSTAL_DATA_DIR: ${{ github.workspace }}/data
        run: |
          ./bootstrap.sh
          ./configure --datadir="$LIBPOSTAL_DATA_DIR"
          make
      - name: Test
        run: make check

View File

@@ -1,83 +0,0 @@
# Legacy Travis CI configuration (removed by this commit).
# Project language as declared to the CI service.
language: c
# Only the master branch is listed for building.
branches:
only:
- master
# Environment shared by every job in this configuration.
env:
global:
# `secure` entries are encrypted values; their plaintext names and contents
# are not visible in this file.
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
- secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
- secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
- secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
# Repository host/path; combined with GH_TOKEN for the clone in after_success.
- GH_REF=github.com/openvenues/libpostal
# Each *_CHANGED value is the number of files in the commit range whose path
# matches the grep pattern; later steps compare it against 0.
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
# Tag name: contents of ./versions/base followed by the CI build number.
- TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER
# Name of the source tarball referenced by the first deploy entry.
- SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz
# Data directory passed to ./configure --datadir and used for the data tarball.
- LIBPOSTAL_DATA_DIR=$(pwd)/data
- LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz
# Compilers listed for the build.
compiler:
- clang
- gcc
# Additional apt source and packages requested for the build machine.
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- pkg-config
before_script:
- ./bootstrap.sh
# If dictionary or numex inputs changed: clone virtualenv, create ./env with it,
# and install scripts/requirements-simple.txt into that environment.
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
# Run the numex script only when numex inputs changed.
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
# Run the address-dictionaries script only when dictionary inputs changed.
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
install:
# When CC is gcc, switch it to gcc-4.8 (the package listed under addons).
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
script:
- ./configure --datadir=$LIBPOSTAL_DATA_DIR
- make -j4
# Run each data-builder binary only when its corresponding inputs changed.
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
- make check
after_success:
# Runs only when CC is gcc, the build is not a pull request, and the branch is
# master. If any *_CHANGED counter is non-zero, it clones the repository using
# GH_TOKEN, copies src/*_data.c into the clone, writes TAG_VERSION to
# versions/base_data, commits and pushes to master, and tars the data dir.
# In either case it then creates an annotated tag TAG_VERSION and pushes tags.
# NOTE(review): GH_TOKEN, GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL and
# BASIC_MODULE_DIRS are not assigned in the visible config; presumably they
# come from the encrypted env entries. Confirm.
- |
if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then
if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
export PATH=$PATH:env/bin/;
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
cp src/*_data.c _travis/src
echo "$TAG_VERSION" > _travis/versions/base_data
cd _travis
git config user.name "$GIT_COMMITTER_NAME";
git config user.email "$GIT_COMMITTER_EMAIL";
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
git push --quiet origin master;
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS
fi
git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER";
git push --tags --quiet origin master;
fi;
before_deploy:
# Presumably produces the source tarball named by SRC_TARBALL_FILENAME; verify.
- make dist
deploy:
# First release upload: the source tarball, for tagged builds on master.
- provider: releases
file:
- "$SRC_TARBALL_FILENAME"
on:
tags: true
branch: master
skip_cleanup: true
# Second release upload: the data tarball, additionally gated on CC being gcc
# and at least one *_CHANGED counter being non-zero.
- provider: releases
file:
- "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME"
on:
tags: true
branch: master
condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )"
skip_cleanup: true

View File

@@ -1,6 +1,6 @@
## Submitting Issues ## Submitting Issues
When submitting issues to libpostal, please repeect these guildelines: When submitting issues to libpostal, please respect these guidelines:
- Be constructive. Try to help solve the problem. - Be constructive. Try to help solve the problem.
- Always search for existing issues before submitting one. - Always search for existing issues before submitting one.

View File

@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.
--- ---
#### Here's how I'm using libpostal #### Here's how I'm using libpostal
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? --> <!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->
--- ---
#### Here's what I did #### Here's what I did

View File

@@ -1,6 +1,6 @@
# libpostal: international street address NLP # libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![Build Status](https://github.com/openvenues/libpostal/actions/workflows/test.yml/badge.svg)](https://github.com/openvenues/libpostal/actions)
[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master) [![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors) [![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:
**On Ubuntu/Debian** **On Ubuntu/Debian**
``` ```
sudo apt-get install curl autoconf automake libtool pkg-config sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
``` ```
**On CentOS/RHEL** **On CentOS/RHEL**
@@ -113,12 +113,26 @@ brew install curl autoconf automake libtool pkg-config
Then to install the C library: Then to install the C library:
If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.
``` ```
git clone https://github.com/openvenues/libpostal git clone https://github.com/openvenues/libpostal
cd libpostal cd libpostal
./bootstrap.sh ./bootstrap.sh
./configure --datadir=[...some dir with a few GB of space...] ./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
make -j4 make -j4
# For Intel/AMD processors and the default model
./configure --datadir=[...some dir with a few GB of space...]
# For Apple / ARM cpus and the default model
./configure --datadir=[...some dir with a few GB of space...] --disable-sse2
# For the improved Senzing model:
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
make -j8
sudo make install sudo make install
# On Linux it's probably a good idea to run # On Linux it's probably a good idea to run
@@ -175,6 +189,24 @@ If you require a .lib import library to link this to your application. You can g
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
``` ```
Installation with an alternative data model
-------------------------------------------
An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling.
To enable this, add `MODEL=senzing` to the configure line during installation:
```
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
```
The data for this model comes from [OpenAddresses](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records), for a total of about 1.2 billion records from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddresses is good but not perfect, so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses whenever these conditions were encountered.
Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, at least 50 per country. Hard-to-parse addresses were collected from the Senzing support team and customers and from the libpostal GitHub page and added to this set. Using this test set, the Senzing model got 4.3% better parsing results than the default model.
The size of this model is about 2.2GB compared to 1.8GB for the default model, so keep that in mind if storage space is important.
Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data
Examples of parsing Examples of parsing
------------------- -------------------
@@ -382,23 +414,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal) - LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal) - Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
- Elixir: [Expostal](https://github.com/SweetIQ/expostal) - Elixir: [Expostal](https://github.com/SweetIQ/expostal)
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
- Rust: [rust-postal](https://github.com/pnordahl/rust-postal) - Rust: [rust-postal](https://github.com/pnordahl/rust-postal)
- Rust: [rustpostal](https://crates.io/crates/rustpostal)
**Database extensions** **Unofficial database extensions**
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal) - PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
**Unofficial REST API** **Unofficial servers**
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest) - Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
**Libpostal REST Docker** - Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)
- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
**Libpostal ZeroMQ Docker**
- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
Tests Tests
@@ -473,7 +501,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent.
for a wide variety of countries and languages, not just US/English. for a wide variety of countries and languages, not just US/English.
The model is trained on over 1 billion addresses and address-like strings, using the The model is trained on over 1 billion addresses and address-like strings, using the
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted, templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py) tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
are performed to make the training data resemble real messy geocoder input as closely as possible. are performed to make the training data resemble real messy geocoder input as closely as possible.
- **Language classification**: multinomial logistic regression - **Language classification**: multinomial logistic regression
@@ -495,7 +523,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.
- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec, - **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
languages chracter by character instead of on whitespace. languages character by character instead of on whitespace.
- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form, - **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration. strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
@@ -519,6 +547,7 @@ Non-goals
- Verifying that a location is a valid address - Verifying that a location is a valid address
- Actually geocoding addresses to a lat/lon (that requires a database/search index) - Actually geocoding addresses to a lat/lon (that requires a database/search index)
- Extracting addresses from free text
Raison d'être Raison d'être
------------- -------------
@@ -624,7 +653,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy - Confines almost all mallocs to *name*_new and all frees to *name*_destroy
- Efficient existing implementations for simple things like hashtables - Efficient existing implementations for simple things like hashtables
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible - Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
- Data structrues take advantage of sparsity as much as possible - Data structures take advantage of sparsity as much as possible
- Efficient double-array trie implementation for most string dictionaries - Efficient double-array trie implementation for most string dictionaries
- Cross-platform as much as possible, particularly for *nix - Cross-platform as much as possible, particularly for *nix

View File

@@ -60,6 +60,17 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
# Senzing data
AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])
SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)
AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])
AC_CONFIG_FILES([Makefile AC_CONFIG_FILES([Makefile
libpostal.pc libpostal.pc
src/Makefile src/Makefile
@@ -75,6 +86,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Checks for SSE2 build # Checks for SSE2 build
# ------------------------------------------------------------------ # ------------------------------------------------------------------
AC_ARG_ENABLE([sse2], AC_ARG_ENABLE([sse2],
AS_HELP_STRING( AS_HELP_STRING(
[--disable-sse2], [--disable-sse2],
@@ -82,7 +94,7 @@ AC_ARG_ENABLE([sse2],
) )
) )
AS_IF([test "x$enable_sse2" != "xno"], [ AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
]) ])
@@ -96,6 +108,9 @@ AC_ARG_ENABLE([data-download],
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
esac], [DOWNLOAD_DATA=true]) esac], [DOWNLOAD_DATA=true])
AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
AS_VAR_IF([MODEL], [], [],
[AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])

View File

@@ -3,9 +3,11 @@ libpostal_get_default_options
libpostal_expand_address libpostal_expand_address
libpostal_expansion_array_destroy libpostal_expansion_array_destroy
libpostal_address_parser_response_destroy libpostal_address_parser_response_destroy
libpostal_language_classifier_response_destroy
libpostal_get_address_parser_default_options libpostal_get_address_parser_default_options
libpostal_parse_address libpostal_parse_address
libpostal_parser_print_features libpostal_parser_print_features
libpostal_classify_language
libpostal_setup libpostal_setup
libpostal_setup_datadir libpostal_setup_datadir
libpostal_teardown libpostal_teardown

View File

@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
[], [-lblas])]) [], [-lblas])])
fi fi
# BLAS in OpenBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
fi
# Generic CBLAS library? # Generic CBLAS library?
if test $ax_cblas_ok = no; then if test $ax_cblas_ok = no; then
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"]) AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
fi fi
# Generic BLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
fi
AC_SUBST(CBLAS_LIBS) AC_SUBST(CBLAS_LIBS)
LIBS="$ax_cblas_save_LIBS" LIBS="$ax_cblas_save_LIBS"

View File

@@ -63,10 +63,23 @@ numbers:
house_numbers: house_numbers:
gebaude: &gebaude
canonical: gebäude
abbreviated: geb
sample: true
canonical_probability: 0.5
abbreviated_probability: 0.5
sample_probability: 0.05
numeric:
direction: left
alphanumeric: alphanumeric:
default: *nummer default: *nummer
probability: 0.95
alternatives:
- alternative: *gebaude
probability: 0.05
alphanumeric_phrase_probability: 0.0001 alphanumeric_phrase_probability: 0.05
conscription_numbers: conscription_numbers:
alphanumeric: alphanumeric:

View File

@@ -49,7 +49,7 @@ numbers:
house_numbers: house_numbers:
budnyok: &budnyok budynok: &budynok
canonical: будинок canonical: будинок
abbreviated: буд abbreviated: буд
sample: true sample: true
@@ -58,8 +58,8 @@ house_numbers:
sample_probability: 0.1 sample_probability: 0.1
numeric: numeric:
direction: left direction: left
budnyok_latin: &budnyok_latin budynok_latin: &budynok_latin
canonical: budnyok canonical: budynok
abbreviated: bud abbreviated: bud
sample: true sample: true
canonical_probability: 0.6 canonical_probability: 0.6
@@ -88,10 +88,10 @@ house_numbers:
direction: left direction: left
alphanumeric: alphanumeric:
default: *budnyok default: *budynok
probability: 0.65 probability: 0.65
alternatives: alternatives:
- alternative: *budnyok_latin - alternative: *budynok_latin
probability: 0.05 probability: 0.05
- alternative: *dom - alternative: *dom
probability: 0.25 probability: 0.25

View File

@@ -11,8 +11,9 @@
overrides: overrides:
id: id:
# Buenos Aires (state boundary coterminous with city) relation:
"3082668": null # Buenos Aires (state boundary coterminous with city)
"3082668": null
contained_by: contained_by:
relation: relation:
# Buenos Aires # Buenos Aires

View File

@@ -10,10 +10,10 @@
"8": "city" "8": "city"
"9": "suburb" "9": "suburb"
overrides: overrides:
id: id:
relation: relation:
# Taiwan Province # Taiwan Province
"3777248": "state" "3777248": "state"
# Fujian Province # Fujian Province
"3777250": "state" "3777250": "state"

View File

@@ -132,6 +132,7 @@ falls|fls
fare fare
farm|frm farm|frm
farms|frms farms|frms
farm to market|fm|farm-to-market
fern fern
ferry|fry|fy ferry|fry|fy
field|fld|fd field|fld|fd

View File

@@ -64,7 +64,7 @@ rhode island|ri
saskatchewan|sk saskatchewan|sk
south carolina|sc south carolina|sc
south dakota|sd south dakota|sd
southern australia|sa south australia|sa
tasmania|tas tasmania|tas
tennessee|tn tennessee|tn
texas|tx texas|tx

View File

@@ -1,12 +1,18 @@
aleja|al aleja|al
autostrada autostrada
boczna boczna
bulwar bulwar|bulw
droga droga
obwodnica obwodnica
ogród
osiedle|os
park
plac|pl plac|pl
rondo rondo
rynek rynek
skwer
szosa szosa
ulica|ul ulica|ul
wybrzeże|wyb
wyspa
zaulek zaulek

View File

@@ -0,0 +1,6 @@
anexa
bloc|blc|bl
casa
cladirea|cladire
complex
garaj

View File

@@ -0,0 +1,5 @@
banca
organizatie neguvernamentala|ong
societate comerciala|sc
societate cu raspundere limitata|srl
societate pe actiuni|sa

View File

@@ -1,5 +1,5 @@
& &
colț|colt colț|colt|colț cu|colt cu
între|intre între|intre
la colțul de pe|la coltul de pe la colțul de pe|la coltul de pe
și|si și|si

View File

@@ -1 +1 @@
intrare intrare|intrarea

View File

@@ -4,4 +4,4 @@ din
in apropiere de in apropiere de
în apropiere|in apropiere în apropiere|in apropiere
în jurul aici|in jurul aici în jurul aici|in jurul aici
lângă mine|langa mine lângă mine|langa mine|lângă|langa

View File

@@ -1 +1 @@
număr|numar|nr|nº|n°|#|№|no număr|numar|nr|nº|n°|#|№|no|numarul|numărul

View File

@@ -8,6 +8,7 @@ general|gen
major|maj major|maj
locotenent locotenent
locotenent colonel locotenent colonel
pictor
profesor|prof profesor|prof
sergent sergent
sublocotenent sublocotenent

View File

@@ -0,0 +1,3 @@
cinema
cafenea
fabrica

View File

@@ -1 +1,7 @@
bloc|bl bloc|bl
cartier|cartierul
comuna|comunā
kilometrul|kilometru|km
sat|satul
sector|sectorul|sect
zona

View File

@@ -1,2 +1,3 @@
și|si|& și|si|&
cel cel
intre

View File

@@ -1,12 +1,12 @@
aleea|ale|alea|al aleea|ale|alea|al
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
calea|cal calea|cale|cal
drumul drumul|drum
fundătura|fundatura|fnd fundătura|fundatura|fnd
fundacul|fdc fundacul|fdc
intrarea|int|intr intrarea|int|intr
piaţa|piata|piață|pta|pţa|p-ta|p-ţa piaţa|piata|piață|pta|pţa|p-ta|p-ţa
strada|str strada|str|st
stradela|str-la|sdla stradela|str-la|sdla
șoseaua|soseaua|sos|șos șoseaua|soseaua|sos|șos
splaiul|sp|spl splaiul|sp|spl

View File

@@ -0,0 +1 @@
decembrie|dec

View File

@@ -1,4 +1,8 @@
apartament|ap|apt|apart apartamentul|apartament|ap|apt|apart
birou birou
cladire|cladirea|clădire|clădirea
corp|corpul
complex
interior|int
lotul lotul
sală|sala sală|sala

View File

@@ -1,4 +1,4 @@
requests==2.9.1 requests==2.20.0
six==1.10.0 six==1.10.0
PyYAML==3.11 PyYAML==5.4
ujson==1.33 ujson==1.33

View File

@@ -1,7 +1,7 @@
boto3==1.4.0 boto3==1.4.0
botocore==1.4.53 botocore==1.4.53
Fiona==1.6.3.post1 Fiona==1.6.3.post1
PyYAML==3.11 PyYAML==5.4
Rtree==0.8.2 Rtree==0.8.2
Shapely==1.5.14 Shapely==1.5.14
Unidecode==0.4.19 Unidecode==0.4.19
@@ -17,7 +17,7 @@ gevent==1.1.2
greenlet==0.4.10 greenlet==0.4.10
jmespath==0.9.0 jmespath==0.9.0
leveldb==0.193 leveldb==0.193
lxml==3.6.0 lxml==4.6.3
lru-dict==1.1.3 lru-dict==1.1.3
marisa-trie==0.7.2 marisa-trie==0.7.2
numpy==1.10.4 numpy==1.10.4
@@ -29,7 +29,7 @@ pyproj==1.9.5.1
pystache==0.5.4 pystache==0.5.4
python-Levenshtein==0.12.0 python-Levenshtein==0.12.0
python-geohash==0.8.5 python-geohash==0.8.5
requests==2.9.1 requests==2.20.0
s3transfer==0.1.3 s3transfer==0.1.3
six==1.10.0 six==1.10.0
ujson==1.35 ujson==1.35

View File

@@ -42,7 +42,8 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
address_expansion_t expansion = expansions[i]; address_expansion_t expansion = expansions[i];
if (expansion.canonical_index != NULL_CANONICAL_INDEX) { if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
char *canonical = address_dictionary_get_canonical(expansion.canonical_index); char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
if (string_contains(canonical, " ")) { bool is_possible_acronym = string_contains(canonical, " ") || (phrase.len == 1 && address_expansion_in_dictionary(expansion, DICTIONARY_DIRECTIONAL));
if (is_possible_acronym) {
for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
existing_acronyms[j] = 1; existing_acronyms[j] = 1;
} }

View File

@@ -1,4 +1,5 @@
#include "crf_context.h" #include "crf_context.h"
#include "float_utils.h"
crf_context_t *crf_context_new(int flag, size_t L, size_t T) { crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
crf_context_t *context = malloc(sizeof(crf_context_t)); crf_context_t *context = malloc(sizeof(crf_context_t));
@@ -39,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
} }
if (context->flag & CRF_CONTEXT_MARGINALS) { if (context->flag & CRF_CONTEXT_MARGINALS) {
#ifdef USE_SSE #if defined(USE_SSE)
context->exp_state = double_matrix_new_aligned(T, L, 16); context->exp_state = double_matrix_new_aligned(T, L, 32);
if (context->exp_state == NULL) goto exit_context_created; if (context->exp_state == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state); double_matrix_zero(context->exp_state);
#else #else
@@ -51,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state = double_matrix_new_zeros(T, L); context->mexp_state = double_matrix_new_zeros(T, L);
if (context->mexp_state == NULL) goto exit_context_created; if (context->mexp_state == NULL) goto exit_context_created;
#ifdef USE_SSE #if defined(USE_SSE)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16); context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
if (context->exp_state_trans == NULL) goto exit_context_created; if (context->exp_state_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state_trans); double_matrix_zero(context->exp_state_trans);
#else #else
@@ -63,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state_trans = double_matrix_new_zeros(T, L * L); context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
if (context->mexp_state_trans == NULL) goto exit_context_created; if (context->mexp_state_trans == NULL) goto exit_context_created;
#ifdef USE_SSE #if defined(USE_SSE)
context->exp_trans = double_matrix_new_aligned(L, L, 16); context->exp_trans = double_matrix_new_aligned(L, L, 32);
if (context->exp_trans == NULL) goto exit_context_created; if (context->exp_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_trans); double_matrix_zero(context->exp_trans);
#else #else
@@ -129,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {
if (self->flag & CRF_CONTEXT_MARGINALS && if (self->flag & CRF_CONTEXT_MARGINALS &&
( (
#ifdef USE_SSE #if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state, T, L, 16) || !double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
#else #else
!double_matrix_resize(self->exp_state, T, L) || !double_matrix_resize(self->exp_state, T, L) ||
#endif #endif
!double_matrix_resize(self->mexp_state, T, L) || !double_matrix_resize(self->mexp_state, T, L) ||
#ifdef USE_SSE #if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) || !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
#else #else
!double_matrix_resize(self->exp_state_trans, T, L * L) || !double_matrix_resize(self->exp_state_trans, T, L * L) ||
#endif #endif
@@ -183,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
} }
if (self->exp_state != NULL) { if (self->exp_state != NULL) {
#ifdef USE_SSE #if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state); double_matrix_destroy_aligned(self->exp_state);
#else #else
double_matrix_destroy(self->exp_state); double_matrix_destroy(self->exp_state);
@@ -199,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
} }
if (self->exp_state_trans != NULL) { if (self->exp_state_trans != NULL) {
#ifdef USE_SSE #if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state_trans); double_matrix_destroy_aligned(self->exp_state_trans);
#else #else
double_matrix_destroy(self->exp_state_trans); double_matrix_destroy(self->exp_state_trans);
@@ -215,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
} }
if (self->exp_trans != NULL) { if (self->exp_trans != NULL) {
#ifdef USE_SSE #if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_trans); double_matrix_destroy_aligned(self->exp_trans);
#else #else
double_matrix_destroy(self->exp_trans); double_matrix_destroy(self->exp_trans);

View File

@@ -15,6 +15,14 @@
#include "token_types.h" #include "token_types.h"
#include "transliterate.h" #include "transliterate.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define DEFAULT_KEY_LEN 32 #define DEFAULT_KEY_LEN 32
@@ -1567,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
size_t len = strlen(input); size_t len = strlen(input);
language_classifier_response_t *lang_response = NULL; libpostal_language_classifier_response_t *lang_response = NULL;
if (options.num_languages == 0) { if (options.num_languages == 0) {
lang_response = classify_languages(input); lang_response = classify_languages(input);
@@ -1627,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
kh_destroy(str_set, unique_strings); kh_destroy(str_set, unique_strings);
if (lang_response != NULL) { if (lang_response != NULL) {
language_classifier_response_destroy(lang_response); libpostal_language_classifier_response_destroy(lang_response);
} }
char_array_destroy(temp_string); char_array_destroy(temp_string);

View File

@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
} }
inline uint32_t file_deserialize_uint32(unsigned char *buf) { inline uint32_t file_deserialize_uint32(unsigned char *buf) {
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
} }
bool file_read_uint32(FILE *file, uint32_t *value) { bool file_read_uint32(FILE *file, uint32_t *value) {
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {
inline uint16_t file_deserialize_uint16(unsigned char *buf) { inline uint16_t file_deserialize_uint16(unsigned char *buf) {
return (buf[0] << 8) | buf[1]; return ((uint16_t)buf[0] << 8) | buf[1];
} }

View File

@@ -292,16 +292,28 @@ static int geohashstr_to_interleaved(char *r, size_t length, uint16_t *interleav
if(j== 0) i[0] = map[c[ 0]]<<11; if(j== 0) i[0] = map[c[ 0]]<<11;
if(j== 1) i[0] += map[c[ 1]]<< 6; if(j== 1) i[0] += map[c[ 1]]<< 6;
if(j== 2) i[0] += map[c[ 2]]<< 1; if(j== 2) i[0] += map[c[ 2]]<< 1;
if(j== 3) i[0] += map[c[ 3]]>> 4; if(j== 3) {
i[0] += map[c[ 3]]>> 4;
i[1] = map[c[ 3]]<<12;
}
if(j== 4) i[1] += map[c[ 4]]<< 7; if(j== 4) i[1] += map[c[ 4]]<< 7;
if(j== 5) i[1] += map[c[ 5]]<< 2; if(j== 5) i[1] += map[c[ 5]]<< 2;
if(j== 6) i[1] += map[c[ 6]]>> 3; if(j== 6) {
i[1] += map[c[ 6]]>> 3;
i[2] = map[c[ 6]]<<13;
}
if(j== 7) i[2] += map[c[ 7]]<< 8; if(j== 7) i[2] += map[c[ 7]]<< 8;
if(j== 8) i[2] += map[c[ 8]]<< 3; if(j== 8) i[2] += map[c[ 8]]<< 3;
if(j== 9) i[2] += map[c[ 9]]>> 2; if(j== 9) {
i[2] += map[c[ 9]]>> 2;
i[3] = map[c[ 9]]<<14;
}
if(j==10) i[3] += map[c[10]]<< 9; if(j==10) i[3] += map[c[10]]<< 9;
if(j==11) i[3] += map[c[11]]<< 4; if(j==11) i[3] += map[c[11]]<< 4;
if(j==12) i[3] += map[c[12]]>> 1; if(j==12) {
i[3] += map[c[12]]>> 1;
i[4] = map[c[12]]<<15;
}
if(j==13) i[4] += map[c[13]]<<10; if(j==13) i[4] += map[c[13]]<<10;
if(j==14) i[4] += map[c[14]]<< 5; if(j==14) i[4] += map[c[14]]<< 5;
if(j==15) i[4] += map[c[15]]>> 0; if(j==15) i[4] += map[c[15]]>> 0;

View File

@@ -29,12 +29,6 @@
#define RAND48_MULT_2 (0x0005) #define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b) #define RAND48_ADD (0x000b)
unsigned short _rand48_seed[3];
unsigned short _rand48_mult[3];
unsigned short _rand48_add;
void _dorand48(unsigned short xseed[3]); void _dorand48(unsigned short xseed[3]);
double erand48(unsigned short xseed[3]); double erand48(unsigned short xseed[3]);

View File

@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
return language_classifier; return language_classifier;
} }
void language_classifier_response_destroy(language_classifier_response_t *self) { void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
if (self == NULL) return; if (self == NULL) return;
if (self->languages != NULL) { if (self->languages != NULL) {
free(self->languages); free(self->languages);
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
free(self); free(self);
} }
language_classifier_response_t *classify_languages(char *address) { libpostal_language_classifier_response_t *classify_languages(char *address) {
language_classifier_t *classifier = get_language_classifier(); language_classifier_t *classifier = get_language_classifier();
if (classifier == NULL) { if (classifier == NULL) {
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
size_t n = classifier->num_labels; size_t n = classifier->num_labels;
double_matrix_t *p_y = double_matrix_new_zeros(1, n); double_matrix_t *p_y = double_matrix_new_zeros(1, n);
language_classifier_response_t *response = NULL; libpostal_language_classifier_response_t *response = NULL;
bool model_exp = false; bool model_exp = false;
if (classifier->weights_type == MATRIX_DENSE) { if (classifier->weights_type == MATRIX_DENSE) {
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y); model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {
free(indices); free(indices);
response = malloc(sizeof(language_classifier_response_t)); response = malloc(sizeof(libpostal_language_classifier_response_t));
response->num_languages = num_languages; response->num_languages = num_languages;
response->languages = languages; response->languages = languages;
response->probs = probs; response->probs = probs;

View File

@@ -6,6 +6,8 @@
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include "libpostal.h"
#include "collections.h" #include "collections.h"
#include "language_features.h" #include "language_features.h"
#include "logistic_regression.h" #include "logistic_regression.h"
@@ -29,21 +31,14 @@ typedef struct language_classifier {
} weights; } weights;
} language_classifier_t; } language_classifier_t;
typedef struct language_classifier_response {
size_t num_languages;
char **languages;
double *probs;
} language_classifier_response_t;
// General usage // General usage
language_classifier_t *language_classifier_new(void); language_classifier_t *language_classifier_new(void);
language_classifier_t *get_language_classifier(void); language_classifier_t *get_language_classifier(void);
language_classifier_t *get_language_classifier_country(void); language_classifier_t *get_language_classifier_country(void);
language_classifier_response_t *classify_languages(char *address); libpostal_language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(language_classifier_response_t *self); void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
void language_classifier_destroy(language_classifier_t *self); void language_classifier_destroy(language_classifier_t *self);

View File

@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
} }
language_classifier_response_t *response = classify_languages(address); libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL) { if (response == NULL) {
printf("Could not classify language\n"); printf("Could not classify language\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);

View File

@@ -7,6 +7,7 @@
#include "language_classifier_io.h" #include "language_classifier_io.h"
#include "string_utils.h" #include "string_utils.h"
#include "trie_utils.h" #include "trie_utils.h"
#include "transliterate.h"
double test_accuracy(char *filename) { double test_accuracy(char *filename) {
@@ -33,7 +34,7 @@ double test_accuracy(char *filename) {
continue; continue;
} }
language_classifier_response_t *response = classify_languages(address); libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL || response->num_languages == 0) { if (response == NULL || response->num_languages == 0) {
printf("%s\tNULL\t%s\n", language, address); printf("%s\tNULL\t%s\n", language, address);
continue; continue;

View File

@@ -85,6 +85,17 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(
return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
} }
/*
Public wrapper around name_word_hashes(): computes the near-dupe hash
strings for a single name using the given normalization options.

On success, stores the number of hashes in *num_hashes and returns the
result of cstring_array_to_strings() (presumably a caller-owned array of
C strings -- confirm against cstring_array_to_strings).

If name_word_hashes() returns NULL, *num_hashes is set to 0 and NULL is
returned. num_hashes itself must not be NULL; it is written unconditionally.
*/
char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes) {
    cstring_array *hashes = name_word_hashes(name, normalize_options);

    if (hashes != NULL) {
        *num_hashes = cstring_array_num_strings(hashes);
        return cstring_array_to_strings(hashes);
    }

    *num_hashes = 0;
    return NULL;
}
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
cstring_array *strings = near_dupe_hashes(num_components, labels, values, options); cstring_array *strings = near_dupe_hashes(num_components, labels, values, options);
if (strings == NULL) { if (strings == NULL) {
@@ -108,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels
char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
language_classifier_response_t *lang_response = place_languages(num_components, labels, values); libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
if (lang_response == NULL) { if (lang_response == NULL) {
*num_languages = 0; *num_languages = 0;
return NULL; return NULL;
@@ -202,6 +213,30 @@ libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t nu
return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
} }
/*
Public entry point for language classification: forwards the address to
classify_languages() and hands its response straight back to the caller.

Returns NULL (after logging an error) when classify_languages() produced
no response. A non-NULL result should be released with
libpostal_language_classifier_response_destroy().
*/
libpostal_language_classifier_response_t *libpostal_classify_language(char *address) {
    libpostal_language_classifier_response_t *result = classify_languages(address);

    if (result != NULL) {
        return result;
    }

    log_error("Language classification returned NULL\n");
    return NULL;
}
/*
Releases a language classifier response.

Frees the languages array, the probs array and the response struct itself.
The individual strings referenced by languages[] are NOT freed here.
Passing NULL is a no-op.
*/
void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
    if (self != NULL) {
        /* free(NULL) is defined to do nothing, so the member pointers
           need no separate NULL guards. */
        free(self->languages);
        free(self->probs);
        free(self);
    }
}
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
if (self == NULL) return; if (self == NULL) return;
@@ -262,19 +297,21 @@ bool libpostal_setup_datadir(char *datadir) {
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE); address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
} }
bool setup_succeed = true;
if (!transliteration_module_setup(transliteration_path)) { if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path); log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false; setup_succeed = false;
} }
if (!numex_module_setup(numex_path)) { if (setup_succeed && !numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path); log_error("Error loading numex module, dir=%s\n", numex_path);
return false; setup_succeed = false;
} }
if (!address_dictionary_module_setup(address_dictionary_path)) { if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path); log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false; setup_succeed = false;
} }
if (transliteration_path != NULL) { if (transliteration_path != NULL) {
@@ -289,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
free(address_dictionary_path); free(address_dictionary_path);
} }
return true; return setup_succeed;
} }
bool libpostal_setup(void) { bool libpostal_setup(void) {

View File

@@ -167,6 +167,19 @@ LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(ch
LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features); LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);
/*
Language classification
*/
/*
Response returned by libpostal_classify_language(); release it with
libpostal_language_classifier_response_destroy().
*/
typedef struct libpostal_language_classifier_response {
/* Number of predicted languages; presumably the length of both arrays below -- TODO confirm against classify_languages(). */
size_t num_languages;
/* Array of language strings. The destroy function frees the array only, not the strings it points to. */
char **languages;
/* Array of doubles; presumably probs[i] is the probability of languages[i] -- TODO confirm. */
double *probs;
} libpostal_language_classifier_response_t;
LIBPOSTAL_EXPORT libpostal_language_classifier_response_t *libpostal_classify_language(char *address);
LIBPOSTAL_EXPORT void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
/* /*
Deduping Deduping
@@ -191,8 +204,8 @@ typedef struct libpostal_near_dupe_hash_options {
bool address_only_keys; bool address_only_keys;
} libpostal_near_dupe_hash_options_t; } libpostal_near_dupe_hash_options_t;
LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void); LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);

View File

@@ -14,6 +14,8 @@ LIBPOSTAL_DATA_DIR=$3
MB=$((1024*1024)) MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB)) CHUNK_SIZE=$((64*$MB))
DATAMODEL="@MODEL@"
# Not loving this approach but there appears to be no way to query the size # Not loving this approach but there appears to be no way to query the size
# of a release asset without using the Github API # of a release asset without using the Github API
LIBPOSTAL_DATA_FILE_CHUNKS=1 LIBPOSTAL_DATA_FILE_CHUNKS=1
@@ -34,6 +36,20 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download" LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
# When the data model is "senzing", override the defaults set above:
# every archive is treated as a single chunk, the version strings come from
# the senzing-specific @...@ placeholders (presumably substituted when this
# template is generated -- confirm in the build files), and downloads are
# fetched from the S3 bucket below instead of the GitHub releases URL.
if [ "$DATAMODEL" = "senzing" ]; then
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"
LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
fi
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION= LIBPOSTAL_DATA_DIR_VERSION=

View File

@@ -6,7 +6,9 @@
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#ifdef HAVE_CONFIG_H
#include <config.h> #include <config.h>
#endif
#include "collections.h" #include "collections.h"
#include "file_utils.h" #include "file_utils.h"
@@ -31,7 +33,7 @@ typedef enum {
} name##_t; \ } name##_t; \
\ \
static name##_t *name##_new(size_t m, size_t n) { \ static name##_t *name##_new(size_t m, size_t n) { \
name##_t *matrix = malloc(sizeof(name##_t)); \ name##_t *matrix = malloc(sizeof(name##_t)); \
\ \
if (matrix == NULL) { \ if (matrix == NULL) { \
return NULL; \ return NULL; \
@@ -60,7 +62,7 @@ typedef enum {
matrix->m = m; \ matrix->m = m; \
matrix->n = n; \ matrix->n = n; \
\ \
matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \ matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
if (matrix->values == NULL) { \ if (matrix->values == NULL) { \
free(matrix); \ free(matrix); \
return NULL; \ return NULL; \
@@ -84,7 +86,7 @@ typedef enum {
if (self == NULL) return; \ if (self == NULL) return; \
\ \
if (self->values != NULL) { \ if (self->values != NULL) { \
_aligned_free(self->values); \ aligned_free(self->values); \
} \ } \
\ \
free(self); \ free(self); \
@@ -116,7 +118,7 @@ typedef enum {
if (self == NULL) return false; \ if (self == NULL) return false; \
\ \
if (m * n > (self->m * self->n)) { \ if (m * n > (self->m * self->n)) { \
type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \ type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
if (ptr == NULL) { \ if (ptr == NULL) { \
return false; \ return false; \
} \ } \

View File

@@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
log_debug("token_str = %s\n", token_str); log_debug("token_str = %s\n", token_str);
add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams); add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams);
add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
// For non-Latin words (Arabic, Cyrillic, etc.) just add the word // For non-Latin words (Arabic, Cyrillic, etc.) just add the word
// For ideograms, we do two-character shingles, so only add the first character if the string has one token // For ideograms, we do two-character shingles, so only add the first character if the string has one token
} else if (!ideogram || j > 0 || num_tokens == 1) { } else if (!ideogram || j > 0 || num_tokens == 1) {
@@ -669,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
language_classifier_response_t *lang_response = NULL; libpostal_language_classifier_response_t *lang_response = NULL;
if (num_languages == 0) { if (num_languages == 0) {
lang_response = place_languages(num_components, labels, values); lang_response = place_languages(num_components, labels, values);

View File

@@ -8,6 +8,7 @@
#include "libpostal.h" #include "libpostal.h"
#include "string_utils.h" #include "string_utils.h"
cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options);
cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options); cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options);
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages); cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages);

View File

@@ -434,7 +434,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
char_array_append(array, " "); char_array_append(array, " ");
append_char = false; append_char = false;
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) { } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
append_char = !is_hyphen_between_letter_and_number; append_char = is_hyphen_between_letter_and_number;
} }
if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) { if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {

View File

@@ -5,6 +5,15 @@
#include "log/log.h" #include "log/log.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB #define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n" #define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"

View File

@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
); );
} }
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
if (num_components == 0 || values == NULL || labels == NULL) return NULL; if (num_components == 0 || values == NULL || labels == NULL) return NULL;
language_classifier_response_t *lang_response = NULL; libpostal_language_classifier_response_t *lang_response = NULL;
char *label; char *label;
char *value; char *value;

View File

@@ -32,7 +32,7 @@ typedef struct place {
char *website; char *website;
} place_t; } place_t;
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
place_t *place_new(void); place_t *place_new(void);

View File

@@ -4,6 +4,8 @@
#include "string_utils.h" #include "string_utils.h"
#include "file_utils.h"
// Run shuf/gshuf on a file in-place if the shuf command is available. // Run shuf/gshuf on a file in-place if the shuf command is available.
bool shuffle_file(char *filename) { bool shuffle_file(char *filename) {
char *shuffle_command = NULL; char *shuffle_command = NULL;

View File

@@ -157,8 +157,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
uint32_array **t1_tokens_unicode = NULL; uint32_array **t1_tokens_unicode = NULL;
uint32_array **t2_tokens_unicode = NULL; uint32_array **t2_tokens_unicode = NULL;
uint32_array *t1_unicode; uint32_array *t1_unicode = NULL;
uint32_array *t2_unicode; uint32_array *t2_unicode = NULL;
int64_array *phrase_memberships_array1 = NULL; int64_array *phrase_memberships_array1 = NULL;
int64_array *phrase_memberships_array2 = NULL; int64_array *phrase_memberships_array2 = NULL;
@@ -232,8 +232,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
} }
} }
uint32_t *suffixes1; uint32_t *suffixes1 = NULL;
uint32_t *suffixes2; uint32_t *suffixes2 = NULL;
if (ordinal_suffixes1 != NULL && ordinal_suffixes2 != NULL) { if (ordinal_suffixes1 != NULL && ordinal_suffixes2 != NULL) {
suffixes1 = ordinal_suffixes1->a; suffixes1 = ordinal_suffixes1->a;

View File

@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
} }
uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) { uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
khash_t(int_set) *unique_columns = kh_init(int_set); khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
uint32_array *ret = uint32_array_new(); uint32_array *ret = uint32_array_new();
if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) { if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
kh_destroy(int_set, unique_columns); kh_destroy(int_uint32, unique_columns);
return ret; return ret;
} }
kh_destroy(int_set, unique_columns); kh_destroy(int_uint32, unique_columns);
uint32_array_destroy(ret); uint32_array_destroy(ret);
return NULL; return NULL;
} }

8853
src/sse2neon.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -7,43 +7,44 @@
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#include <malloc.h> #include <malloc.h>
/* Windows/MinGW build: allocate `size` bytes aligned to `alignment` by
   delegating to the CRT's _aligned_malloc. Returns whatever that call returns. */
static inline void *aligned_malloc(size_t size, size_t alignment) {
return _aligned_malloc(size, alignment);
}
/* Windows/MinGW build: resize an aligned block to `new_size` bytes by
   delegating to the CRT's _aligned_realloc. `old_size` is accepted only so
   the signature matches the non-Windows implementation; it is not used here. */
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
return _aligned_realloc(p, new_size, alignment);
}
/* Windows/MinGW build: release a block obtained from aligned_malloc or
   aligned_resize by delegating to the CRT's _aligned_free. */
static inline void aligned_free(void *p) {
_aligned_free(p);
}
#else #else
#include <stdlib.h> #include <stdlib.h>
/* Allocate `size` bytes whose address is a multiple of `alignment`, using
   posix_memalign. Returns NULL if posix_memalign reports an error. */
static inline void *aligned_malloc(size_t size, size_t alignment)
{
    void *p;
    int ret = posix_memalign(&p, alignment, size);
    return (ret == 0) ? p : NULL;
}

/* Move an aligned block into a new aligned block of `new_size` bytes.

   p         - block previously returned by aligned_malloc/aligned_resize
   old_size  - number of valid bytes currently stored in p
   new_size  - requested size of the new block
   alignment - must be a non-zero power of two and >= sizeof(void *)

   Returns the new block, or NULL if the alignment is invalid, p is NULL,
   or the allocation fails. On every NULL return the original block is left
   untouched and still owned by the caller (the same contract as realloc),
   so callers that keep their old pointer on failure never end up holding
   freed memory. On success the old block is freed. */
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
{
    if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
        return NULL;
    }

    if (p == NULL) {
        return NULL;
    }

    void *p1 = aligned_malloc(new_size, alignment);
    if (p1 == NULL) {
        /* Do not free p here: callers treat NULL as "resize failed" and
           continue to use (and eventually free) their existing pointer. */
        return NULL;
    }

    /* Never copy more than the destination can hold, in case of a shrink. */
    memcpy(p1, p, (old_size < new_size) ? old_size : new_size);
    free(p);
    return p1;
}

/* Release a block obtained from aligned_malloc or aligned_resize. */
static inline void aligned_free(void *p)
{
    free(p);
}
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
name *array = malloc(sizeof(name)); \ name *array = malloc(sizeof(name)); \
if (array == NULL) return NULL; \ if (array == NULL) return NULL; \
array->n = array->m = 0; \ array->n = array->m = 0; \
array->a = _aligned_malloc(size * sizeof(type), alignment); \ array->a = aligned_malloc(size * sizeof(type), alignment); \
if (array->a == NULL) return NULL; \ if (array->a == NULL) return NULL; \
array->m = size; \ array->m = size; \
return array; \ return array; \
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
} \ } \
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \ static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
if (size <= array->m) return true; \ if (size <= array->m) return true; \
type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \ type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
if (ptr == NULL) return false; \ if (ptr == NULL) return false; \
array->a = ptr; \ array->a = ptr; \
array->m = size; \ array->m = size; \
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
} \ } \
static inline void name##_destroy_aligned(name *array) { \ static inline void name##_destroy_aligned(name *array) { \
if (array == NULL) return; \ if (array == NULL) return; \
if (array->a != NULL) _aligned_free(array->a); \ if (array->a != NULL) aligned_free(array->a); \
free(array); \ free(array); \
} }
@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
free_func(array->a[i]); \ free_func(array->a[i]); \
} \ } \
} \ } \
_aligned_free(array->a); \ aligned_free(array->a); \
free(array); \ free(array); \
} }

View File

@@ -8,7 +8,7 @@
#define ks_lt_index(a, b) ((a).value < (b).value) #define ks_lt_index(a, b) ((a).value < (b).value)
#ifdef USE_SSE #if defined(USE_SSE)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
@@ -338,7 +338,7 @@
#ifdef USE_SSE #if defined(USE_SSE)
/* /*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp From https://github.com/herumi/fmath/blob/master/fastexp.cpp

View File

@@ -132,6 +132,9 @@ TEST test_expansions(void) {
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs-elysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs elysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champselysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl")); CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));
CHECK_CALL(test_expansion_contains_with_languages("มงแตร", "มงแตร", options, 1, "th")); CHECK_CALL(test_expansion_contains_with_languages("มงแตร", "มงแตร", options, 1, "th"));

View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -0,0 +1 @@
v1.0.0

1
versions/senzing/parser Normal file
View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -75,6 +75,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Checks for SSE2 build # Checks for SSE2 build
# ------------------------------------------------------------------ # ------------------------------------------------------------------
AC_ARG_ENABLE([sse2], AC_ARG_ENABLE([sse2],
AS_HELP_STRING( AS_HELP_STRING(
[--disable-sse2], [--disable-sse2],