Merge branch 'master' into patch-1
.github/workflows/test.yml  (new file, vendored, 36 lines)
@@ -0,0 +1,36 @@
name: Test

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
  workflow_dispatch:

jobs:
  build_and_test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - name: Install Dependencies Linux
        if: matrix.os == 'ubuntu-latest'
        run: |
          sudo apt-get update -y
          sudo apt-get install curl autoconf automake libtool pkg-config
      - name: Install Dependencies MacOS
        if: runner.os == 'macOS'
        run: |
          brew update
          brew install curl autoconf automake libtool pkg-config
      - name: Build
        env:
          LIBPOSTAL_DATA_DIR: ${GITHUB_WORKSPACE}/data
        run: |
          ./bootstrap.sh
          ./configure --datadir=$LIBPOSTAL_DATA_DIR
          make
      - name: Test
        run: make check
.travis.yml  (deleted, 53 lines)
@@ -1,53 +0,0 @@
language: c
branches:
  only:
    - master
env:
  global:
    - secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
    - secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
    - secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
    - secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
    - secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
    - GH_REF=github.com/openvenues/libpostal
    - DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
    - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
    - TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
compiler:
  - clang
  - gcc
addons:
  apt:
    sources:
      - ubuntu-toolchain-r-test
    packages:
      - gcc-4.8
      - pkg-config
before_script:
  - ./bootstrap.sh
  - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
  - if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
  - if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
install:
  - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
script:
  - ./configure --datadir=$(pwd)/data
  - make -j4
  - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
  - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
  - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
  - make check
after_success:
  - |
    if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
      env/bin/pip install awscli;
      export PATH=$PATH:env/bin/;
      ./src/libpostal_data upload base $(pwd)/data/libpostal;
      git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
      cp src/*_data.c _travis/src
      cd _travis
      git config user.name "$GIT_COMMITTER_NAME";
      git config user.email "$GIT_COMMITTER_EMAIL";
      git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
      git push --quiet origin master;
    fi;
@@ -1,6 +1,6 @@
## Submitting Issues

When submitting issues to libpostal, please repeect these guildelines:
When submitting issues to libpostal, please respect these guidelines:

- Be constructive. Try to help solve the problem.
- Always search for existing issues before submitting one.

@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.

---
#### Here's how I'm using libpostal
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? -->
<!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->

---
#### Here's what I did

README.md  (57 changed lines)
@@ -1,6 +1,6 @@
# libpostal: international street address NLP

[](https://travis-ci.org/openvenues/libpostal)
[](https://github.com/openvenues/libpostal/actions)
[](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[](#sponsors)
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:

**On Ubuntu/Debian**
```
sudo apt-get install curl autoconf automake libtool pkg-config
sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
```

**On CentOS/RHEL**
@@ -113,12 +113,26 @@ brew install curl autoconf automake libtool pkg-config

Then to install the C library:

If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.

```
git clone https://github.com/openvenues/libpostal
cd libpostal

./bootstrap.sh
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
make -j4

# For Intel/AMD processors and the default model
./configure --datadir=[...some dir with a few GB of space...]

# For Apple / ARM cpus and the default model
./configure --datadir=[...some dir with a few GB of space...] --disable-sse2

# For the improved Senzing model:
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing

make -j8
sudo make install

# On Linux it's probably a good idea to run
@@ -175,6 +189,24 @@ If you require a .lib import library to link this to your application. You can g
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
```

Installation with an alternative data model
-------------------------------------------

An alternative data model is available for libpostal. It was created by Senzing Inc. for improved parsing of US, UK and Singapore addresses and improved handling of US rural route addresses.
To enable it, add `MODEL=senzing` to the configure line during installation:
```
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
```

The data for this model comes from [OpenAddress](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records), a total of about 1.2 billion records from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddress is good but not perfect, so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses, wherever these conditions were encountered.

Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, at least 50 per country. Hard-to-parse addresses were collected from the Senzing support team, customers and the libpostal GitHub page and added to this set. On this test set the Senzing model produced 4.3% better parsing results than the default model.

The size of this model is about 2.2GB, compared to 1.8GB for the default model, so keep that in mind if storage space is important.

Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data

Examples of parsing
-------------------

@@ -382,22 +414,18 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
- Rust: [rustpostal](https://crates.io/crates/rustpostal)

**Database extensions**
**Unofficial database extensions**

- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)

**Unofficial REST API**
**Unofficial servers**

- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)

**Libpostal REST Docker**

- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)

**Libpostal ZeroMQ Docker**

- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
- Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)


Tests
@@ -494,7 +522,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.

- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
languages chracter by character instead of on whitespace.
languages character by character instead of on whitespace.

- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
@@ -518,6 +546,7 @@ Non-goals

- Verifying that a location is a valid address
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
- Extracting addresses from free text

Raison d'être
-------------

configure.ac  (33 changed lines)
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.

m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
m4_define(LIBPOSTAL_PATCH_VERSION, [0])

AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
@@ -50,10 +50,32 @@ AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])

AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])

DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)

AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])

# Senzing data
AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])

SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)

AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])

AC_CONFIG_FILES([Makefile
                 libpostal.pc
                 src/Makefile
                 test/Makefile])
                 src/libpostal_data
                 test/Makefile], [chmod +x src/libpostal_data])

AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
@@ -64,6 +86,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------

AC_ARG_ENABLE([sse2],
    AS_HELP_STRING(
        [--disable-sse2],
@@ -71,7 +94,7 @@ AC_ARG_ENABLE([sse2],
    )
)

AS_IF([test "x$enable_sse2" != "xno"], [
AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
    CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])

@@ -85,6 +108,10 @@ AC_ARG_ENABLE([data-download],
    *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
    esac], [DOWNLOAD_DATA=true])

AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
AS_VAR_IF([MODEL], [], [],
    [AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])

AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])

AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],

@@ -3,9 +3,11 @@ libpostal_get_default_options
libpostal_expand_address
libpostal_expansion_array_destroy
libpostal_address_parser_response_destroy
libpostal_language_classifier_response_destroy
libpostal_get_address_parser_default_options
libpostal_parse_address
libpostal_parser_print_features
libpostal_classify_language
libpostal_setup
libpostal_setup_datadir
libpostal_teardown

@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
                 [], [-lblas])])
fi

# BLAS in OpenBLAS library?
if test $ax_cblas_ok = no; then
    AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
fi

# Generic CBLAS library?
if test $ax_cblas_ok = no; then
    AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
fi

# Generic BLAS library?
if test $ax_cblas_ok = no; then
    AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
fi

AC_SUBST(CBLAS_LIBS)

LIBS="$ax_cblas_save_LIBS"

@@ -63,10 +63,23 @@ numbers:


house_numbers:
    gebaude: &gebaude
        canonical: gebäude
        abbreviated: geb
        sample: true
        canonical_probability: 0.5
        abbreviated_probability: 0.5
        sample_probability: 0.05
        numeric:
            direction: left
    alphanumeric:
        default: *nummer
        probability: 0.95
        alternatives:
            - alternative: *gebaude
              probability: 0.05

    alphanumeric_phrase_probability: 0.0001
    alphanumeric_phrase_probability: 0.05

conscription_numbers:
    alphanumeric:

@@ -49,7 +49,7 @@ numbers:


house_numbers:
    budnyok: &budnyok
    budynok: &budynok
        canonical: будинок
        abbreviated: буд
        sample: true
@@ -58,8 +58,8 @@ house_numbers:
        sample_probability: 0.1
        numeric:
            direction: left
    budnyok_latin: &budnyok_latin
        canonical: budnyok
    budynok_latin: &budynok_latin
        canonical: budynok
        abbreviated: bud
        sample: true
        canonical_probability: 0.6
@@ -88,10 +88,10 @@ house_numbers:
            direction: left

    alphanumeric:
        default: *budnyok
        default: *budynok
        probability: 0.65
        alternatives:
            - alternative: *budnyok_latin
            - alternative: *budynok_latin
              probability: 0.05
            - alternative: *dom
              probability: 0.25

@@ -11,6 +11,7 @@

overrides:
    id:
        relation:
            # Buenos Aires (state boundary coterminous with city)
            "3082668": null
    contained_by:

@@ -64,7 +64,7 @@ rhode island|ri
saskatchewan|sk
south carolina|sc
south dakota|sd
southern australia|sa
south australia|sa
tasmania|tas
tennessee|tn
texas|tx

@@ -1,12 +1,18 @@
aleja|al
autostrada
boczna
bulwar
bulwar|bulw
droga
obwodnica
ogród
osiedle|os
park
plac|pl
rondo
rynek
skwer
szosa
ulica|ul
wybrzeże|wyb
wyspa
zaulek
resources/dictionaries/ro/building_types.txt  (new file, 6 lines)
@@ -0,0 +1,6 @@
anexa
bloc|blc|bl
casa
cladirea|cladire
complex
garaj
resources/dictionaries/ro/company_types.txt  (new file, 5 lines)
@@ -0,0 +1,5 @@
banca
organizatie neguvernamentala|ong
societate comerciala|sc
societate cu raspundere limitata|srl
societate pe actiuni|sa
@@ -1,5 +1,5 @@
&
colț|colt
colț|colt|colț cu|colt cu
între|intre
la colțul de pe|la coltul de pe
și|si
@@ -1 +1 @@
intrare
intrare|intrarea

@@ -4,4 +4,4 @@ din
in apropiere de
în apropiere|in apropiere
în jurul aici|in jurul aici
lângă mine|langa mine
lângă mine|langa mine|lângă|langa

@@ -1 +1 @@
număr|numar|nr|nº|n°|#|№|no
număr|numar|nr|nº|n°|#|№|no|numarul|numărul

@@ -8,6 +8,7 @@ general|gen
major|maj
locotenent
locotenent colonel
pictor
profesor|prof
sergent
sublocotenent

resources/dictionaries/ro/place_names.txt  (new file, 3 lines)
@@ -0,0 +1,3 @@
cinema
cafenea
fabrica
@@ -1 +1,7 @@
bloc|bl
cartier|cartierul
comuna|comunā
kilometrul|kilometru|km
sat|satul
sector|sectorul|sect
zona

@@ -1,2 +1,3 @@
și|si|&
cel
intre

@@ -1,12 +1,12 @@
aleea|ale|alea|al
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
calea|cal
drumul
calea|cale|cal
drumul|drum
fundătura|fundatura|fnd
fundacul|fdc
intrarea|int|intr
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
strada|str
strada|str|st
stradela|str-la|sdla
șoseaua|soseaua|sos|șos
splaiul|sp|spl

resources/dictionaries/ro/synonyms.txt  (new file, 1 line)
@@ -0,0 +1 @@
decembrie|dec
@@ -1,4 +1,8 @@
apartament|ap|apt|apart
apartamentul|apartament|ap|apt|apart
birou
cladire|cladirea|clădire|clădirea
corp|corpul
complex
interior|int
lotul
sală|sala
@@ -1,4 +1,4 @@
requests==2.9.1
requests==2.20.0
six==1.10.0
PyYAML==3.11
PyYAML==5.4
ujson==1.33
@@ -1,7 +1,7 @@
boto3==1.4.0
botocore==1.4.53
Fiona==1.6.3.post1
PyYAML==3.11
PyYAML==5.4
Rtree==0.8.2
Shapely==1.5.14
Unidecode==0.4.19
@@ -17,7 +17,7 @@ gevent==1.1.2
greenlet==0.4.10
jmespath==0.9.0
leveldb==0.193
lxml==3.6.0
lxml==4.6.3
lru-dict==1.1.3
marisa-trie==0.7.2
numpy==1.10.4
@@ -29,7 +29,7 @@ pyproj==1.9.5.1
pystache==0.5.4
python-Levenshtein==0.12.0
python-geohash==0.8.5
requests==2.9.1
requests==2.20.0
s3transfer==0.1.3
six==1.10.0
ujson==1.35

@@ -42,7 +42,8 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
        address_expansion_t expansion = expansions[i];
        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
            if (string_contains(canonical, " ")) {
            bool is_possible_acronym = string_contains(canonical, " ") || (phrase.len == 1 && address_expansion_in_dictionary(expansion, DICTIONARY_DIRECTIONAL));
            if (is_possible_acronym) {
                for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                    existing_acronyms[j] = 1;
                }

@@ -1,4 +1,5 @@
#include "crf_context.h"
#include "float_utils.h"

crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
    crf_context_t *context = malloc(sizeof(crf_context_t));
@@ -39,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
    }

    if (context->flag & CRF_CONTEXT_MARGINALS) {
#ifdef USE_SSE
        context->exp_state = double_matrix_new_aligned(T, L, 16);
#if defined(USE_SSE)
        context->exp_state = double_matrix_new_aligned(T, L, 32);
        if (context->exp_state == NULL) goto exit_context_created;
        double_matrix_zero(context->exp_state);
#else
@@ -51,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
        context->mexp_state = double_matrix_new_zeros(T, L);
        if (context->mexp_state == NULL) goto exit_context_created;

#ifdef USE_SSE
        context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
#if defined(USE_SSE)
        context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
        if (context->exp_state_trans == NULL) goto exit_context_created;
        double_matrix_zero(context->exp_state_trans);
#else
@@ -63,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
        context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
        if (context->mexp_state_trans == NULL) goto exit_context_created;

#ifdef USE_SSE
        context->exp_trans = double_matrix_new_aligned(L, L, 16);
#if defined(USE_SSE)
        context->exp_trans = double_matrix_new_aligned(L, L, 32);
        if (context->exp_trans == NULL) goto exit_context_created;
        double_matrix_zero(context->exp_trans);
#else
@@ -129,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {

    if (self->flag & CRF_CONTEXT_MARGINALS &&
        (
#ifdef USE_SSE
        !double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
#if defined(USE_SSE)
        !double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
#else
        !double_matrix_resize(self->exp_state, T, L) ||
#endif
        !double_matrix_resize(self->mexp_state, T, L) ||
#ifdef USE_SSE
        !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
#if defined(USE_SSE)
        !double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
#else
        !double_matrix_resize(self->exp_state_trans, T, L * L) ||
#endif
@@ -183,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
    }

    if (self->exp_state != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
        double_matrix_destroy_aligned(self->exp_state);
#else
        double_matrix_destroy(self->exp_state);
@@ -199,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
    }

    if (self->exp_state_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
        double_matrix_destroy_aligned(self->exp_state_trans);
#else
        double_matrix_destroy(self->exp_state_trans);
@@ -215,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
    }

    if (self->exp_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
        double_matrix_destroy_aligned(self->exp_trans);
#else
        double_matrix_destroy(self->exp_trans);

src/expand.c  (43 changed lines)
@@ -15,6 +15,14 @@
#include "token_types.h"
#include "transliterate.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif


#define DEFAULT_KEY_LEN 32

@@ -878,6 +886,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
            log_debug("have_ambiguous = %d\n", have_ambiguous);
            log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable);
            log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);

        }

        bool skipped_last_edge_phrase = false;
@@ -913,7 +922,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
            }

            if (token.type != WHITESPACE) {
                if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) {
                if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) || (prev_phrase.start == phrase.start && prev_phrase.len == phrase.len) ) {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                    string_tree_finalize_token(tree);
@@ -1536,6 +1545,29 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *



void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    char **temp_languages = calloc(1, sizeof(char *));
    libpostal_normalize_options_t temp_options = options;

    for (size_t i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];

        temp_languages[0] = lang;
        temp_options.languages = temp_languages;
        temp_options.num_languages = 1;
        expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
    }

    if (options.num_languages == 0) {
        temp_options.languages = options.languages;
        temp_options.num_languages = options.num_languages;
        expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
    }

    free(temp_languages);
}


cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) {
    options.address_components |= LIBPOSTAL_ADDRESS_ANY;

@@ -1543,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt

    size_t len = strlen(input);

    language_classifier_response_t *lang_response = NULL;
    libpostal_language_classifier_response_t *lang_response = NULL;

    if (options.num_languages == 0) {
        lang_response = classify_languages(input);
@@ -1566,7 +1598,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt

    if (string_tree_num_strings(tree) == 1) {
        char *normalized = string_tree_get_alternative(tree, 0, 0);
        expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option);
        expand_alternative_phrase_option_languages(strings, unique_strings, normalized, options, phrase_option);

    } else {
        log_debug("Adding alternatives for multiple normalizations\n");
@@ -1587,7 +1619,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
            char_array_terminate(temp_string);
            token = char_array_get_string(temp_string);
            log_debug("current permutation = %s\n", token);
            expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option);
            expand_alternative_phrase_option_languages(strings, unique_strings, token, options, phrase_option);
        }

        string_tree_iterator_destroy(iter);
@@ -1603,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
    kh_destroy(str_set, unique_strings);

    if (lang_response != NULL) {
        language_classifier_response_destroy(lang_response);
        libpostal_language_classifier_response_destroy(lang_response);
    }

    char_array_destroy(temp_string);
@@ -1612,7 +1644,6 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
    *n = cstring_array_num_strings(strings);

    return strings;

}

cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {

@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
}

inline uint32_t file_deserialize_uint32(unsigned char *buf) {
    return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
    return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
}

bool file_read_uint32(FILE *file, uint32_t *value) {
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {


inline uint16_t file_deserialize_uint16(unsigned char *buf) {
    return (buf[0] << 8) | buf[1];
    return ((uint16_t)buf[0] << 8) | buf[1];
}

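As an aside on why the new casts matter: `buf[0]` is an `unsigned char`, so it is promoted to a signed `int` before the shift, and shifting a byte with its high bit set left by 24 positions lands in the sign bit, which is undefined behavior in C. Widening each byte to `uint32_t` first keeps the whole computation unsigned. A minimal standalone sketch (not part of the patch):

```
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same expression as the patched file_deserialize_uint32: every byte is
 * widened to uint32_t before shifting, so the arithmetic stays unsigned. */
static uint32_t deserialize_uint32_be(const unsigned char *buf) {
    return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
           ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
}

int main(void) {
    unsigned char buf[4] = {0xff, 0x00, 0x00, 0x01};   /* high bit set in the first byte */
    printf("%" PRIu32 "\n", deserialize_uint32_be(buf)); /* 4278190081 (0xff000001) */
    return 0;
}
```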
@@ -292,16 +292,28 @@ static int geohashstr_to_interleaved(char *r, size_t length, uint16_t *interleav
        if(j== 0) i[0] = map[c[ 0]]<<11;
        if(j== 1) i[0] += map[c[ 1]]<< 6;
        if(j== 2) i[0] += map[c[ 2]]<< 1;
        if(j== 3) i[0] += map[c[ 3]]>> 4;
        if(j== 3) {
            i[0] += map[c[ 3]]>> 4;
            i[1] = map[c[ 3]]<<12;
        }
        if(j== 4) i[1] += map[c[ 4]]<< 7;
        if(j== 5) i[1] += map[c[ 5]]<< 2;
        if(j== 6) i[1] += map[c[ 6]]>> 3;
        if(j== 6) {
            i[1] += map[c[ 6]]>> 3;
            i[2] = map[c[ 6]]<<13;
        }
        if(j== 7) i[2] += map[c[ 7]]<< 8;
        if(j== 8) i[2] += map[c[ 8]]<< 3;
        if(j== 9) i[2] += map[c[ 9]]>> 2;
        if(j== 9) {
            i[2] += map[c[ 9]]>> 2;
            i[3] = map[c[ 9]]<<14;
        }
        if(j==10) i[3] += map[c[10]]<< 9;
        if(j==11) i[3] += map[c[11]]<< 4;
        if(j==12) i[3] += map[c[12]]>> 1;
        if(j==12) {
            i[3] += map[c[12]]>> 1;
            i[4] = map[c[12]]<<15;
        }
        if(j==13) i[4] += map[c[13]]<<10;
        if(j==14) i[4] += map[c[14]]<< 5;
        if(j==15) i[4] += map[c[15]]>> 0;

@@ -29,12 +29,6 @@
#define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b)

unsigned short _rand48_seed[3];

unsigned short _rand48_mult[3];

unsigned short _rand48_add;

void _dorand48(unsigned short xseed[3]);

double erand48(unsigned short xseed[3]);

@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
    return language_classifier;
}

void language_classifier_response_destroy(language_classifier_response_t *self) {
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
    if (self == NULL) return;
    if (self->languages != NULL) {
        free(self->languages);
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
    free(self);
}

language_classifier_response_t *classify_languages(char *address) {
libpostal_language_classifier_response_t *classify_languages(char *address) {
    language_classifier_t *classifier = get_language_classifier();

    if (classifier == NULL) {
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
    size_t n = classifier->num_labels;
    double_matrix_t *p_y = double_matrix_new_zeros(1, n);

    language_classifier_response_t *response = NULL;
    libpostal_language_classifier_response_t *response = NULL;
    bool model_exp = false;
    if (classifier->weights_type == MATRIX_DENSE) {
        model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {

    free(indices);

    response = malloc(sizeof(language_classifier_response_t));
    response = malloc(sizeof(libpostal_language_classifier_response_t));
    response->num_languages = num_languages;
    response->languages = languages;
    response->probs = probs;

@@ -6,6 +6,8 @@
#include <stdint.h>
#include <stdbool.h>

#include "libpostal.h"

#include "collections.h"
#include "language_features.h"
#include "logistic_regression.h"
@@ -29,21 +31,14 @@ typedef struct language_classifier {
    } weights;
} language_classifier_t;


typedef struct language_classifier_response {
    size_t num_languages;
    char **languages;
    double *probs;
} language_classifier_response_t;

// General usage

language_classifier_t *language_classifier_new(void);
language_classifier_t *get_language_classifier(void);
language_classifier_t *get_language_classifier_country(void);

language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(language_classifier_response_t *self);
libpostal_language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);

void language_classifier_destroy(language_classifier_t *self);

@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
    }


    language_classifier_response_t *response = classify_languages(address);
    libpostal_language_classifier_response_t *response = classify_languages(address);
    if (response == NULL) {
        printf("Could not classify language\n");
        exit(EXIT_FAILURE);

@@ -7,6 +7,7 @@
#include "language_classifier_io.h"
#include "string_utils.h"
#include "trie_utils.h"
#include "transliterate.h"


double test_accuracy(char *filename) {
@@ -33,7 +34,7 @@ double test_accuracy(char *filename) {
            continue;
        }

        language_classifier_response_t *response = classify_languages(address);
        libpostal_language_classifier_response_t *response = classify_languages(address);
        if (response == NULL || response->num_languages == 0) {
            printf("%s\tNULL\t%s\n", language, address);
            continue;

@@ -85,6 +85,17 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(
    return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
}

char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes) {
    cstring_array *strings = name_word_hashes(name, normalize_options);
    if (strings == NULL) {
        *num_hashes = 0;
        return NULL;
    }
    *num_hashes = cstring_array_num_strings(strings);
    return cstring_array_to_strings(strings);
}

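A short sketch of how the new `libpostal_near_dupe_name_hashes` entry point might be called from client code (illustrative only, not from the patch; the venue name is made up, and the base libpostal data is assumed to be installed and loaded):

```
#include <stdio.h>
#include <stdlib.h>
#include <libpostal/libpostal.h>

int main(void) {
    if (!libpostal_setup()) return EXIT_FAILURE;

    libpostal_normalize_options_t options = libpostal_get_default_options();
    char name[] = "St John's Pub";  /* hypothetical venue name */

    size_t num_hashes = 0;
    char **hashes = libpostal_near_dupe_name_hashes(name, options, &num_hashes);
    for (size_t i = 0; i < num_hashes; i++) {
        printf("%s\n", hashes[i]);
    }

    /* each hash string and the array itself are heap-allocated */
    for (size_t i = 0; i < num_hashes; i++) free(hashes[i]);
    free(hashes);

    libpostal_teardown();
    return EXIT_SUCCESS;
}
```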
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
    cstring_array *strings = near_dupe_hashes(num_components, labels, values, options);
    if (strings == NULL) {
@@ -108,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels


char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
    language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
    libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
    if (lang_response == NULL) {
        *num_languages = 0;
        return NULL;
@@ -202,6 +213,30 @@ libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t nu
    return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
}

libpostal_language_classifier_response_t *libpostal_classify_language(char *address) {
    libpostal_language_classifier_response_t *response = classify_languages(address);

    if (response == NULL) {
        log_error("Language classification returned NULL\n");
        return NULL;
    }

    return response;
}

void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
    if (self == NULL) return;
    if (self->languages != NULL) {
        free(self->languages);
    }

    if (self->probs) {
        free(self->probs);
    }

    free(self);
}


void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
    if (self == NULL) return;
@@ -262,19 +297,21 @@ bool libpostal_setup_datadir(char *datadir) {
        address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
    }

    bool setup_succeed = true;

    if (!transliteration_module_setup(transliteration_path)) {
        log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
        return false;
        setup_succeed = false;
    }

    if (!numex_module_setup(numex_path)) {
    if (setup_succeed && !numex_module_setup(numex_path)) {
        log_error("Error loading numex module, dir=%s\n", numex_path);
        return false;
        setup_succeed = false;
    }

    if (!address_dictionary_module_setup(address_dictionary_path)) {
    if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
        log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
        return false;
        setup_succeed = false;
    }

    if (transliteration_path != NULL) {
@@ -289,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
        free(address_dictionary_path);
    }

    return true;
    return setup_succeed;
}

bool libpostal_setup(void) {
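For callers the contract is unchanged by this refactor: `libpostal_setup_datadir()` still signals failure through its boolean return value, it just finishes freeing the path strings before returning instead of exiting early. A minimal calling sketch (the data directory below is a hypothetical path):

```
#include <stdio.h>
#include <stdlib.h>
#include <libpostal/libpostal.h>

int main(void) {
    char datadir[] = "/opt/libpostal_data";  /* hypothetical data directory */

    if (!libpostal_setup_datadir(datadir)) {
        fprintf(stderr, "libpostal base data failed to load from %s\n", datadir);
        return EXIT_FAILURE;
    }

    /* ... use libpostal_expand_address, etc. ... */

    libpostal_teardown();
    return EXIT_SUCCESS;
}
```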
@@ -167,6 +167,19 @@ LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(ch

LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);

/*
Language classification
*/

typedef struct libpostal_language_classifier_response {
    size_t num_languages;
    char **languages;
    double *probs;
} libpostal_language_classifier_response_t;

LIBPOSTAL_EXPORT libpostal_language_classifier_response_t *libpostal_classify_language(char *address);

LIBPOSTAL_EXPORT void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self);

/*
Deduping
@@ -191,8 +204,8 @@ typedef struct libpostal_near_dupe_hash_options {
    bool address_only_keys;
} libpostal_near_dupe_hash_options_t;


LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);

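A minimal sketch of the new public language-classification API declared above (not from the patch; it assumes the language classifier data files are installed and loaded, which in the current public API is done with `libpostal_setup_language_classifier()`):

```
#include <stdio.h>
#include <stdlib.h>
#include <libpostal/libpostal.h>

int main(void) {
    if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
        fprintf(stderr, "failed to load libpostal data\n");
        return EXIT_FAILURE;
    }

    char address[] = "calle ochenta y dos 24";  /* made-up input */
    libpostal_language_classifier_response_t *response = libpostal_classify_language(address);
    if (response != NULL) {
        for (size_t i = 0; i < response->num_languages; i++) {
            printf("%s\t%f\n", response->languages[i], response->probs[i]);
        }
        libpostal_language_classifier_response_destroy(response);
    }

    libpostal_teardown_language_classifier();
    libpostal_teardown();
    return EXIT_SUCCESS;
}
```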
@@ -1,232 +0,0 @@
#!/bin/sh

set -e

if [ "$#" -lt 3 ]; then
    echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
    exit 1
fi

LIBPOSTAL_VERSION_STRING="v1"
LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0"

LIBPOSTAL_REPO_NAME="openvenues/libpostal"
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"

GITHUB_API_URL="https://api.github.com"
LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases"

LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/parser.tar.gz"
LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/libpostal_data.tar.gz"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/language_classifier.tar.gz"

COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3

LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

mkdir -p $LIBPOSTAL_DATA_DIR

LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier

BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

export LC_ALL=C

EPOCH_DATE="Jan 1 00:00:00 1970"

MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB))

LARGE_FILE_SIZE=$((CHUNK_SIZE*2))

NUM_WORKERS=12

kill_background_processes() {
    jobs -p | xargs kill;
    exit
}

trap kill_background_processes INT

PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"

download_multipart() {
    url=$1
    filename=$2
    size=$3

    num_chunks=$((size/CHUNK_SIZE))
    echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks"
    offset=0
    i=0
    while [ $i -lt $num_chunks ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        if [ $i -lt $num_chunks ]; then
            max=$((offset+CHUNK_SIZE-1));
        else
            max=$size;
        fi;
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --

    > $local_path

    i=0
    while [ $i -lt $num_chunks ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat $part_filename >> $local_path
        rm $part_filename
    done;
}

download_file() {
    updated_path=$1
    data_dir=$2
    metadata_url=$3
    url=$4
    size=$5
    filename=$6
    name=$7
    shift 7
    subdirs=$@

    local_path=$data_dir/$filename

    if [ ! -e $updated_path ]; then
        echo "$EPOCH_DATE" > $updated_path;
    fi;

    echo "Checking for new libpostal $name..."

    if [ $(curl -LsI $metadata_url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
        echo "New libpostal $name available"

        if [ $size -ge $LARGE_FILE_SIZE ]; then
            download_multipart $url $local_path $size
        else
            curl -L $url --retry 3 --retry-delay 2 -o $local_path
        fi

        if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
            echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path;
        elif stat -f %Sm . >/dev/null 2>&1; then
            echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
        fi;
        for subdir in $subdirs; do
            rm -rf $data_dir/$subdir;
        done
        tar -xvzf $local_path --no-same-owner -C $data_dir;
        rm $local_path;
    else
        echo "libpostal $name up to date"
    fi
}

if [ $COMMAND = "download" ]; then
    if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE)
    fi

    if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
        echo "Old version of datadir detected, removing..."
        for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
            rm -rf $LIBPOSTAL_DATA_DIR/$subdir;
        done

        # Legacy, blow it away too to be nice
        if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then
            rm -rf $LIBPOSTAL_DATA_DIR/geodb;
        fi

        rm -f $LIBPOSTAL_DATA_UPDATED_PATH
        rm -f $LIBPOSTAL_LANG_CLASS_UPDATED_PATH
        rm -f $LIBPOSTAL_PARSER_UPDATED_PATH
    fi

    mkdir -p $LIBPOSTAL_DATA_DIR

    release_id=$(curl -s $LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*')
    release_assets="$(curl -s $LIBPOSTAL_RELEASE_API_URL/$release_id/assets)"

    asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp"
    echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_names_tempfile
    asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp"
    echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_metadata_tempfile
    asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp"
    echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_urls_tempfile
    asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp"
    echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > $asset_sizes_tempfile

    assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp"
    paste -d' ' $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile > $assets_tempfile

    rm $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile

    while read -r line; do
        asset=$(echo "$line" | cut -f1 -d' ')
        asset_metadata_url=$(echo "$line" | cut -f2 -d' ')
        asset_url=$(echo "$line" | cut -f3 -d' ')
        asset_size=$(echo "$line" | cut -f4 -d' ')

        if [ $asset = $LIBPOSTAL_DATA_FILE ] && ([ $FILE = "base" ] || [ $FILE = "all" ]); then
            download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
        fi
        if [ $asset = $LIBPOSTAL_PARSER_FILE ] && ([ $FILE = "parser" ] || [ $FILE = "all" ]); then
            download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
        fi
        if [ $asset = $LIBPOSTAL_LANG_CLASS_FILE ] && ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then
            download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
        fi

        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
            echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE;
        fi
    done < $assets_tempfile;
    rm $assets_tempfile

elif [ $COMMAND = "upload" ]; then
    echo "upload not implemented yet"

    #if [ $FILE = "base" ] || [ $FILE = "all" ]; then
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
    #fi

    #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
    #    latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
    #    parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
    #fi

    #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
    #    latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
    #    lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
    #fi
else
    echo "Invalid command: $COMMAND"
    exit 1
fi
189
src/libpostal_data.in
Executable file
189
src/libpostal_data.in
Executable file
@@ -0,0 +1,189 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
|
||||
if [ "$#" -lt 3 ]; then
|
||||
echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
COMMAND=$1
|
||||
FILE=$2
|
||||
LIBPOSTAL_DATA_DIR=$3
|
||||
|
||||
MB=$((1024*1024))
|
||||
CHUNK_SIZE=$((64*$MB))
|
||||
|
||||
DATAMODEL="@MODEL@"
|
||||
|
||||
# Not loving this approach but there appears to be no way to query the size
|
||||
# of a release asset without using the Github API
|
||||
LIBPOSTAL_DATA_FILE_CHUNKS=1
|
||||
LIBPOSTAL_PARSER_MODEL_CHUNKS=12
|
||||
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
|
||||
|
||||
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_DATA_DIR_VERSION_STRING@"
|
||||
|
||||
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_DATA_FILE_LATEST_VERSION@"
|
||||
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_PARSER_MODEL_LATEST_VERSION@"
|
||||
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION@"
|
||||
|
||||
LIBPOSTAL_REPO_NAME="openvenues/libpostal"
|
||||
|
||||
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
||||
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
||||
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||
|
||||
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"

if [ "$DATAMODEL" = "senzing" ]; then
    LIBPOSTAL_DATA_FILE_CHUNKS=1
    LIBPOSTAL_PARSER_MODEL_CHUNKS=1
    LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

    LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"

    LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
    LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
    LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"

    LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
fi

LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

mkdir -p $LIBPOSTAL_DATA_DIR

LIBPOSTAL_DATA_FILE_VERSION_PATH=$LIBPOSTAL_DATA_DIR/base_data_file_version
LIBPOSTAL_PARSER_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/parser_model_file_version
LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/language_classifier_model_file_version

BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

NUM_WORKERS=12

kill_background_processes() {
    jobs -p | xargs kill;
    exit
}

trap kill_background_processes INT

PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"


download_release_multipart() {
    url=$1
    filename=$2
    num_chunks=$3

    echo "Downloading multipart: $url, num_chunks=$num_chunks"
    offset=0
    i=0
    while [ $i -lt $num_chunks ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        max=$((offset+CHUNK_SIZE-1));
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --

    > $filename

    i=0
    while [ $i -lt $num_chunks ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat $part_filename >> $filename
        rm $part_filename
    done;

}


download_release() {
    version_file_path=$1
    version=$2
    data_dir=$3
    num_chunks=$4
    filename=$5
    name=$6
    shift 6
    subdirs=$@

    local_path=$data_dir/$filename

    url=$LIBPOSTAL_BASE_URL/$version/$filename

    if [ ! -e $version_file_path ]; then
        current_version=""
    else
        current_version="$(cat $version_file_path)"

    fi;

    echo "Checking for new libpostal $name..."

    if [ "$current_version" != "$version" ]; then
        echo "New libpostal $name available"

        if [ $num_chunks -gt 1 ]; then
            download_release_multipart $url $local_path $num_chunks
        else
            curl -L $url --retry 3 --retry-delay 2 -o $local_path
        fi

        for subdir in $subdirs; do
            rm -rf $data_dir/$subdir;
        done
        tar -xvzf $local_path --no-same-owner -C $data_dir;
        rm $local_path;
        echo "$version" > $version_file_path;
    else
        echo "libpostal $name up to date"
    fi
}

if [ $COMMAND = "download" ]; then
    if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE)

        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then
            echo "Old version of datadir detected, removing..."
            for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
                rm -rf $LIBPOSTAL_DATA_DIR/$subdir;
            done

            # Legacy, blow it away too to be nice
            if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then
                rm -rf $LIBPOSTAL_DATA_DIR/geodb;
            fi

            rm -f $LIBPOSTAL_DATA_DIR/last_updated*
            rm -f $LIBPOSTAL_DATA_DIR/*_version
        fi
    fi

    mkdir -p $LIBPOSTAL_DATA_DIR

    if ([ $FILE = "base" ] || [ $FILE = "all" ]); then
        download_release $LIBPOSTAL_DATA_FILE_VERSION_PATH $LIBPOSTAL_DATA_FILE_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE_CHUNKS $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
    fi
    if ([ $FILE = "parser" ] || [ $FILE = "all" ]); then
        download_release $LIBPOSTAL_PARSER_MODEL_VERSION_PATH $LIBPOSTAL_PARSER_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_MODEL_CHUNKS $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
    fi
    if ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then
        download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
    fi

    echo "$LIBPOSTAL_DATA_DIR_VERSION_STRING" > $LIBPOSTAL_DATA_VERSION_FILE

else
    echo "Invalid command: $COMMAND"
    exit 1
fi
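Note on the multipart download above: the script precomputes byte ranges of CHUNK_SIZE, fetches them in parallel via xargs -P and curl's Range header, then concatenates the parts in order. The same ranged-request idea in C with libcurl looks roughly like the sketch below (illustrative only, not part of this commit; download_chunk is a hypothetical helper and error handling is minimal):

#include <stdio.h>
#include <curl/curl.h>

/* Fetch one byte range of a remote file into its own part file,
 * mirroring what PART_CURL does with "Range:bytes=$offset-$max". */
static int download_chunk(const char *url, const char *part_path, long offset, long max) {
    CURL *curl = curl_easy_init();
    if (curl == NULL) return 1;

    FILE *out = fopen(part_path, "wb");
    if (out == NULL) {
        curl_easy_cleanup(curl);
        return 1;
    }

    char range[64];
    snprintf(range, sizeof(range), "%ld-%ld", offset, max);

    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); /* like curl -L */
    curl_easy_setopt(curl, CURLOPT_RANGE, range);       /* request only this byte range */
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);     /* default write callback writes to this FILE* */

    CURLcode res = curl_easy_perform(curl);

    fclose(out);
    curl_easy_cleanup(curl);
    return res == CURLE_OK ? 0 : 1;
}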
18
src/main.c
@@ -13,10 +13,16 @@

#define LIBPOSTAL_USAGE "Usage: ./libpostal address [...languages] [--json]\n"

static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json) {
static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json, bool root_expansions) {
    size_t num_expansions;

    char **expansions = libpostal_expand_address(address, options, &num_expansions);
    char **expansions;

    if (!root_expansions) {
        expansions = libpostal_expand_address(address, options, &num_expansions);
    } else {
        expansions = libpostal_expand_address_root(address, options, &num_expansions);
    }

    char *normalized;

@@ -45,9 +51,9 @@ int main(int argc, char **argv) {
    char *arg;

    char *address = NULL;
    char *language = NULL;

    bool use_json = false;
    bool root_expansions = false;

    string_array *languages = NULL;

@@ -58,6 +64,8 @@ int main(int argc, char **argv) {
            exit(EXIT_SUCCESS);
        } else if (string_equals(arg, "--json")) {
            use_json = true;
        } else if (string_equals(arg, "--root")) {
            root_expansions = true;
        } else if (address == NULL) {
            address = arg;
        } else if (!string_starts_with(arg, "-")) {
@@ -87,11 +95,11 @@ int main(int argc, char **argv) {
    if (address == NULL) {
        char *line;
        while ((line = file_getline(stdin)) != NULL) {
            print_output(line, options, use_json);
            print_output(line, options, use_json, root_expansions);
            free(line);
        }
    } else {
        print_output(address, options, use_json);
        print_output(address, options, use_json, root_expansions);
    }

    if (languages != NULL) {
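The new --root flag above switches the CLI from libpostal_expand_address to libpostal_expand_address_root, which expands only the root of the string (street types, directionals, and similar affixes dropped, as the root-expansion tests later in this diff show). A minimal sketch of driving both entry points from the C API (the include path and sample address are illustrative; output handling is simplified):

#include <stdio.h>
#include <libpostal/libpostal.h>

int main(void) {
    /* Load the base data and the language classifier used by expansion */
    if (!libpostal_setup() || !libpostal_setup_language_classifier()) return 1;

    libpostal_normalize_options_t options = libpostal_get_default_options();
    size_t num_expansions = 0;

    /* Full expansions, as the CLI does without --root */
    char **expansions = libpostal_expand_address("W 26th St", options, &num_expansions);
    for (size_t i = 0; i < num_expansions; i++) printf("%s\n", expansions[i]);
    libpostal_expansion_array_destroy(expansions, num_expansions);

    /* Root expansions, as the CLI does with --root */
    expansions = libpostal_expand_address_root("W 26th St", options, &num_expansions);
    for (size_t i = 0; i < num_expansions; i++) printf("%s\n", expansions[i]);
    libpostal_expansion_array_destroy(expansions, num_expansions);

    libpostal_teardown();
    libpostal_teardown_language_classifier();
    return 0;
}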
@@ -6,7 +6,9 @@
#include <stdint.h>
#include <stdbool.h>

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "collections.h"
#include "file_utils.h"
@@ -60,7 +62,7 @@ typedef enum {
    matrix->m = m; \
    matrix->n = n; \
    \
    matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \
    matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
    if (matrix->values == NULL) { \
        free(matrix); \
        return NULL; \
@@ -84,7 +86,7 @@ typedef enum {
    if (self == NULL) return; \
    \
    if (self->values != NULL) { \
        _aligned_free(self->values); \
        aligned_free(self->values); \
    } \
    \
    free(self); \
@@ -116,7 +118,7 @@ typedef enum {
    if (self == NULL) return false; \
    \
    if (m * n > (self->m * self->n)) { \
        type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \
        type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
        if (ptr == NULL) { \
            return false; \
        } \

@@ -144,6 +144,7 @@ bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) {


cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) {
    char *expansion;
    size_t num_expansions = 0;
    cstring_array *expansions = expand_address(input, options, &num_expansions);

@@ -160,7 +161,6 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti
        return root_expansions;
    } else {
        khash_t(str_set) *unique_strings = kh_init(str_set);
        char *expansion;
        khiter_t k;
        int ret;

@@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
        log_debug("token_str = %s\n", token_str);

        add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams);
        add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
    // For non-Latin words (Arabic, Cyrillic, etc.) just add the word
    // For ideograms, we do two-character shingles, so only add the first character if the string has one token
    } else if (!ideogram || j > 0 || num_tokens == 1) {
@@ -640,7 +641,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes,


cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) {
    if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL;
    if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_small_containing_boundaries && !options.with_postal_code) return NULL;

    place_t *place = place_from_components(num_components, labels, values);
    log_debug("created place\n");
@@ -669,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,

    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();

    language_classifier_response_t *lang_response = NULL;
    libpostal_language_classifier_response_t *lang_response = NULL;

    if (num_languages == 0) {
        lang_response = place_languages(num_components, labels, values);

@@ -8,6 +8,7 @@
#include "libpostal.h"
#include "string_utils.h"

cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options);
cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options);
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages);


@@ -434,7 +434,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
        char_array_append(array, " ");
        append_char = false;
    } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
        append_char = !is_hyphen_between_letter_and_number;
        append_char = is_hyphen_between_letter_and_number;
    }

    if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {

@@ -5,6 +5,15 @@

#include "log/log.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif


#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB

#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"

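The numex change above guards #include "strndup.h" behind HAVE_STRNDUP (checked by configure), so a local fallback is only compiled where libc lacks strndup, e.g. MinGW. Such a fallback typically looks like the sketch below; this is illustrative only, and the bundled strndup.h may differ in detail:

#include <stdlib.h>
#include <string.h>

/* Copy at most n bytes of s into a freshly allocated, NUL-terminated string,
 * mirroring POSIX strndup for platforms that do not provide it. */
char *strndup(const char *s, size_t n) {
    size_t len = 0;
    while (len < n && s[len] != '\0') len++;   /* length capped at n */

    char *copy = malloc(len + 1);
    if (copy == NULL) return NULL;

    memcpy(copy, s, len);
    copy[len] = '\0';
    return copy;
}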
@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
    );
}

language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
    if (num_components == 0 || values == NULL || labels == NULL) return NULL;

    language_classifier_response_t *lang_response = NULL;
    libpostal_language_classifier_response_t *lang_response = NULL;

    char *label;
    char *value;

@@ -32,7 +32,7 @@ typedef struct place {
    char *website;
} place_t;

language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);

place_t *place_new(void);

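These hunks rename the classifier response to the public libpostal_language_classifier_response_t. For context, language classification is also reachable through the public API, roughly as in the sketch below; the exact function names (libpostal_classify_language, libpostal_language_classifier_response_destroy) and struct fields (num_languages, languages, probs) are assumptions here and should be verified against the installed libpostal.h:

#include <stdio.h>
#include <libpostal/libpostal.h>

int main(void) {
    if (!libpostal_setup() || !libpostal_setup_language_classifier()) return 1;

    /* Classify the likely language(s) of a free-form address string */
    libpostal_language_classifier_response_t *response =
        libpostal_classify_language("quatre vingt douze avenue des champs-elysees");

    if (response != NULL) {
        for (size_t i = 0; i < response->num_languages; i++) {
            printf("%s (%f)\n", response->languages[i], response->probs[i]);
        }
        libpostal_language_classifier_response_destroy(response);
    }

    libpostal_teardown_language_classifier();
    libpostal_teardown();
    return 0;
}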
@@ -4,6 +4,8 @@

#include "string_utils.h"

#include "file_utils.h"

// Run shuf/gshuf on a file in-place if the shuf command is available.
bool shuffle_file(char *filename) {
    char *shuffle_command = NULL;

@@ -157,8 +157,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
    uint32_array **t1_tokens_unicode = NULL;
    uint32_array **t2_tokens_unicode = NULL;

    uint32_array *t1_unicode;
    uint32_array *t2_unicode;
    uint32_array *t1_unicode = NULL;
    uint32_array *t2_unicode = NULL;

    int64_array *phrase_memberships_array1 = NULL;
    int64_array *phrase_memberships_array2 = NULL;
@@ -232,8 +232,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
        }
    }

    uint32_t *suffixes1;
    uint32_t *suffixes2;
    uint32_t *suffixes1 = NULL;
    uint32_t *suffixes2 = NULL;

    if (ordinal_suffixes1 != NULL && ordinal_suffixes2 != NULL) {
        suffixes1 = ordinal_suffixes1->a;

@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
}

uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
    khash_t(int_set) *unique_columns = kh_init(int_set);
    khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
    uint32_array *ret = uint32_array_new();

    if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
        kh_destroy(int_set, unique_columns);
        kh_destroy(int_uint32, unique_columns);
        return ret;
    }

    kh_destroy(int_set, unique_columns);
    kh_destroy(int_uint32, unique_columns);
    uint32_array_destroy(ret);
    return NULL;
}

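The sparse-matrix hunk above swaps the int_set khash for an int_uint32 map and destroys it under the matching type name. For reference, the khash map pattern used here looks like this in isolation (the col_counts type is declared locally for the example; libpostal's int_uint32 is presumably declared in its collections headers):

#include <stdio.h>
#include <stdint.h>
#include "khash.h"

/* A hash map from int keys to uint32_t values, analogous to int_uint32 */
KHASH_MAP_INIT_INT(col_counts, uint32_t)

int main(void) {
    khash_t(col_counts) *counts = kh_init(col_counts);

    int ret;
    khiter_t k = kh_put(col_counts, counts, 42, &ret);   /* insert key 42 */
    kh_value(counts, k) = 1;                              /* set its value */

    k = kh_get(col_counts, counts, 42);                   /* look it up */
    if (k != kh_end(counts)) {
        printf("column 42 -> %u\n", kh_value(counts, k));
    }

    kh_destroy(col_counts, counts);                       /* freed with the matching type name */
    return 0;
}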
8853
src/sse2neon.h
Normal file
File diff suppressed because it is too large
@@ -78,7 +78,7 @@ typedef struct {
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0}


static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result) {
    uint32_t node_id = result.node_id;
    if (node_id == NULL_NODE_ID) return NULL;

@@ -834,11 +834,11 @@ char *transliterate(char *trans_name, char *str, size_t len) {
    log_debug("Context match\n");
    match_state = match_candidate_state;
    match_state.state = TRANS_STATE_MATCH;
    replacement = get_replacement(trie, context_result, str, match_state.phrase_start);
    replacement = get_replacement(trie, context_result);
} else {
    if (match_state.state == TRANS_STATE_MATCH) {
        log_debug("Context no match and previous match\n");
        replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
        replacement = get_replacement(trie, match_state.result);
        if (state.state != TRANS_STATE_PARTIAL_MATCH) {
            state.advance_index = false;
        }
@@ -869,7 +869,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {

    if (match_state.state == TRANS_STATE_MATCH) {
        log_debug("Match no context\n");
        replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
        replacement = get_replacement(trie, match_state.result);
    } else {

        log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
@@ -934,14 +934,24 @@ char *transliterate(char *trans_name, char *str, size_t len) {
    match_state = TRANSLITERATION_DEFAULT_STATE;
}

bool added_previous_phrase = false;

if (context_no_match && !prev_state.empty_transition && prev_state.phrase_len > 0) {
    log_debug("Previous phrase stays as is %.*s\n", (int)prev_state.phrase_len, str+prev_state.phrase_start);
    char_array_cat_len(new_str, str + prev_state.phrase_start, prev_state.phrase_len);
    added_previous_phrase = true;

    if (match_candidate_state.state != TRANS_STATE_PARTIAL_MATCH) {
        state = start_state;
    }

    if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
}

if (match_candidate_state.state != TRANS_STATE_PARTIAL_MATCH && !prev_state.empty_transition && idx + char_len == len) {
    log_debug("No replacement for %.*s\n", (int)char_len, ptr);
    char_array_cat_len(new_str, str + idx, char_len);
    state = start_state;
} else if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
    log_debug("TRANS_STATE_BEGIN && !prev_state.empty_transition\n");
    state.advance_index = false;
} else if (prev_state.empty_transition) {

47
src/vector.h
@@ -7,43 +7,44 @@

#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#include <malloc.h>
static inline void *aligned_malloc(size_t size, size_t alignment) {
    return _aligned_malloc(size, alignment);
}
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
    return _aligned_realloc(p, new_size, alignment);
}
static inline void aligned_free(void *p) {
    _aligned_free(p);
}
#else
#include <stdlib.h>
static inline void *_aligned_malloc(size_t size, size_t alignment)
static inline void *aligned_malloc(size_t size, size_t alignment)
{
    void *p;
    int ret = posix_memalign(&p, alignment, size);
    return (ret == 0) ? p : NULL;
}
static inline void *_aligned_realloc(void *p, size_t size, size_t alignment)
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
{
    if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
        return NULL;
    }

    if (size == 0) {
    if (p == NULL) {
        return NULL;
    }

    void *rp = realloc(p, size);

    /* If realloc result is not already at an aligned boundary,
       _aligned_malloc a new block and copy the contents of the realloc'd
       pointer to the aligned block, free the realloc'd pointer and return
       the aligned pointer.
    */
    if ( ((size_t)rp & (alignment - 1)) != 0) {
        void *p1 = _aligned_malloc(size, alignment);
        if (p1 != NULL) {
            memcpy(p1, rp, size);
        }
        free(rp);
        rp = p1;
    void *p1 = aligned_malloc(new_size, alignment);
    if (p1 == NULL) {
        free(p);
        return NULL;
    }

    return rp;
    memcpy(p1, p, old_size);
    free(p);
    return p1;
}
static inline void _aligned_free(void *p)
static inline void aligned_free(void *p)
{
    free(p);
}
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
    name *array = malloc(sizeof(name)); \
    if (array == NULL) return NULL; \
    array->n = array->m = 0; \
    array->a = _aligned_malloc(size * sizeof(type), alignment); \
    array->a = aligned_malloc(size * sizeof(type), alignment); \
    if (array->a == NULL) return NULL; \
    array->m = size; \
    return array; \
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
} \
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
    if (size <= array->m) return true; \
    type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \
    type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
    if (ptr == NULL) return false; \
    array->a = ptr; \
    array->m = size; \
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
} \
static inline void name##_destroy_aligned(name *array) { \
    if (array == NULL) return; \
    if (array->a != NULL) _aligned_free(array->a); \
    if (array->a != NULL) aligned_free(array->a); \
    free(array); \
}

@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
            free_func(array->a[i]); \
        } \
    } \
    _aligned_free(array->a); \
    aligned_free(array->a); \
    free(array); \
}

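The POSIX branch above replaces the _aligned_realloc shim with aligned_resize, which takes the old size so the contents can be copied into a freshly aligned block (plain realloc does not preserve alignment guarantees). A standalone sketch of the same pattern using posix_memalign directly follows; grow_aligned is local to the example, not part of the vector.h API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Grow an aligned buffer: allocate a new aligned block, copy old_size bytes,
 * and free the old block, as aligned_resize does in the non-Windows branch. */
static void *grow_aligned(void *p, size_t old_size, size_t new_size, size_t alignment) {
    void *p1 = NULL;
    if (posix_memalign(&p1, alignment, new_size) != 0) {
        free(p);
        return NULL;
    }
    memcpy(p1, p, old_size);
    free(p);
    return p1;
}

int main(void) {
    double *values = NULL;
    if (posix_memalign((void **)&values, 16, 4 * sizeof(double)) != 0) return 1;
    for (size_t i = 0; i < 4; i++) values[i] = (double)i;

    values = grow_aligned(values, 4 * sizeof(double), 8 * sizeof(double), 16);
    if (values == NULL) return 1;

    printf("%f\n", values[3]);
    free(values);
    return 0;
}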
@@ -8,7 +8,7 @@

#define ks_lt_index(a, b) ((a).value < (b).value)

#ifdef USE_SSE
#if defined(USE_SSE)
#include <emmintrin.h>
#endif

@@ -338,7 +338,7 @@



#ifdef USE_SSE
#if defined(USE_SSE)
/*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp

@@ -96,13 +96,13 @@ static greatest_test_res test_expansion_contains_phrase_option_with_languages(ch

static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) {
    bool root = false;
    if (num_languages > 0) {
    va_list args;
    if (num_languages > 0) {
        va_start(args, num_languages);
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
        va_end(args);
    } else {
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL));
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
    }
    PASS();
}
@@ -110,13 +110,13 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha

static greatest_test_res test_root_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) {
    bool root = true;
    if (num_languages > 0) {
    va_list args;
    if (num_languages > 0) {
        va_start(args, num_languages);
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
        va_end(args);
    } else {
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL));
        CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
    }
    PASS();
}
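The two test helpers above now declare va_list args ahead of the branch so the same variable can be handed to the phrase-option helper in both arms. The general shape of forwarding variadic arguments through a va_list to a v-style callee is sketched below (hypothetical helper names, unrelated to the test suite):

#include <stdarg.h>
#include <stdio.h>

/* v-style worker: consumes the arguments through a va_list */
static void log_tags_v(size_t num_tags, va_list args) {
    for (size_t i = 0; i < num_tags; i++) {
        const char *tag = va_arg(args, const char *);
        printf("tag: %s\n", tag);
    }
}

/* variadic wrapper: initializes the va_list and forwards it */
static void log_tags(size_t num_tags, ...) {
    va_list args;
    va_start(args, num_tags);
    log_tags_v(num_tags, args);
    va_end(args);
}

int main(void) {
    log_tags(2, "en", "es");
    log_tags(0);   /* no variadic arguments: the loop simply does not run */
    return 0;
}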
@@ -132,6 +132,9 @@ TEST test_expansions(void) {
    CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
    CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
    CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
    CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs-elysees", options, 1, "fr"));
    CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs elysees", options, 1, "fr"));
    CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champselysees", options, 1, "fr"));
    CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
    CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));
    CHECK_CALL(test_expansion_contains_with_languages("มงแตร", "มงแตร", options, 1, "th"));
@@ -182,6 +185,9 @@ TEST test_street_root_expansions(void) {
    CHECK_CALL(test_root_expansion_contains("Center Street E", "center", options));
    CHECK_CALL(test_root_expansion_contains("Ctr Street E", "center", options));

    CHECK_CALL(test_root_expansion_contains_with_languages("W. UNION STREET", "union", options, 2, "en", "es"));


    // Spanish
    CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options));
    PASS();

1
versions/base_data
Normal file
@@ -0,0 +1 @@
v1.0.0
1
versions/language_classifier
Normal file
@@ -0,0 +1 @@
v1.0.0
1
versions/parser
Normal file
@@ -0,0 +1 @@
v1.0.0
1
versions/senzing/base_data
Normal file
@@ -0,0 +1 @@
v1.0.0
1
versions/senzing/language_classifier
Normal file
@@ -0,0 +1 @@
v1.0.0
1
versions/senzing/parser
Normal file
@@ -0,0 +1 @@
v1.0.0
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.

m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
m4_define(LIBPOSTAL_PATCH_VERSION, [0])

AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
@@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])

AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])

DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)

AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])

AC_CONFIG_FILES([Makefile
                 libpostal.pc
                 src/Makefile
                 test/Makefile])
                 src/libpostal_data
                 test/Makefile], [chmod +x src/libpostal_data])

AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
@@ -64,6 +75,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------

AC_ARG_ENABLE([sse2],
    AS_HELP_STRING(
        [--disable-sse2],