Merge pull request #616 from oskar700/ot-senzing-datamodel
Adding senzing model from @oskar700 and @brianmacy, along with a new MODEL switch in configure
This commit is contained in:
18
README.md
18
README.md
@@ -175,6 +175,24 @@ If you require a .lib import library to link this to your application. You can g
|
||||
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
|
||||
```
|
||||
|
||||
Installation with an alternative data model
|
||||
-------------------------------------------
|
||||
|
||||
An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling.
|
||||
To enable it, add `MODEL=senzing` to the configure line during installation:
|
||||
```
|
||||
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
|
||||
```
|
||||
|
||||
The data for this model comes from [OpenAddresses](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/), and data generated by Senzing based on customer feedback (a few hundred records): a total of about 1.2 billion records from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddresses is good but not perfect, so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens, and removing tokens that didn't belong in the addresses, whenever these conditions were encountered.
|
||||
|
||||
Senzing created a data set of 12,950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, at least 50 per country. Hard-to-parse addresses were obtained from the Senzing support team and customers, and from the libpostal GitHub page, and added to this set. Using this test set, the Senzing model got 4.3% better parsing results than the default model.
|
||||
|
||||
The size of this model is about 2.2GB compared to 1.8GB for the default model, so keep that in mind if storage space is important.
|
||||
|
||||
Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
|
||||
If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data
|
||||
|
||||
Examples of parsing
|
||||
-------------------
|
||||
|
||||
|
||||
14
configure.ac
14
configure.ac
@@ -60,6 +60,17 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
|
||||
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
|
||||
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
|
||||
|
||||
# Senzing data
# Substitution variables for the alternative Senzing data model. They mirror
# the LIBPOSTAL_*_LATEST_VERSION substitutions above, but carry a SENZING
# prefix so both sets of values are available to the generated files.
|
||||
AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])
|
||||
|
||||
# Each version string is read at configure time from a one-line file under
# $srcdir/versions/senzing.
SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
|
||||
SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
|
||||
SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)
|
||||
|
||||
# Export the values read above as @LIBPOSTAL_SENZING_...@ substitutions.
AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
|
||||
AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
|
||||
AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])
|
||||
|
||||
AC_CONFIG_FILES([Makefile
|
||||
libpostal.pc
|
||||
src/Makefile
|
||||
@@ -134,6 +145,9 @@ AC_ARG_ENABLE([data-download],
|
||||
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
|
||||
esac], [DOWNLOAD_DATA=true])
|
||||
|
||||
# Declare MODEL as a documented configure variable, then validate it.
# An empty value selects the default data model and the value senzing selects
# the Senzing model. Any other value aborts configure with an error.
AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
|
||||
AS_VAR_IF([MODEL], [], [],
|
||||
[AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])
|
||||
|
||||
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ LIBPOSTAL_DATA_DIR=$3
|
||||
MB=$((1024*1024))
|
||||
CHUNK_SIZE=$((64*$MB))
|
||||
|
||||
DATAMODEL="@MODEL@"
|
||||
|
||||
# Not loving this approach but there appears to be no way to query the size
|
||||
# of a release asset without using the Github API
|
||||
LIBPOSTAL_DATA_FILE_CHUNKS=1
|
||||
@@ -34,6 +36,20 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||
|
||||
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
|
||||
|
||||
# Switch the download settings over to the Senzing data model when it was
# selected at configure time (MODEL=senzing).
#
# DATAMODEL is empty when the default model is used, so the expansion is
# quoted to keep the test expression well-formed, and the POSIX "=" operator
# is used because "==" is not accepted by every /bin/sh implementation.
if [ "$DATAMODEL" = "senzing" ]; then
    # Chunk counts for the Senzing archives are all 1.
    LIBPOSTAL_DATA_FILE_CHUNKS=1
    LIBPOSTAL_PARSER_MODEL_CHUNKS=1
    LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

    # Senzing keeps its own data directory version string.
    LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"

    # Latest published versions of the three Senzing archives.
    LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
    LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
    LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"

    # Senzing archives are served from this S3 bucket instead of the
    # GitHub releases URL used by the default model.
    LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
fi
|
||||
|
||||
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
|
||||
LIBPOSTAL_DATA_DIR_VERSION=
|
||||
|
||||
|
||||
1
versions/senzing/base_data
Normal file
1
versions/senzing/base_data
Normal file
@@ -0,0 +1 @@
|
||||
v1.0.0
|
||||
1
versions/senzing/language_classifier
Normal file
1
versions/senzing/language_classifier
Normal file
@@ -0,0 +1 @@
|
||||
v1.0.0
|
||||
1
versions/senzing/parser
Normal file
1
versions/senzing/parser
Normal file
@@ -0,0 +1 @@
|
||||
v1.0.0
|
||||
Reference in New Issue
Block a user