diff --git a/README.md b/README.md index 5cd327bc..d8e2cb9c 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,24 @@ If you require a .lib import library to link this to your application. You can g lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 ``` +Installation with an alternative data model +------------------------------------------- + +An alternative data model is available for libpostal. It was created by Senzing Inc. to improve parsing of US, UK and Singapore addresses and the handling of US rural route addresses. +To enable it, add `MODEL=senzing` to the configure line during installation: +``` +./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing +``` + +The data for this model comes from [OpenAddresses](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records): about 1.2 billion records in total, from over 230 countries and in 100+ languages. The data from OpenStreetMap and OpenAddresses is good but not perfect, so the data set was cleaned by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses wherever these conditions were encountered. + +Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM (OpenStreetMap), at least 50 per country. Hard-to-parse addresses collected from the Senzing support team, from customers and from the libpostal GitHub page were then added to this set. On this test set, the Senzing model produced 4.3% better parsing results than the default model. + +The size of this model is about 2.2GB, compared to 1.8GB for the default model, so keep that in mind if storage space is important.
+ +Further information about this data model can be found at: https://github.com/Senzing/libpostal-data +If you run into any issues with this model, whether they concern parsing results, installation or anything else, please report them at https://github.com/Senzing/libpostal-data + Examples of parsing ------------------- diff --git a/configure.ac b/configure.ac index f740be12..ed997e32 100644 --- a/configure.ac +++ b/configure.ac @@ -60,6 +60,17 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION]) +# Senzing data +AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1]) + +SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data) +SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser) +SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier) + +AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION]) + AC_CONFIG_FILES([Makefile libpostal.pc src/Makefile @@ -134,6 +145,9 @@ AC_ARG_ENABLE([data-download], *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; esac], [DOWNLOAD_DATA=true]) +AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). 
If this option is not set the default libpostal data model is used.]) +AS_VAR_IF([MODEL], [], [], + [AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])]) AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 0a3d27f2..8c18270f 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -14,6 +14,8 @@ LIBPOSTAL_DATA_DIR=$3 MB=$((1024*1024)) CHUNK_SIZE=$((64*$MB)) +DATAMODEL="@MODEL@" + # Not loving this approach but there appears to be no way to query the size # of a release asset without using the Github API LIBPOSTAL_DATA_FILE_CHUNKS=1 @@ -34,6 +36,22 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download" +# When configured with MODEL=senzing, override the chunk counts, data versions and download URL set above. +# DATAMODEL is empty for the default model, so it is quoted and compared with the POSIX "=" operator. +if [ "$DATAMODEL" = "senzing" ]; then + LIBPOSTAL_DATA_FILE_CHUNKS=1 + LIBPOSTAL_PARSER_MODEL_CHUNKS=1 + LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1 + + LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@" + + LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@" + LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@" + LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@" + + LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com" +fi + LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version LIBPOSTAL_DATA_DIR_VERSION= diff --git a/versions/senzing/base_data b/versions/senzing/base_data new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/senzing/base_data @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/senzing/language_classifier b/versions/senzing/language_classifier new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/senzing/language_classifier @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/senzing/parser b/versions/senzing/parser 
new file mode 100644 index 00000000..0ec25f75 --- /dev/null +++ b/versions/senzing/parser @@ -0,0 +1 @@ +v1.0.0