From ec9e0e341fb40c70f0b2a8909e295086197e21f1 Mon Sep 17 00:00:00 2001 From: Oskar Thorbjornsson Date: Sun, 12 Feb 2023 17:58:36 -0800 Subject: [PATCH 1/5] Enable downloading of Senzing data model. --- configure.ac | 18 ++++++++++++++++++ src/libpostal_data.in | 16 ++++++++++++++++ versions/senzing/base_data | 1 + versions/senzing/language_classifier | 1 + versions/senzing/parser | 1 + 5 files changed, 37 insertions(+) create mode 100644 versions/senzing/base_data create mode 100644 versions/senzing/language_classifier create mode 100644 versions/senzing/parser diff --git a/configure.ac b/configure.ac index f740be12..4363c0fa 100644 --- a/configure.ac +++ b/configure.ac @@ -60,6 +60,17 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION]) AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION]) +# Senzing data +AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1]) + +SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data) +SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser) +SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier) + +AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION]) + AC_CONFIG_FILES([Makefile libpostal.pc src/Makefile @@ -134,6 +145,13 @@ AC_ARG_ENABLE([data-download], *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; esac], [DOWNLOAD_DATA=true]) +AC_ARG_ENABLE([senzing-datamodel], + AS_HELP_STRING([[[--enable-senzing-datamodel]]], + [Use Senzing data model in lieu of the default one]), + [ + DATAMODEL="senzing" + AC_SUBST([LIBPOSTAL_DATA_MODEL], [$DATAMODEL]) + 
]) AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 0a3d27f2..6b0c04e9 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -14,6 +14,8 @@ LIBPOSTAL_DATA_DIR=$3 MB=$((1024*1024)) CHUNK_SIZE=$((64*$MB)) +DATAMODEL="@LIBPOSTAL_DATA_MODEL@" + # Not loving this approach but there appears to be no way to query the size # of a release asset without using the Github API LIBPOSTAL_DATA_FILE_CHUNKS=1 @@ -34,6 +36,20 @@ LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download" +if [ $DATAMODEL == "senzing" ]; then + LIBPOSTAL_DATA_FILE_CHUNKS=1 + LIBPOSTAL_PARSER_MODEL_CHUNKS=1 + LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1 + + LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@" + + LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@" + LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@" + LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@" + + LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com" +fi + LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version LIBPOSTAL_DATA_DIR_VERSION= diff --git a/versions/senzing/base_data b/versions/senzing/base_data new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/senzing/base_data @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/senzing/language_classifier b/versions/senzing/language_classifier new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/senzing/language_classifier @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/senzing/parser b/versions/senzing/parser new file mode 100644 index 00000000..0ec25f75 --- /dev/null +++ b/versions/senzing/parser @@ -0,0 +1 @@ +v1.0.0 From c4c636febdd160ad437f61c062e8f0d1eb82d108 Mon Sep 
17 00:00:00 2001 From: Oskar Thorbjornsson Date: Sun, 12 Feb 2023 18:04:10 -0800 Subject: [PATCH 2/5] Adding directions to the readme on how to download Senzing datamodel. --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 5cd327bc..58ad3ce8 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,16 @@ If you require a .lib import library to link this to your application. You can g lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 ``` +Installation with an alternative data model +------------------------------------------- + +An alternative data model is available for libposta. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling. +To enable this add `--enable-senzing-datamodel` to the conigure line during installation: +``` +./configure --datadir=[...some dir with a few GB of space...] --enable-senzing-datamodel +``` + + Examples of parsing ------------------- From a11f33fb3d84b0a41334e10ce8a5583f514177fd Mon Sep 17 00:00:00 2001 From: Oskar Thorbjornsson Date: Mon, 13 Feb 2023 13:32:38 -0800 Subject: [PATCH 3/5] Add a link to info about Senzing data model. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 58ad3ce8..9fa3bf89 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,7 @@ To enable this add `--enable-senzing-datamodel` to the conigure line during inst ``` ./configure --datadir=[...some dir with a few GB of space...] --enable-senzing-datamodel ``` +Further information about this data model can be found at: https://github.com/Senzing/libpostal Examples of parsing From 0c0818c683716b35245210c0df320258c36bc3c4 Mon Sep 17 00:00:00 2001 From: Oskar Thorbjornsson Date: Mon, 13 Feb 2023 17:03:42 -0800 Subject: [PATCH 4/5] Update Senzing link. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9fa3bf89..2ec9f509 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ To enable this add `--enable-senzing-datamodel` to the conigure line during inst ``` ./configure --datadir=[...some dir with a few GB of space...] --enable-senzing-datamodel ``` -Further information about this data model can be found at: https://github.com/Senzing/libpostal +Further information about this data model can be found at: https://github.com/Senzing/libpostal-data Examples of parsing From 00568da290bb175d6d1ed28140e94453fe4547fc Mon Sep 17 00:00:00 2001 From: Oskar Thorbjornsson Date: Tue, 14 Feb 2023 21:02:51 -0800 Subject: [PATCH 5/5] Modifying README and config parameter, based on code review. --- README.md | 15 +++++++++++---- configure.ac | 10 +++------- src/libpostal_data.in | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 2ec9f509..d8e2cb9c 100644 --- a/README.md +++ b/README.md @@ -178,13 +178,20 @@ lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64 Installation with an alternative data model ------------------------------------------- -An alternative data model is available for libposta. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling. -To enable this add `--enable-senzing-datamodel` to the conigure line during installation: +An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling. +To enable this add `MODEL=senzing` to the configure line during installation: ``` -./configure --datadir=[...some dir with a few GB of space...] --enable-senzing-datamodel +./configure --datadir=[...some dir with a few GB of space...] 
MODEL=senzing ``` -Further information about this data model can be found at: https://github.com/Senzing/libpostal-data +The data for this model comes from [OpenAddresses](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records), a total of about 1.2 billion records of data from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddresses is good but not perfect so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses, whenever these conditions were encountered. + +Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, at least 50 per country. Hard-to-parse addresses were obtained from the Senzing support team and customers and from the libpostal GitHub page and added to this set. The Senzing model got 4.3% better parsing results than the default model, using this test set. + +The size of this model is about 2.2GB compared to 1.8GB for the default model so keep that in mind if storage space is important. 
+ +Further information about this data model can be found at: https://github.com/Senzing/libpostal-data +If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data Examples of parsing ------------------- diff --git a/configure.ac b/configure.ac index 4363c0fa..ed997e32 100644 --- a/configure.ac +++ b/configure.ac @@ -145,13 +145,9 @@ AC_ARG_ENABLE([data-download], *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; esac], [DOWNLOAD_DATA=true]) -AC_ARG_ENABLE([senzing-datamodel], - AS_HELP_STRING([[[--enable-senzing-datamodel]]], - [Use Senzing data model in lieu of the default one]), - [ - DATAMODEL="senzing" - AC_SUBST([LIBPOSTAL_DATA_MODEL], [$DATAMODEL]) - ]) +AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.]) +AS_VAR_IF([MODEL], [], [], + [AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])]) AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 6b0c04e9..8c18270f 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -14,7 +14,7 @@ LIBPOSTAL_DATA_DIR=$3 MB=$((1024*1024)) CHUNK_SIZE=$((64*$MB)) -DATAMODEL="@LIBPOSTAL_DATA_MODEL@" +DATAMODEL="@MODEL@" # Not loving this approach but there appears to be no way to query the size # of a release asset without using the Github API