Merge branch 'master' into master

This commit is contained in:
Al B
2025-02-09 23:38:54 -05:00
committed by GitHub
89 changed files with 9825 additions and 541 deletions

36
.github/workflows/test.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Test

# CI: build libpostal from source and run the autotools test suite
# (`make check`) on Linux and macOS.
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

jobs:
  build_and_test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3

      - name: Install Dependencies Linux
        # Use runner.os for both platform guards (the macOS step already
        # did); equivalent to matrix.os == 'ubuntu-latest' on this matrix.
        if: runner.os == 'Linux'
        run: |
          sudo apt-get update -y
          # -y keeps apt-get non-interactive so the step cannot abort
          # waiting for a confirmation prompt.
          sudo apt-get install -y curl autoconf automake libtool pkg-config

      - name: Install Dependencies MacOS
        if: runner.os == 'macOS'
        run: |
          brew update
          brew install curl autoconf automake libtool pkg-config

      - name: Build
        env:
          # NOTE(review): was ${GITHUB_WORKSPACE}/data — shell-style syntax
          # is NOT expanded by Actions inside `env:` values, so configure
          # received the literal string. The expressions syntax below is
          # resolved by the runner before the step executes.
          LIBPOSTAL_DATA_DIR: ${{ github.workspace }}/data
        run: |
          ./bootstrap.sh
          ./configure --datadir=$LIBPOSTAL_DATA_DIR
          make

      - name: Test
        run: make check

View File

@@ -1,53 +0,0 @@
language: c
branches:
only:
- master
env:
global:
- secure: "bHrAu46oecEj3gjamT+XWXtf2J0ZJCFa8tUdgM4evscaJiiwv1TtsGXyhIj/ai7DlRIPVJUtBUy6uoGGjr6GT43zTrzSxYAOMdVXZYsnTDcdL1/0dbwcIK6/u0EI377s1buGIxG1fHveWKXuXwJWDAw4KS+5HU88a42+zMbhKe4="
- secure: "SkvNYucKVns9qDjOEW2WIhDlOMKBOwhzVcwY++HWTRtn04ErrqR4k01Mmho0jGBQD9JrPLhDgnX1BNy5s+Kmq/bxn9OZm7K1z24qBKb0mBBiNEnf2jvT0AvF5xxM+cJf4KKNL+CC0MwNf5y7HVPq1xibOV4/CNIrc1ZZc9aqdkE="
- secure: "am/rRca5akv7gSSMeNQfHnWiTHhk8fQhOZvZ0Ut+PezkQlLgKp7bzmMFkkuQ4L5hpJU40kFzuWmIPgO33dacgq69Vx/Xct1bEnxGBGjriI5qOhMizmzLYPs5uWiRjtJnBqb4JOUh5K7JBlwrgvD72fY5ZK2lwtzTksfWo8N+ahU="
- secure: "mh/WDQapGJb6MAFvgCjiMAAv1aa8gUaIs2Ohtx7yPrDBwsD8UqlyEM7ktGLZGQ1q/7OJ/Z6QfDMfJQwDKzxyUSY1yHZTNkP3QzkTt2D1Qyvi++O6EkGqSdSS6Lb3aID3IsEaye/yasJ+rxiRSp05O9+OYvhJlqRZnzaimiAv5KI="
- secure: "OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU="
- GH_REF=github.com/openvenues/libpostal
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
compiler:
- clang
- gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- pkg-config
before_script:
- ./bootstrap.sh
- if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi;
- if [ $NUMEX_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/numbers/numex.py; fi;
- if [ $DICTIONARIES_CHANGED -ne 0 ]; then env/bin/python scripts/geodata/address_expansions/address_dictionaries.py; fi;
install:
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
script:
- ./configure --datadir=$(pwd)/data
- make -j4
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
- make check
after_success:
- |
if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
env/bin/pip install awscli;
export PATH=$PATH:env/bin/;
./src/libpostal_data upload base $(pwd)/data/libpostal;
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
cp src/*_data.c _travis/src
cd _travis
git config user.name "$GIT_COMMITTER_NAME";
git config user.email "$GIT_COMMITTER_EMAIL";
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
git push --quiet origin master;
fi;

View File

@@ -1,6 +1,6 @@
## Submitting Issues
When submitting issues to libpostal, please repeect these guildelines:
When submitting issues to libpostal, please respect these guidelines:
- Be constructive. Try to help solve the problem.
- Always search for existing issues before submitting one.

View File

@@ -8,7 +8,7 @@ I was checking out libpostal, and saw something that could be improved.
---
#### Here's how I'm using libpostal
<!-- Always interested to know how people use the library! What are you working on? Which orgnization? What's your use case? -->
<!-- Always interested to know how people use the library! What are you working on? Which organization? What's your use case? -->
---
#### Here's what I did

View File

@@ -1,6 +1,6 @@
# libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal)
[![Build Status](https://github.com/openvenues/libpostal/actions/workflows/test.yml/badge.svg)](https://github.com/openvenues/libpostal/actions)
[![Build Status](https://ci.appveyor.com/api/projects/status/github/openvenues/libpostal?branch=master&svg=true)](https://ci.appveyor.com/project/albarrentine/libpostal/branch/master)
[![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
@@ -98,7 +98,7 @@ Before you install, make sure you have the following prerequisites:
**On Ubuntu/Debian**
```
sudo apt-get install curl autoconf automake libtool pkg-config
sudo apt-get install -y curl build-essential autoconf automake libtool pkg-config
```
**On CentOS/RHEL**
@@ -106,19 +106,46 @@ sudo apt-get install curl autoconf automake libtool pkg-config
sudo yum install curl autoconf automake libtool pkgconfig
```
**On Mac OSX**
**On macOS**
Install with one command via [MacPorts](https://www.macports.org/):
```
port install libpostal
```
Or as follows with [Homebrew](https://brew.sh/):
```
brew install curl autoconf automake libtool pkg-config
```
Then to install the C library:
If you're using an M1 Mac, add `--disable-sse2` to the `./configure` command. This will result in poorer performance but the build will succeed.
```
git clone https://github.com/openvenues/libpostal
cd libpostal
# skip if installing for the first time
make distclean
./bootstrap.sh
./configure --datadir=[...some dir with a few GB of space...]
# omit --datadir flag to install data in current directory
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
make -j4
# For Intel/AMD processors and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...]
# For Apple / ARM cpus and the default model
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] --disable-sse2
# For the improved Senzing model:
./configure --datadir=[...some dir with a few GB of space where a "libpostal" directory exists or can be created/modified...] MODEL=senzing
make -j8
sudo make install
# On Linux it's probably a good idea to run
@@ -175,6 +202,24 @@ If you require a .lib import library to link this to your application. You can g
lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
```
Installation with an alternative data model
-------------------------------------------
An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling.
To enable this add `MODEL=senzing` to the configure line during installation:
```
./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
```
The data for this model is sourced from [OpenAddress](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records), a total of about 1.2 billion records of data from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddress is good but not perfect so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses, whenever these conditions were encountered.
Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, minimally 50 per country. Hard-to-parse addresses were obtained from the Senzing support team and customers and from the libpostal GitHub page and added to this set. The Senzing model got 4.3% better parsing results than the default model, using this test set.
The size of this model is about 2.2GB compared to 1.8GB for the default model so keep that in mind if storage space is important.
Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data
Examples of parsing
-------------------
@@ -382,22 +427,19 @@ Libpostal is designed to be used by higher-level languages. If you don't see yo
- LuaJIT: [lua-resty-postal](https://github.com/bungle/lua-resty-postal)
- Perl: [Geo::libpostal](https://metacpan.org/pod/Geo::libpostal)
- Elixir: [Expostal](https://github.com/SweetIQ/expostal)
- Haskell: [haskell-postal](http://github.com/netom/haskell-postal)
- Rust: [rust-postal](https://github.com/pnordahl/rust-postal)
- Rust: [rustpostal](https://crates.io/crates/rustpostal)
**Database extensions**
**Unofficial database extensions**
- PostgreSQL: [pgsql-postal](https://github.com/pramsey/pgsql-postal)
**Unofficial REST API**
**Unofficial servers**
- Libpostal REST: [libpostal REST](https://github.com/johnlonganecker/libpostal-rest)
**Libpostal REST Docker**
- Libpostal REST Docker [Libpostal REST Docker](https://github.com/johnlonganecker/libpostal-rest-docker)
**Libpostal ZeroMQ Docker**
- Libpostal ZeroMQ Docker image: [pasupulaphani/libpostal-zeromq](https://hub.docker.com/r/pasupulaphani/libpostal-zeromq/) , Source: [Github](https://github.com/pasupulaphani/libpostal-docker)
- Libpostal REST Go Docker: [libpostal-rest-docker](https://github.com/johnlonganecker/libpostal-rest-docker)
- Libpostal REST FastAPI Docker: [libpostal-fastapi](https://github.com/alpha-affinity/libpostal-fastapi)
- Libpostal ZeroMQ Docker: [libpostal-zeromq](https://github.com/pasupulaphani/libpostal-docker)
Tests
@@ -466,13 +508,13 @@ whitespace e.g. Chinese) are supported, as are Germanic languages where
thoroughfare types are concatenated onto the end of the string, and may
optionally be separated so Rosenstraße and Rosen Straße are equivalent.
- **International address parsing**: [Conditional Random Field](http://blog.echen.me/2012/01/03/introduction-to-conditional-random-fields/) which parses
- **International address parsing**: [Conditional Random Field](https://web.archive.org/web/20240104172655/http://blog.echen.me/2012/01/03/introduction-to-conditional-random-fields/) which parses
"123 Main Street New York New York" into {"house_number": 123, "road":
"Main Street", "city": "New York", "state": "New York"}. The parser works
for a wide variety of countries and languages, not just US/English.
The model is trained on over 1 billion addresses and address-like strings, using the
templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
are performed to make the training data resemble real messy geocoder input as closely as possible.
- **Language classification**: multinomial logistic regression
@@ -494,7 +536,7 @@ language (IX => 9) which occur in the names of many monarchs, popes, etc.
- **Fast, accurate tokenization/lexing**: clocked at > 1M tokens / sec,
implements the TR-29 spec for UTF8 word segmentation, tokenizes East Asian
languages chracter by character instead of on whitespace.
languages character by character instead of on whitespace.
- **UTF8 normalization**: optionally decompose UTF8 to NFD normalization form,
strips accent marks e.g. à => a and/or applies Latin-ASCII transliteration.
@@ -518,6 +560,7 @@ Non-goals
- Verifying that a location is a valid address
- Actually geocoding addresses to a lat/lon (that requires a database/search index)
- Extracting addresses from free text
Raison d'être
-------------
@@ -623,7 +666,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
- Confines almost all mallocs to *name*_new and all frees to *name*_destroy
- Efficient existing implementations for simple things like hashtables
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
- Data structrues take advantage of sparsity as much as possible
- Data structures take advantage of sparsity as much as possible
- Efficient double-array trie implementation for most string dictionaries
- Cross-platform as much as possible, particularly for *nix

View File

@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
@@ -50,10 +50,32 @@ AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])
DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)
AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
# Senzing data
AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])
SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)
AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])
AC_CONFIG_FILES([Makefile
libpostal.pc
src/Makefile
test/Makefile])
src/libpostal_data
test/Makefile], [chmod +x src/libpostal_data])
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
@@ -64,6 +86,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],
@@ -71,7 +94,7 @@ AC_ARG_ENABLE([sse2],
)
)
AS_IF([test "x$enable_sse2" != "xno"], [
AS_IF([test "x$enable_sse2" != "xno" && test "x$(uname -m)" != "xarm64"], [
CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}"
])
@@ -85,6 +108,10 @@ AC_ARG_ENABLE([data-download],
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
esac], [DOWNLOAD_DATA=true])
AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
AS_VAR_IF([MODEL], [], [],
[AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],

View File

@@ -3,6 +3,8 @@ libpostal_address_parser_response_destroy
libpostal_expand_address
libpostal_expand_address_root
libpostal_expansion_array_destroy
libpostal_address_parser_response_destroy
libpostal_language_classifier_response_destroy
libpostal_get_address_parser_default_options
libpostal_get_default_duplicate_options
libpostal_get_default_fuzzy_duplicate_options
@@ -29,6 +31,7 @@ libpostal_normalized_tokens_languages
libpostal_parse_address
libpostal_parser_print_features
libpostal_place_languages
libpostal_classify_language
libpostal_setup
libpostal_setup_datadir
libpostal_setup_language_classifier

View File

@@ -152,11 +152,21 @@ if test $ax_cblas_ok = no; then
[], [-lblas])])
fi
# BLAS in OpenBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(openblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lopenblas"])
fi
# Generic CBLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(cblas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lcblas"])
fi
# Generic BLAS library?
if test $ax_cblas_ok = no; then
AC_CHECK_LIB(blas, cblas_dgemm, [ax_cblas_ok=yes; CBLAS_LIBS="-lblas"])
fi
AC_SUBST(CBLAS_LIBS)
LIBS="$ax_cblas_save_LIBS"

View File

@@ -63,10 +63,23 @@ numbers:
house_numbers:
gebaude: &gebaude
canonical: gebäude
abbreviated: geb
sample: true
canonical_probability: 0.5
abbreviated_probability: 0.5
sample_probability: 0.05
numeric:
direction: left
alphanumeric:
default: *nummer
probability: 0.95
alternatives:
- alternative: *gebaude
probability: 0.05
alphanumeric_phrase_probability: 0.0001
alphanumeric_phrase_probability: 0.05
conscription_numbers:
alphanumeric:

View File

@@ -49,7 +49,7 @@ numbers:
house_numbers:
budnyok: &budnyok
budynok: &budynok
canonical: будинок
abbreviated: буд
sample: true
@@ -58,8 +58,8 @@ house_numbers:
sample_probability: 0.1
numeric:
direction: left
budnyok_latin: &budnyok_latin
canonical: budnyok
budynok_latin: &budynok_latin
canonical: budynok
abbreviated: bud
sample: true
canonical_probability: 0.6
@@ -88,10 +88,10 @@ house_numbers:
direction: left
alphanumeric:
default: *budnyok
default: *budynok
probability: 0.65
alternatives:
- alternative: *budnyok_latin
- alternative: *budynok_latin
probability: 0.05
- alternative: *dom
probability: 0.25

View File

@@ -11,8 +11,9 @@
overrides:
id:
# Buenos Aires (state boundary coterminous with city)
"3082668": null
relation:
# Buenos Aires (state boundary coterminous with city)
"3082668": null
contained_by:
relation:
# Buenos Aires

View File

@@ -2,4 +2,4 @@
admin_level:
"2": "country"
"4": "state"
"6:": "state_district"
"6": "state_district"

View File

@@ -9,7 +9,7 @@
id:
relation:
# Auckland
"2094141": "city"
"17000522": "city"
# Wellington
"4266321": "city"
# Christchurch
@@ -31,12 +31,12 @@
# Invercargill
"1656388": "city"
# Nelson
"4266962": "city"
"17000449": "city"
# Upper Hutt
"4266375": "city"
use_admin_center:
- id: 2094141 # Auckland
- id: 17000522 # Auckland
type: relation
- id: 4266321 # Wellington
type: relation
@@ -68,7 +68,7 @@
- id: 1656388 # Invercargill
type: relation
probability: 0.7
- id: 4266962 # Nelson
- id: 17000449 # Nelson
type: relation
- id: 4266375 # Upper Hutt
type: relation

View File

@@ -10,10 +10,10 @@
"8": "city"
"9": "suburb"
overrides:
id:
relation:
# Taiwan Province
"3777248": "state"
# Fujian Province
"3777250": "state"
overrides:
id:
relation:
# Taiwan Province
"3777248": "state"
# Fujian Province
"3777250": "state"

View File

@@ -54,7 +54,7 @@ center|centre|cetr|cntr|ctr|c|cen
centers|ctrs
centreway|cnwy
chase|ch|chas
circle|cir|circel
circle|cir|circel|cirlce
circles|cirs
circlet|clt
circuit|crct|circ|cct|cirt|ci|circt
@@ -132,6 +132,7 @@ falls|fls
fare
farm|frm
farms|frms
farm to market|fm|farm-to-market
fern
ferry|fry|fy
field|fld|fd

View File

@@ -64,7 +64,7 @@ rhode island|ri
saskatchewan|sk
south carolina|sc
south dakota|sd
southern australia|sa
south australia|sa
tasmania|tas
tennessee|tn
texas|tx

View File

@@ -14,9 +14,8 @@ banda|bda
barranco|branc
barranquillo|bqllo
barriada|barda
boulevard|blvd|bvd
brazal|brzal
bulevar|bulev|blev|blv|bv|bl
bulevar|bulev|blev|blv|bv|bl|br|boulevard|blvd|bvd
calle|c|cl|cll|c /|ca|call
calleja|cllja
callejón|callejon|callej|cjón|cjon|cllon|cllón|cj

View File

@@ -0,0 +1,5 @@
mandir|मन्दिर|मंदिर
station
police station
post office
office

View File

@@ -0,0 +1,4 @@
dakshin|दक्षिण
uttar|उत्तर
poorva|poorav|पूर्व
paschim|पश्चिम

View File

@@ -0,0 +1 @@
mahatma gandhi|mg|m g

View File

@@ -0,0 +1,24 @@
baba
babu
bhagat
guru
jagirdar
maharaja|maharaj
mahatma|महात्मा
pandit
raja
rajarshi
rajkumar
rajkumari
rani
rishi
sahib
sant
sardar
senapati
shah
shrimati|smt|srimathi|श्रीमती
shri|shree|sri|श्री
sushri
swami
ustad

View File

@@ -0,0 +1,3 @@
nagar|नगर
colony
cantonment|cantt

View File

@@ -1,3 +1,4 @@
bazaar|bazar
marg
nagar
flyover
रोड

View File

@@ -1,2 +1,2 @@
félemelet|felemelet
magasföldszint|magasfoldszint
félemelet|felemelet|félem|1/2 em|1/2em
magasföldszint|magasfoldszint|mgfszt|mgfsz|mfszt|mfsz

View File

@@ -1 +1,2 @@
szent|szt
idősebb|id

View File

@@ -1,21 +1,34 @@
árok|arok
dűlő|dulo
dűlő|dulo|d.|d
fasor
fasora
főközlekedési út|főút|fout
határút|hatarut
kapu
kert
körönd|korond|krnd
körvasútsor|korvasutsor
körút|korut|krt
köz|koz
lakótelep|lakotelep|ltp.|ltp
lejtő|lejto
lépcső|lepcso
liget
mező|mezo
országút|orszagut
park
rakpart|rpt
sétány|setany
sor
sugárút|sugarut
parkja
rakpart|rkpt|rkp|rpt
sétány|setany|stny.|stny
sor|s.|s
sétány|setany|sét
sugárút|sugarut|sgrt.|sgrt|srt.|srt|sgt.|sgt
sziget
telep
tér|ter
tere
utca|u
út|ut
tanya|t.|t
udvar
utca|u.|u
út|ut|u.|u
útja|utja

View File

@@ -0,0 +1,3 @@
kuala lumpur|federal territory kuala lumpur|federal territory of kuala lumpur|wilayah persekutuan kuala lumpur|kl
labuan|federal territory labuan|federal territory of labuan|wilayah persekutuan labuan
putrajaya|federal territory putrajaya|federal territory of putrajaya|wilayah persekutuan putrajaya

View File

@@ -1,12 +1,18 @@
aleja|al
autostrada
boczna
bulwar
bulwar|bulw
droga
obwodnica
ogród
osiedle|os
park
plac|pl
rondo
rynek
skwer
szosa
ulica|ul
wybrzeże|wyb
wyspa
zaulek

View File

@@ -10,10 +10,10 @@ calçada|calcada|cc
calçadinha|caclcadinha|ccnh
câmara municipal|camara municipal|cm|c.m.|c. m.
caminho|cam|camno
direito|dto
direito|dto|dt
esquerdo|esq
estrada|estr
astrada marginal|estr marg
estrada marginal|estr marg
estrada municipal|em|e m|estr m
estrada nacional|en|e n|estr n
estrada regional|er|e r|estr r

View File

@@ -0,0 +1,6 @@
anexa
bloc|blc|bl
casa
cladirea|cladire
complex
garaj

View File

@@ -0,0 +1,5 @@
banca
organizatie neguvernamentala|ong
societate comerciala|sc
societate cu raspundere limitata|srl
societate pe actiuni|sa

View File

@@ -1,5 +1,5 @@
&
colț|colt
colț|colt|colț cu|colt cu
între|intre
la colțul de pe|la coltul de pe
și|si

View File

@@ -1 +1 @@
intrare
intrare|intrarea

View File

@@ -4,4 +4,4 @@ din
in apropiere de
în apropiere|in apropiere
în jurul aici|in jurul aici
lângă mine|langa mine
lângă mine|langa mine|lângă|langa

View File

@@ -1 +1 @@
număr|numar|nr|nº|n°|#|№|no
număr|numar|nr|nº|n°|#|№|no|numarul|numărul

View File

@@ -4,10 +4,11 @@ colonel|col
comandor
contra amiral
doctor|dr
general|gen
general|gen|g-ral
major|maj
locotenent
locotenent colonel
pictor
profesor|prof
sergent
sublocotenent

View File

@@ -0,0 +1,3 @@
cinema
cafenea
fabrica

View File

@@ -1 +1,7 @@
bloc|bl
cartier|cartierul
comuna|comună
kilometrul|kilometru|km
sat|satul
sector|sectorul|sect
zona

View File

@@ -1,2 +1,3 @@
și|si|&
cel
intre

View File

@@ -1,12 +1,12 @@
aleea|ale|alea|al
bulevardul|bd|bul|bdul|blv|blvd|b-dul|b.dul|bulev|bulevardu|bulevard
calea|cal
drumul
calea|cale|cal
drumul|drum
fundătura|fundatura|fnd
fundacul|fdc
intrarea|int|intr
piaţa|piata|piață|pta|pţa|p-ta|p-ţa
strada|str
strada|str|st
stradela|str-la|sdla
șoseaua|soseaua|sos|șos
splaiul|sp|spl

View File

@@ -0,0 +1 @@
decembrie|dec

View File

@@ -1,4 +1,8 @@
apartament|ap|apt|apart
apartamentul|apartament|ap|apt|apart
birou
cladire|cladirea|clădire|clădirea
corp|corpul
complex
interior|int
lotul
sală|sala

View File

@@ -1,2 +1,4 @@
вход
vkhod
подъезд
pod'ezd

View File

@@ -6,3 +6,5 @@ kvartal|kvart|kv|kv-l
oblast|obl
район|р
raion|r-n
місто|міс|м
misto|mis|m

93
resources/states/my.yaml Normal file
View File

@@ -0,0 +1,93 @@
# Malaysian states and federal territories: maps abbreviations and
# long-form names to the canonical state name per language
# (en = English, ms = Malay).
#
# NOTE(review): the original file defined the key `KL` twice (once quoted
# at the top, once unquoted near the end) with identical values. Duplicate
# mapping keys are invalid YAML 1.2 and silently last-wins in most
# parsers, so the redundant second entry was removed.
"KL":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"federal territory kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"federal territory of kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"wilayah persekutuan kuala lumpur":
  en: Kuala Lumpur
  ms: Kuala Lumpur
"federal territory labuan":
  en: Labuan
  ms: Labuan
"federal territory of labuan":
  en: Labuan
  ms: Labuan
"wilayah persekutuan labuan":
  en: Labuan
  ms: Labuan
"federal territory putrajaya":
  en: Putrajaya
  ms: Putrajaya
"federal territory of putrajaya":
  en: Putrajaya
  ms: Putrajaya
"wilayah persekutuan putrajaya":
  en: Putrajaya
  ms: Putrajaya
"pulau pinang":
  en: Penang
  ms: Pulau Pinang
"penang":
  en: Penang
  ms: Pulau Pinang
JHR:
  en: Johor
  ms: Johor
KDH:
  en: Kedah
  ms: Kedah
KTN:
  en: Kelantan
  ms: Kelantan
MLK:
  en: Melaka
  ms: Melaka
NSN:
  en: Negeri Sembilan
  ms: Negeri Sembilan
PHG:
  en: Pahang
  ms: Pahang
PRK:
  en: Perak
  ms: Perak
PLS:
  en: Perlis
  ms: Perlis
PNG:
  en: Penang
  ms: Pulau Pinang
SBH:
  en: Sabah
  ms: Sabah
SWK:
  en: Sarawak
  ms: Sarawak
SGR:
  en: Selangor
  ms: Selangor
TRG:
  en: Terengganu
  ms: Terengganu
KUL:
  en: Kuala Lumpur
  ms: Kuala Lumpur
LBN:
  en: Labuan
  ms: Labuan
PJY:
  en: Putrajaya
  ms: Putrajaya
LB:
  en: Labuan
  ms: Labuan
PY:
  en: Putrajaya
  ms: Putrajaya

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import logging
import operator
import os
@@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex
@@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each has
source has least a "name" key which in practice is what we care about.
Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
'''
PRIORITIES_FILENAME = 'priorities.json'
@@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than Quattroshapes
'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon
'quattroshapes': 4, # Good results in some countries/areas
'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'wof': 4, # Replacement of Quattroshapes
}
level_priorities = {
@@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
]
@@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc
@classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
@@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
logger = logging.getLogger('neighborhoods')
qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir)
logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating Quattroshapes neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
logger.info('Creating WhosOnFirst neighborhoods')
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
@@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
char_scripts = get_chars_by_script()
for idx in (cth, qs, osmn):
for idx in (cth, wof, osmn):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
@@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
qs.matched = [False] * qs.i
wof.matched = [False] * wof.i
cth.matched = [False] * cth.i
logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0
for element_id, attrs, deps in parse_osm(filename):
try:
@@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (cth, qs):
for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates:
max_sim = 0.0
arg_max = None
normalized_qs_names = {}
normalized_wof_names = {}
for osm_name in osm_names:
@@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for i in candidates:
props = idx.get_properties(i)
name = normalized_qs_names.get(i)
name = normalized_wof_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_qs_names[i] = name
normalized_wof_names[i] = name
if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue
if not contains_ideographs:
@@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
continue
source = 'osm_cth'
else:
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
source = 'osm_quattro'
if level == 'neighborhood':
@@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))
for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
@@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['polygon_type'] = 'local_admin'
else:
continue
elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB
name = props.get('name')
if not name:
@@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return sorted(candidates, key=self.priority)
class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder):
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = None
NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
@classmethod
def create_neighborhoods_index(cls, quattroshapes_dir,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME)
neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename],
output_dir, index_filename=index_filename,
polys_filename=polys_filename)
def is_valid_neighbourhood(cls, geojson):
    """Return True if a WOF geojson record is a usable neighbourhood polygon.

    A record is valid when it is not superseded, carries a name, ASCII name
    and GeoNames id, has a neighbourhood-like placetype, and its geometry
    type is Polygon or MultiPolygon.

    :param geojson: parsed WhosOnFirst GeoJSON document (dict)
    """
    properties = geojson["properties"]
    # Records superseded by a newer WOF record are excluded
    validity = not properties.get(cls.SUPERSEDED)
    for field in (cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID):
        # bool(...) is required here: the original `validity &= props.get(field)`
        # applied `&` between bool and str/None, which raises TypeError
        validity = validity and bool(properties.get(field))
    return validity and properties.get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
@classmethod
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
    """Build a polygon index from WhosOnFirst neighbourhood geojson files.

    Walks wof_dir recursively, loads every *.geojson document, and indexes
    the (Multi)Polygon geometry of each record accepted by
    is_valid_neighbourhood.

    :param wof_dir: root directory of the WhosOnFirst data checkout
    :param output_dir: directory where the index is saved
    :param index_filename: optional override for the index file name
    :return: the populated index instance
    """
    index = cls(save_dir=output_dir, index_filename=index_filename)
    for root, dirnames, filenames in os.walk(wof_dir):
        for fname in fnmatch.filter(filenames, "*.geojson"):
            with open(os.path.join(root, fname)) as f:
                geojson = json.load(f)
            if cls.is_valid_neighbourhood(geojson):
                # NOTE(review): this normalized dict is currently unused --
                # the raw geojson properties are stored below. Confirm which
                # property set downstream consumers expect before removing.
                properties = {
                    "name": safe_decode(geojson["properties"].get(cls.NAME)),
                    "name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
                    "qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
                    "gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
                }
                poly_type = geojson['geometry']['type']
                if poly_type == 'Polygon':
                    poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
                    index.index_polygon(poly)
                    poly = index.simplify_polygon(poly)
                    # `include_only_properties=include_props` referenced an
                    # undefined name (NameError at runtime); dropped for
                    # consistency with the MultiPolygon branch below
                    index.add_polygon(poly, dict(geojson['properties']))
                elif poly_type == 'MultiPolygon':
                    polys = []
                    for coords in geojson['geometry']['coordinates']:
                        poly = cls.to_polygon(coords[0])
                        polys.append(poly)
                        index.index_polygon(poly)
                    multi_poly = index.simplify_polygon(MultiPolygon(polys))
                    index.add_polygon(multi_poly, dict(geojson['properties']))
    return index
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-q', '--quattroshapes-dir',
help='Path to quattroshapes dir')
parser.add_argument('-w', '--wof-dir',
help='Path to WhosOnFirst dir')
parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
@@ -567,16 +607,16 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file,
args.quattroshapes_dir,
args.wof_dir,
args.country_rtree_dir,
args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir
)
else:
parser.error('Must specify quattroshapes dir or osm admin borders file')
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
index.save()

View File

@@ -226,7 +226,6 @@ class PolygonIndex(object):
@classmethod
def create_from_geojson_files(cls, inputs, output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
include_only_properties=None):
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
for input_file in inputs:

View File

@@ -0,0 +1,27 @@
import os
import pycountry
import subprocess
import sys
# Directory containing this script; anchors the sys.path entry so imports
# resolve regardless of the current working directory
this_dir = os.path.realpath(os.path.dirname(__file__))
# The original appended os.pardir relative to the CWD and left this_dir
# unused; resolve the repo root relative to this script instead
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"
def download_wof_data_admin(wof_dir):
    """Clone every per-country WhosOnFirst admin data repo into wof_dir.

    Repos already present under wof_dir are skipped; missing ones are cloned
    with git. Clone failures are not fatal (subprocess.call's return code is
    ignored), so re-running the script retries them.

    :param wof_dir: destination directory for the repo checkouts
    """
    for country_object in pycountry.countries:
        # NOTE(review): `alpha2` matches the pinned pycountry==1.20 API;
        # pycountry >= 16.x renamed it to `alpha_2` -- revisit on upgrade
        repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
        repo_location = os.path.join(wof_dir, repo_name)
        if not os.path.exists(repo_location):
            # Pass repo_location as the clone target: without it, git clones
            # into the CWD, the existence check above never matches, and the
            # repo is re-cloned on every run
            subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name, repo_location])
if __name__ == '__main__':
    # Exactly one positional argument (the WOF checkout dir) is required
    argv = sys.argv
    if len(argv) < 2:
        sys.exit('Usage: python download_whosonfirst_data.py wof_dir')
    download_wof_data_admin(argv[1])

View File

@@ -1,4 +1,4 @@
requests==2.9.1
requests==2.32.2
six==1.10.0
PyYAML==3.11
ujson==1.33
PyYAML==5.4
ujson==5.4.0

View File

@@ -1,7 +1,7 @@
boto3==1.4.0
botocore==1.4.53
Fiona==1.6.3.post1
PyYAML==3.11
Fiona==1.10.0
PyYAML==5.4
Rtree==0.8.2
Shapely==1.5.14
Unidecode==0.4.19
@@ -10,17 +10,17 @@ click-plugins==1.0.3
cligj==0.4.0
cssselect==0.9.1
distribute==0.7.3
future==0.15.2
future==0.18.3
futures==3.0.5
ftfy==4.2.0
gevent==1.1.2
gevent==23.9.0
greenlet==0.4.10
jmespath==0.9.0
leveldb==0.193
lxml==3.6.0
lxml==4.9.1
lru-dict==1.1.3
marisa-trie==0.7.2
numpy==1.10.4
numpy==1.22.0
pycountry==1.20
git+https://github.com/kmike/pymorphy2
pymorphy2-dicts-ru==2.4.394633.4298366
@@ -29,9 +29,9 @@ pyproj==1.9.5.1
pystache==0.5.4
python-Levenshtein==0.12.0
python-geohash==0.8.5
requests==2.9.1
requests==2.32.2
s3transfer==0.1.3
six==1.10.0
ujson==1.35
ujson==5.4.0
urlnorm==1.1.3
wsgiref==0.1.2

View File

@@ -42,7 +42,8 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
address_expansion_t expansion = expansions[i];
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
if (string_contains(canonical, " ")) {
bool is_possible_acronym = string_contains(canonical, " ") || (phrase.len == 1 && address_expansion_in_dictionary(expansion, DICTIONARY_DIRECTIONAL));
if (is_possible_acronym) {
for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
existing_acronyms[j] = 1;
}

View File

@@ -1,4 +1,5 @@
#include "crf_context.h"
#include "float_utils.h"
crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
crf_context_t *context = malloc(sizeof(crf_context_t));
@@ -39,8 +40,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
}
if (context->flag & CRF_CONTEXT_MARGINALS) {
#ifdef USE_SSE
context->exp_state = double_matrix_new_aligned(T, L, 16);
#if defined(USE_SSE)
context->exp_state = double_matrix_new_aligned(T, L, 32);
if (context->exp_state == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state);
#else
@@ -51,8 +52,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state = double_matrix_new_zeros(T, L);
if (context->mexp_state == NULL) goto exit_context_created;
#ifdef USE_SSE
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 16);
#if defined(USE_SSE)
context->exp_state_trans = double_matrix_new_aligned(T, L * L, 32);
if (context->exp_state_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_state_trans);
#else
@@ -63,8 +64,8 @@ crf_context_t *crf_context_new(int flag, size_t L, size_t T) {
context->mexp_state_trans = double_matrix_new_zeros(T, L * L);
if (context->mexp_state_trans == NULL) goto exit_context_created;
#ifdef USE_SSE
context->exp_trans = double_matrix_new_aligned(L, L, 16);
#if defined(USE_SSE)
context->exp_trans = double_matrix_new_aligned(L, L, 32);
if (context->exp_trans == NULL) goto exit_context_created;
double_matrix_zero(context->exp_trans);
#else
@@ -129,14 +130,14 @@ bool crf_context_set_num_items(crf_context_t *self, size_t T) {
if (self->flag & CRF_CONTEXT_MARGINALS &&
(
#ifdef USE_SSE
!double_matrix_resize_aligned(self->exp_state, T, L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state, T, L, 32) ||
#else
!double_matrix_resize(self->exp_state, T, L) ||
#endif
!double_matrix_resize(self->mexp_state, T, L) ||
#ifdef USE_SSE
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 16) ||
#if defined(USE_SSE)
!double_matrix_resize_aligned(self->exp_state_trans, T, L * L, 32) ||
#else
!double_matrix_resize(self->exp_state_trans, T, L * L) ||
#endif
@@ -183,7 +184,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state);
#else
double_matrix_destroy(self->exp_state);
@@ -199,7 +200,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_state_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_state_trans);
#else
double_matrix_destroy(self->exp_state_trans);
@@ -215,7 +216,7 @@ void crf_context_destroy(crf_context_t *self) {
}
if (self->exp_trans != NULL) {
#ifdef USE_SSE
#if defined(USE_SSE)
double_matrix_destroy_aligned(self->exp_trans);
#else
double_matrix_destroy(self->exp_trans);

View File

@@ -15,6 +15,14 @@
#include "token_types.h"
#include "transliterate.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define DEFAULT_KEY_LEN 32
@@ -878,6 +886,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
log_debug("have_ambiguous = %d\n", have_ambiguous);
log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable);
log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
}
bool skipped_last_edge_phrase = false;
@@ -913,7 +922,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
}
if (token.type != WHITESPACE) {
if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) {
if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) || (prev_phrase.start == phrase.start && prev_phrase.len == phrase.len) ) {
log_debug("Adding space\n");
string_tree_add_string(tree, " ");
string_tree_finalize_token(tree);
@@ -1536,6 +1545,29 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *
void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
char **temp_languages = calloc(1, sizeof(char *));
libpostal_normalize_options_t temp_options = options;
for (size_t i = 0; i < options.num_languages; i++) {
char *lang = options.languages[i];
temp_languages[0] = lang;
temp_options.languages = temp_languages;
temp_options.num_languages = 1;
expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
}
if (options.num_languages == 0) {
temp_options.languages = options.languages;
temp_options.num_languages = options.num_languages;
expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
}
free(temp_languages);
}
cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) {
options.address_components |= LIBPOSTAL_ADDRESS_ANY;
@@ -1543,7 +1575,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
size_t len = strlen(input);
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
if (options.num_languages == 0) {
lang_response = classify_languages(input);
@@ -1566,7 +1598,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
if (string_tree_num_strings(tree) == 1) {
char *normalized = string_tree_get_alternative(tree, 0, 0);
expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option);
expand_alternative_phrase_option_languages(strings, unique_strings, normalized, options, phrase_option);
} else {
log_debug("Adding alternatives for multiple normalizations\n");
@@ -1587,7 +1619,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
char_array_terminate(temp_string);
token = char_array_get_string(temp_string);
log_debug("current permutation = %s\n", token);
expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option);
expand_alternative_phrase_option_languages(strings, unique_strings, token, options, phrase_option);
}
string_tree_iterator_destroy(iter);
@@ -1603,7 +1635,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
kh_destroy(str_set, unique_strings);
if (lang_response != NULL) {
language_classifier_response_destroy(lang_response);
libpostal_language_classifier_response_destroy(lang_response);
}
char_array_destroy(temp_string);
@@ -1612,7 +1644,6 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt
*n = cstring_array_num_strings(strings);
return strings;
}
cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {

View File

@@ -198,7 +198,7 @@ bool file_write_float(FILE *file, float value) {
}
inline uint32_t file_deserialize_uint32(unsigned char *buf) {
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) | ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];
}
bool file_read_uint32(FILE *file, uint32_t *value) {
@@ -243,7 +243,7 @@ bool file_write_uint32(FILE *file, uint32_t value) {
inline uint16_t file_deserialize_uint16(unsigned char *buf) {
return (buf[0] << 8) | buf[1];
return ((uint16_t)buf[0] << 8) | buf[1];
}

View File

@@ -292,16 +292,28 @@ static int geohashstr_to_interleaved(char *r, size_t length, uint16_t *interleav
if(j== 0) i[0] = map[c[ 0]]<<11;
if(j== 1) i[0] += map[c[ 1]]<< 6;
if(j== 2) i[0] += map[c[ 2]]<< 1;
if(j== 3) i[0] += map[c[ 3]]>> 4;
if(j== 3) {
i[0] += map[c[ 3]]>> 4;
i[1] = map[c[ 3]]<<12;
}
if(j== 4) i[1] += map[c[ 4]]<< 7;
if(j== 5) i[1] += map[c[ 5]]<< 2;
if(j== 6) i[1] += map[c[ 6]]>> 3;
if(j== 6) {
i[1] += map[c[ 6]]>> 3;
i[2] = map[c[ 6]]<<13;
}
if(j== 7) i[2] += map[c[ 7]]<< 8;
if(j== 8) i[2] += map[c[ 8]]<< 3;
if(j== 9) i[2] += map[c[ 9]]>> 2;
if(j== 9) {
i[2] += map[c[ 9]]>> 2;
i[3] = map[c[ 9]]<<14;
}
if(j==10) i[3] += map[c[10]]<< 9;
if(j==11) i[3] += map[c[11]]<< 4;
if(j==12) i[3] += map[c[12]]>> 1;
if(j==12) {
i[3] += map[c[12]]>> 1;
i[4] = map[c[12]]<<15;
}
if(j==13) i[4] += map[c[13]]<<10;
if(j==14) i[4] += map[c[14]]<< 5;
if(j==15) i[4] += map[c[15]]>> 0;

View File

@@ -29,12 +29,6 @@
#define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b)
unsigned short _rand48_seed[3];
unsigned short _rand48_mult[3];
unsigned short _rand48_add;
void _dorand48(unsigned short xseed[3]);
double erand48(unsigned short xseed[3]);

View File

@@ -46,7 +46,7 @@ language_classifier_t *get_language_classifier(void) {
return language_classifier;
}
void language_classifier_response_destroy(language_classifier_response_t *self) {
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
if (self == NULL) return;
if (self->languages != NULL) {
free(self->languages);
@@ -59,7 +59,7 @@ void language_classifier_response_destroy(language_classifier_response_t *self)
free(self);
}
language_classifier_response_t *classify_languages(char *address) {
libpostal_language_classifier_response_t *classify_languages(char *address) {
language_classifier_t *classifier = get_language_classifier();
if (classifier == NULL) {
@@ -88,7 +88,7 @@ language_classifier_response_t *classify_languages(char *address) {
size_t n = classifier->num_labels;
double_matrix_t *p_y = double_matrix_new_zeros(1, n);
language_classifier_response_t *response = NULL;
libpostal_language_classifier_response_t *response = NULL;
bool model_exp = false;
if (classifier->weights_type == MATRIX_DENSE) {
model_exp = logistic_regression_model_expectation(classifier->weights.dense, x, p_y);
@@ -129,7 +129,7 @@ language_classifier_response_t *classify_languages(char *address) {
free(indices);
response = malloc(sizeof(language_classifier_response_t));
response = malloc(sizeof(libpostal_language_classifier_response_t));
response->num_languages = num_languages;
response->languages = languages;
response->probs = probs;

View File

@@ -6,6 +6,8 @@
#include <stdint.h>
#include <stdbool.h>
#include "libpostal.h"
#include "collections.h"
#include "language_features.h"
#include "logistic_regression.h"
@@ -29,21 +31,14 @@ typedef struct language_classifier {
} weights;
} language_classifier_t;
typedef struct language_classifier_response {
size_t num_languages;
char **languages;
double *probs;
} language_classifier_response_t;
// General usage
language_classifier_t *language_classifier_new(void);
language_classifier_t *get_language_classifier(void);
language_classifier_t *get_language_classifier_country(void);
language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(language_classifier_response_t *self);
libpostal_language_classifier_response_t *classify_languages(char *address);
void language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
void language_classifier_destroy(language_classifier_t *self);

View File

@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
}
language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL) {
printf("Could not classify language\n");
exit(EXIT_FAILURE);

View File

@@ -7,6 +7,7 @@
#include "language_classifier_io.h"
#include "string_utils.h"
#include "trie_utils.h"
#include "transliterate.h"
double test_accuracy(char *filename) {
@@ -33,7 +34,7 @@ double test_accuracy(char *filename) {
continue;
}
language_classifier_response_t *response = classify_languages(address);
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL || response->num_languages == 0) {
printf("%s\tNULL\t%s\n", language, address);
continue;

View File

@@ -85,6 +85,17 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(
return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
}
char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes) {
cstring_array *strings = name_word_hashes(name, normalize_options);
if (strings == NULL) {
*num_hashes = 0;
return NULL;
}
*num_hashes = cstring_array_num_strings(strings);
return cstring_array_to_strings(strings);
}
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
cstring_array *strings = near_dupe_hashes(num_components, labels, values, options);
if (strings == NULL) {
@@ -108,7 +119,7 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels
char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
libpostal_language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
if (lang_response == NULL) {
*num_languages = 0;
return NULL;
@@ -202,6 +213,30 @@ libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t nu
return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
}
libpostal_language_classifier_response_t *libpostal_classify_language(char *address) {
libpostal_language_classifier_response_t *response = classify_languages(address);
if (response == NULL) {
log_error("Language classification returned NULL\n");
return NULL;
}
return response;
}
void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self) {
if (self == NULL) return;
if (self->languages != NULL) {
free(self->languages);
}
if (self->probs) {
free(self->probs);
}
free(self);
}
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
if (self == NULL) return;
@@ -262,19 +297,21 @@ bool libpostal_setup_datadir(char *datadir) {
address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
}
bool setup_succeed = true;
if (!transliteration_module_setup(transliteration_path)) {
log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
return false;
setup_succeed = false;
}
if (!numex_module_setup(numex_path)) {
if (setup_succeed && !numex_module_setup(numex_path)) {
log_error("Error loading numex module, dir=%s\n", numex_path);
return false;
setup_succeed = false;
}
if (!address_dictionary_module_setup(address_dictionary_path)) {
if (setup_succeed && !address_dictionary_module_setup(address_dictionary_path)) {
log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
return false;
setup_succeed = false;
}
if (transliteration_path != NULL) {
@@ -289,7 +326,7 @@ bool libpostal_setup_datadir(char *datadir) {
free(address_dictionary_path);
}
return true;
return setup_succeed;
}
bool libpostal_setup(void) {

View File

@@ -167,6 +167,19 @@ LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(ch
LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);
/*
Language classification
*/
typedef struct libpostal_language_classifier_response {
size_t num_languages;
char **languages;
double *probs;
} libpostal_language_classifier_response_t;
LIBPOSTAL_EXPORT libpostal_language_classifier_response_t *libpostal_classify_language(char *address);
LIBPOSTAL_EXPORT void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self);
/*
Deduping
@@ -191,8 +204,8 @@ typedef struct libpostal_near_dupe_hash_options {
bool address_only_keys;
} libpostal_near_dupe_hash_options_t;
LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_name_hashes(char *name, libpostal_normalize_options_t normalize_options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);

View File

@@ -1,232 +0,0 @@
#!/bin/sh
# Legacy libpostal data-download helper (superseded by libpostal_data.in).
set -e
if [ "$#" -lt 3 ]; then
echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
exit 1
fi
# Version strings used to detect stale data directories and select the release
LIBPOSTAL_VERSION_STRING="v1"
LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0"
LIBPOSTAL_REPO_NAME="openvenues/libpostal"
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
GITHUB_API_URL="https://api.github.com"
LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases"
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
# NOTE(review): the *_S3_PREFIX values reference undefined
# LIBPOSTAL_LATEST_*_VERSION_STRING variables, and the DATA/PARSER filenames
# appear swapped; they are only consumed by the unimplemented upload path
LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/parser.tar.gz"
LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/libpostal_data.tar.gz"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/language_classifier.tar.gz"
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3
# File recording which data-dir layout version is currently on disk
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=
mkdir -p $LIBPOSTAL_DATA_DIR
# Per-asset timestamp files backing the If-Modified-Since freshness checks
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier
export LC_ALL=C
EPOCH_DATE="Jan 1 00:00:00 1970"
# Assets at least two 64MB chunks in size are downloaded in parallel parts
MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB))
LARGE_FILE_SIZE=$((CHUNK_SIZE*2))
NUM_WORKERS=12
# Kill any in-flight part downloads when the user interrupts the script
kill_background_processes() {
jobs -p | xargs kill;
exit
}
trap kill_background_processes INT
# Command template run by xargs for each download part; positional args:
# $1=part index, $2=byte offset, $3=max byte, $4=url, $5=part filename
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
# Download $1 (url) into $2 (filename) of size $3 (bytes) as CHUNK_SIZE-sized
# HTTP Range parts fetched by up to NUM_WORKERS parallel curl processes, then
# concatenate the parts in order and delete them.
download_multipart() {
url=$1
filename=$2
size=$3
num_chunks=$((size/CHUNK_SIZE))
echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks"
offset=0
i=0
# Emit NUL-separated argument quintuples for the xargs pipeline below
while [ $i -lt $num_chunks ]; do
i=$((i+1))
part_filename="$filename.$i"
if [ $i -lt $num_chunks ]; then
max=$((offset+CHUNK_SIZE-1));
else
# Final part runs to the end of the file
max=$size;
fi;
printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
offset=$((offset+CHUNK_SIZE))
done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --
# NOTE(review): $local_path is set by the caller (download_file) and, as
# invoked, equals $filename here -- relies on shell global variable scoping
> $local_path
i=0
while [ $i -lt $num_chunks ]; do
i=$((i+1))
part_filename="$filename.$i"
cat $part_filename >> $local_path
rm $part_filename
done;
}
# Conditionally download one release asset and unpack it into the data dir.
# Args: $1=timestamp file, $2=data dir, $3=asset metadata url, $4=download url,
# $5=size in bytes, $6=local tarball name, $7=human-readable name; any
# remaining args are subdirectories replaced wholesale by the tarball.
download_file() {
updated_path=$1
data_dir=$2
metadata_url=$3
url=$4
size=$5
filename=$6
name=$7
shift 7
subdirs=$@
local_path=$data_dir/$filename
# First run: seed the timestamp with the epoch so the check below fires
if [ ! -e $updated_path ]; then
echo "$EPOCH_DATE" > $updated_path;
fi;
echo "Checking for new libpostal $name..."
# curl -z issues an If-Modified-Since request against the stored timestamp;
# an HTTP 200 (rather than 304) means a newer asset is available
if [ $(curl -LsI $metadata_url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
echo "New libpostal $name available"
if [ $size -ge $LARGE_FILE_SIZE ]; then
download_multipart $url $local_path $size
else
curl -L $url --retry 3 --retry-delay 2 -o $local_path
fi
# Store the downloaded file's mtime + 1 second as the new timestamp
# (first branch: GNU date; second branch: BSD/macOS date/stat)
if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path;
elif stat -f %Sm . >/dev/null 2>&1; then
echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
fi;
# Remove the managed subdirectories before unpacking the new contents
for subdir in $subdirs; do
rm -rf $data_dir/$subdir;
done
tar -xvzf $local_path --no-same-owner -C $data_dir;
rm $local_path;
else
echo "libpostal $name up to date"
fi
}
if [ $COMMAND = "download" ]; then
# Read the on-disk data layout version, if any
if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then
LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE)
fi
# On a layout-version mismatch, wipe the managed subdirs and timestamp
# files so all assets are re-downloaded fresh
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
echo "Old version of datadir detected, removing..."
for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
rm -rf $LIBPOSTAL_DATA_DIR/$subdir;
done
# Legacy, blow it away too to be nice
if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then
rm -rf $LIBPOSTAL_DATA_DIR/geodb;
fi
rm -f $LIBPOSTAL_DATA_UPDATED_PATH
rm -f $LIBPOSTAL_LANG_CLASS_UPDATED_PATH
rm -f $LIBPOSTAL_PARSER_UPDATED_PATH
fi
mkdir -p $LIBPOSTAL_DATA_DIR
# Resolve the GitHub release id for the pinned version, then scrape the
# release's asset names, metadata URLs, download URLs and sizes with grep
# (avoids a JSON-parser dependency)
release_id=$(curl -s $LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*')
release_assets="$(curl -s $LIBPOSTAL_RELEASE_API_URL/$release_id/assets)"
asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp"
echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_names_tempfile
asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp"
echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_metadata_tempfile
asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp"
echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_urls_tempfile
asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp"
echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > $asset_sizes_tempfile
# Join the four per-asset columns into one space-separated table
assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp"
paste -d' ' $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile > $assets_tempfile
rm $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile
# Download each asset that matches the requested FILE selector
while read -r line; do
asset=$(echo "$line" | cut -f1 -d' ')
asset_metadata_url=$(echo "$line" | cut -f2 -d' ')
asset_url=$(echo "$line" | cut -f3 -d' ')
asset_size=$(echo "$line" | cut -f4 -d' ')
if [ $asset = $LIBPOSTAL_DATA_FILE ] && ([ $FILE = "base" ] || [ $FILE = "all" ]); then
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
fi
if [ $asset = $LIBPOSTAL_PARSER_FILE ] && ([ $FILE = "parser" ] || [ $FILE = "all" ]); then
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
fi
if [ $asset = $LIBPOSTAL_LANG_CLASS_FILE ] && ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
fi
# Record the new layout version once assets have been processed
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE;
fi
done < $assets_tempfile;
rm $assets_tempfile
elif [ $COMMAND = "upload" ]; then
# The upload path was never implemented; the commented commands below
# sketch the intended S3 sync
echo "upload not implemented yet"
#if [ $FILE = "base" ] || [ $FILE = "all" ]; then
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
#fi
#if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
# latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
# parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
#fi
#if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
# latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
# lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
#fi
else
echo "Invalid command: $COMMAND"
exit 1
fi

189
src/libpostal_data.in Executable file
View File

@@ -0,0 +1,189 @@
#!/bin/sh
# libpostal data downloader. This is a configure template: the @...@
# placeholders are substituted at build time (the file is listed in
# AC_CONFIG_FILES and chmod +x'd by configure).
set -e
if [ "$#" -lt 3 ]; then
echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
exit 1
fi
# Positional arguments: action, which asset(s), and the target data dir.
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3
# Multipart downloads fetch assets in 64 MB HTTP range requests.
MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB))
# Data model flavor chosen at configure time ("senzing" switches sources).
DATAMODEL="@MODEL@"
# Not loving this approach but there appears to be no way to query the size
# of a release asset without using the Github API
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=12
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
# Version strings substituted by configure from the versions/ files.
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_DATA_DIR_VERSION_STRING@"
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_DATA_FILE_LATEST_VERSION@"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_PARSER_MODEL_LATEST_VERSION@"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION@"
LIBPOSTAL_REPO_NAME="openvenues/libpostal"
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
# The Senzing model is hosted on S3, ships single-chunk assets, and
# carries its own version strings.
if [ "$DATAMODEL" = "senzing" ]; then
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=1
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"
LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
fi
# Records which data-dir layout version is installed so an old layout can
# be purged on upgrade.
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=
mkdir -p $LIBPOSTAL_DATA_DIR
# Per-asset version stamp files used to skip downloads that are up to date.
LIBPOSTAL_DATA_FILE_VERSION_PATH=$LIBPOSTAL_DATA_DIR/base_data_file_version
LIBPOSTAL_PARSER_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/parser_model_file_version
LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/language_classifier_model_file_version
# Subdirectories that each tarball unpacks into inside the data dir.
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier
# Parallelism for multipart range-request downloads.
NUM_WORKERS=12
# SIGINT handler: kill any in-flight background download workers spawned
# via `xargs -P` so Ctrl-C doesn't leave orphaned curl processes, then
# exit non-zero to signal the interrupted run failed.
kill_background_processes() {
    # `jobs -p | xargs kill` runs kill with zero arguments when there are
    # no background jobs, which is an error on BSD xargs/kill; collect the
    # pids first and only kill when the list is non-empty. Suppress errors
    # from the race where a worker exits before the signal is delivered.
    pids=$(jobs -p)
    if [ -n "$pids" ]; then
        kill $pids 2>/dev/null || true
    fi
    exit 1
}
# Abort cleanly on Ctrl-C: kill all background download workers.
trap kill_background_processes INT
# Command strings executed by each xargs worker via `sh -c`. Positional
# parameters: $1=part index, $2=range start byte, $3=range end byte,
# $4=source url, $5=output part filename.
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
# Fetch one byte range of the asset; -L follows GitHub's release redirects.
PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
# Download a large release asset as $3 parallel HTTP range requests, then
# concatenate the parts in order into the destination file and delete them.
# $1 = url, $2 = destination filename, $3 = number of 64MB chunks
download_release_multipart() {
    url=$1
    filename=$2
    num_chunks=$3
    echo "Downloading multipart: $url, num_chunks=$num_chunks"
    offset=0
    i=0
    # Emit NUL-separated (index, start, end, url, part file) tuples and fan
    # them out to $NUM_WORKERS parallel workers running $DOWNLOAD_PART.
    # Quote all expansions so paths containing spaces or glob characters
    # survive word splitting.
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        max=$((offset+CHUNK_SIZE-1))
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" --
    # Truncate the destination, then append each part in order.
    > "$filename"
    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat "$part_filename" >> "$filename"
        rm "$part_filename"
    done
}
# Download and unpack one libpostal asset if the locally recorded version
# differs from the requested release version; otherwise do nothing.
# $1 = version stamp file, $2 = release version tag, $3 = data dir,
# $4 = number of chunks, $5 = tarball filename, $6 = human-readable name,
# $7.. = subdirectories the tarball replaces (removed before unpack)
download_release() {
    version_file_path=$1
    version=$2
    data_dir=$3
    num_chunks=$4
    filename=$5
    name=$6
    shift 6
    # Deliberately space-joined; iterated with word splitting below.
    subdirs=$@
    local_path="$data_dir/$filename"
    url="$LIBPOSTAL_BASE_URL/$version/$filename"
    # Read the currently installed version, if any.
    if [ ! -e "$version_file_path" ]; then
        current_version=""
    else
        current_version="$(cat "$version_file_path")"
    fi
    echo "Checking for new libpostal $name..."
    if [ "$current_version" != "$version" ]; then
        echo "New libpostal $name available"
        if [ "$num_chunks" -gt 1 ]; then
            download_release_multipart "$url" "$local_path" "$num_chunks"
        else
            curl -L "$url" --retry 3 --retry-delay 2 -o "$local_path"
        fi
        # Wipe the module dirs first so files deleted upstream don't linger.
        for subdir in $subdirs; do
            rm -rf "$data_dir/$subdir"
        done
        tar -xvzf "$local_path" --no-same-owner -C "$data_dir"
        rm "$local_path"
        # Stamp the version only after a successful unpack (set -e aborts
        # earlier on any failure above).
        echo "$version" > "$version_file_path"
    else
        echo "libpostal $name up to date"
    fi
}
if [ $COMMAND = "download" ]; then
# If the installed data-dir layout version differs from this build's
# expected layout version, wipe all module subdirectories so everything
# is re-downloaded fresh.
if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then
LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE)
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then
echo "Old version of datadir detected, removing..."
for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
rm -rf $LIBPOSTAL_DATA_DIR/$subdir;
done
# Legacy, blow it away too to be nice
if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then
rm -rf $LIBPOSTAL_DATA_DIR/geodb;
fi
# Remove stale timestamp and per-asset version stamps so every asset
# re-downloads under the new layout.
rm -f $LIBPOSTAL_DATA_DIR/last_updated*
rm -f $LIBPOSTAL_DATA_DIR/*_version
fi
fi
mkdir -p $LIBPOSTAL_DATA_DIR
# Fetch whichever assets were requested; "all" fetches every asset.
if ([ $FILE = "base" ] || [ $FILE = "all" ]); then
download_release $LIBPOSTAL_DATA_FILE_VERSION_PATH $LIBPOSTAL_DATA_FILE_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE_CHUNKS $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
fi
if ([ $FILE = "parser" ] || [ $FILE = "all" ]); then
download_release $LIBPOSTAL_PARSER_MODEL_VERSION_PATH $LIBPOSTAL_PARSER_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_MODEL_CHUNKS $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
fi
if ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then
download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
fi
# Record the layout version last, after all requested downloads succeeded.
echo "$LIBPOSTAL_DATA_DIR_VERSION_STRING" > $LIBPOSTAL_DATA_VERSION_FILE
else
echo "Invalid command: $COMMAND"
exit 1
fi

View File

@@ -13,10 +13,16 @@
#define LIBPOSTAL_USAGE "Usage: ./libpostal address [...languages] [--json]\n"
static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json) {
static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json, bool root_expansions) {
size_t num_expansions;
char **expansions = libpostal_expand_address(address, options, &num_expansions);
char **expansions;
if (!root_expansions) {
expansions = libpostal_expand_address(address, options, &num_expansions);
} else {
expansions = libpostal_expand_address_root(address, options, &num_expansions);
}
char *normalized;
@@ -45,9 +51,9 @@ int main(int argc, char **argv) {
char *arg;
char *address = NULL;
char *language = NULL;
bool use_json = false;
bool root_expansions = false;
string_array *languages = NULL;
@@ -58,6 +64,8 @@ int main(int argc, char **argv) {
exit(EXIT_SUCCESS);
} else if (string_equals(arg, "--json")) {
use_json = true;
} else if (string_equals(arg, "--root")) {
root_expansions = true;
} else if (address == NULL) {
address = arg;
} else if (!string_starts_with(arg, "-")) {
@@ -87,11 +95,11 @@ int main(int argc, char **argv) {
if (address == NULL) {
char *line;
while ((line = file_getline(stdin)) != NULL) {
print_output(line, options, use_json);
print_output(line, options, use_json, root_expansions);
free(line);
}
} else {
print_output(address, options, use_json);
print_output(address, options, use_json, root_expansions);
}
if (languages != NULL) {

View File

@@ -6,7 +6,9 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "collections.h"
#include "file_utils.h"
@@ -31,7 +33,7 @@ typedef enum {
} name##_t; \
\
static name##_t *name##_new(size_t m, size_t n) { \
name##_t *matrix = malloc(sizeof(name##_t)); \
name##_t *matrix = malloc(sizeof(name##_t)); \
\
if (matrix == NULL) { \
return NULL; \
@@ -60,7 +62,7 @@ typedef enum {
matrix->m = m; \
matrix->n = n; \
\
matrix->values = _aligned_malloc(sizeof(type) * m * n, alignment); \
matrix->values = aligned_malloc(sizeof(type) * m * n, alignment); \
if (matrix->values == NULL) { \
free(matrix); \
return NULL; \
@@ -84,7 +86,7 @@ typedef enum {
if (self == NULL) return; \
\
if (self->values != NULL) { \
_aligned_free(self->values); \
aligned_free(self->values); \
} \
\
free(self); \
@@ -116,7 +118,7 @@ typedef enum {
if (self == NULL) return false; \
\
if (m * n > (self->m * self->n)) { \
type *ptr = _aligned_realloc(self->values, sizeof(type) * m * n, alignment); \
type *ptr = aligned_resize(self->values, sizeof(type) * self->m * self->n, sizeof(type) * m * n, alignment); \
if (ptr == NULL) { \
return false; \
} \

View File

@@ -144,6 +144,7 @@ bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) {
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) {
char *expansion;
size_t num_expansions = 0;
cstring_array *expansions = expand_address(input, options, &num_expansions);
@@ -160,7 +161,6 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti
return root_expansions;
} else {
khash_t(str_set) *unique_strings = kh_init(str_set);
char *expansion;
khiter_t k;
int ret;
@@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
log_debug("token_str = %s\n", token_str);
add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams);
add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
// For non-Latin words (Arabic, Cyrillic, etc.) just add the word
// For ideograms, we do two-character shingles, so only add the first character if the string has one token
} else if (!ideogram || j > 0 || num_tokens == 1) {
@@ -640,7 +641,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes,
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) {
if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL;
if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_small_containing_boundaries && !options.with_postal_code) return NULL;
place_t *place = place_from_components(num_components, labels, values);
log_debug("created place\n");
@@ -669,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
if (num_languages == 0) {
lang_response = place_languages(num_components, labels, values);

View File

@@ -8,6 +8,7 @@
#include "libpostal.h"
#include "string_utils.h"
cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options);
cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options);
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages);

View File

@@ -434,7 +434,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
char_array_append(array, " ");
append_char = false;
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
append_char = !is_hyphen_between_letter_and_number;
append_char = is_hyphen_between_letter_and_number;
}
if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {

View File

@@ -5,6 +5,15 @@
#include "log/log.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
#define NUMEX_SETUP_ERROR "numex module not setup, call libpostal_setup() or numex_module_setup()\n"

View File

@@ -17,10 +17,10 @@ static inline bool is_address_text_component(char *label) {
);
}
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) {
if (num_components == 0 || values == NULL || labels == NULL) return NULL;
language_classifier_response_t *lang_response = NULL;
libpostal_language_classifier_response_t *lang_response = NULL;
char *label;
char *value;

View File

@@ -32,7 +32,7 @@ typedef struct place {
char *website;
} place_t;
language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
libpostal_language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values);
place_t *place_new(void);

View File

@@ -4,6 +4,8 @@
#include "string_utils.h"
#include "file_utils.h"
// Run shuf/gshuf on a file in-place if the shuf command is available.
bool shuffle_file(char *filename) {
char *shuffle_command = NULL;

View File

@@ -157,8 +157,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
uint32_array **t1_tokens_unicode = NULL;
uint32_array **t2_tokens_unicode = NULL;
uint32_array *t1_unicode;
uint32_array *t2_unicode;
uint32_array *t1_unicode = NULL;
uint32_array *t2_unicode = NULL;
int64_array *phrase_memberships_array1 = NULL;
int64_array *phrase_memberships_array2 = NULL;
@@ -232,8 +232,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
}
}
uint32_t *suffixes1;
uint32_t *suffixes2;
uint32_t *suffixes1 = NULL;
uint32_t *suffixes2 = NULL;
if (ordinal_suffixes1 != NULL && ordinal_suffixes2 != NULL) {
suffixes1 = ordinal_suffixes1->a;

View File

@@ -94,15 +94,15 @@ inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khas
}
uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) {
khash_t(int_set) *unique_columns = kh_init(int_set);
khash_t(int_uint32) *unique_columns = kh_init(int_uint32);
uint32_array *ret = uint32_array_new();
if (sparse_matrix_add_unique_columns(matrix, unique_columns, ret)) {
kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
return ret;
}
kh_destroy(int_set, unique_columns);
kh_destroy(int_uint32, unique_columns);
uint32_array_destroy(ret);
return NULL;
}

8853
src/sse2neon.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -78,7 +78,7 @@ typedef struct {
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0}
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result) {
uint32_t node_id = result.node_id;
if (node_id == NULL_NODE_ID) return NULL;
@@ -834,11 +834,11 @@ char *transliterate(char *trans_name, char *str, size_t len) {
log_debug("Context match\n");
match_state = match_candidate_state;
match_state.state = TRANS_STATE_MATCH;
replacement = get_replacement(trie, context_result, str, match_state.phrase_start);
replacement = get_replacement(trie, context_result);
} else {
if (match_state.state == TRANS_STATE_MATCH) {
log_debug("Context no match and previous match\n");
replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
replacement = get_replacement(trie, match_state.result);
if (state.state != TRANS_STATE_PARTIAL_MATCH) {
state.advance_index = false;
}
@@ -869,7 +869,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {
if (match_state.state == TRANS_STATE_MATCH) {
log_debug("Match no context\n");
replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
replacement = get_replacement(trie, match_state.result);
} else {
log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
@@ -934,14 +934,24 @@ char *transliterate(char *trans_name, char *str, size_t len) {
match_state = TRANSLITERATION_DEFAULT_STATE;
}
bool added_previous_phrase = false;
if (context_no_match && !prev_state.empty_transition && prev_state.phrase_len > 0) {
log_debug("Previous phrase stays as is %.*s\n", (int)prev_state.phrase_len, str+prev_state.phrase_start);
char_array_cat_len(new_str, str + prev_state.phrase_start, prev_state.phrase_len);
state = start_state;
added_previous_phrase = true;
if (match_candidate_state.state != TRANS_STATE_PARTIAL_MATCH) {
state = start_state;
}
}
if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
if (match_candidate_state.state != TRANS_STATE_PARTIAL_MATCH && !prev_state.empty_transition && idx + char_len == len) {
log_debug("No replacement for %.*s\n", (int)char_len, ptr);
char_array_cat_len(new_str, str + idx, char_len);
state = start_state;
} else if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
log_debug("TRANS_STATE_BEGIN && !prev_state.empty_transition\n");
state.advance_index = false;
} else if (prev_state.empty_transition) {

View File

@@ -85,7 +85,7 @@ trie_t *trie_new_from_cstring_array(cstring_array *strings) {
char *key;
uint32_t i;
uint32_t next_id;
uint32_t next_id = 0;
trie_t *trie = trie_new();

View File

@@ -7,43 +7,44 @@
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
#include <malloc.h>
static inline void *aligned_malloc(size_t size, size_t alignment) {
return _aligned_malloc(size, alignment);
}
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment) {
return _aligned_realloc(p, new_size, alignment);
}
static inline void aligned_free(void *p) {
_aligned_free(p);
}
#else
#include <stdlib.h>
static inline void *_aligned_malloc(size_t size, size_t alignment)
static inline void *aligned_malloc(size_t size, size_t alignment)
{
void *p;
int ret = posix_memalign(&p, alignment, size);
return (ret == 0) ? p : NULL;
}
static inline void *_aligned_realloc(void *p, size_t size, size_t alignment)
static inline void *aligned_resize(void *p, size_t old_size, size_t new_size, size_t alignment)
{
if ((alignment == 0) || ((alignment & (alignment - 1)) != 0) || (alignment < sizeof(void *))) {
return NULL;
}
if (size == 0) {
if (p == NULL) {
return NULL;
}
void *rp = realloc(p, size);
/* If realloc result is not already at an aligned boundary,
_aligned_malloc a new block and copy the contents of the realloc'd
pointer to the aligned block, free the realloc'd pointer and return
the aligned pointer.
*/
if ( ((size_t)rp & (alignment - 1)) != 0) {
void *p1 = _aligned_malloc(size, alignment);
if (p1 != NULL) {
memcpy(p1, rp, size);
}
free(rp);
rp = p1;
void *p1 = aligned_malloc(new_size, alignment);
if (p1 == NULL) {
free(p);
return NULL;
}
return rp;
memcpy(p1, p, old_size);
free(p);
return p1;
}
static inline void _aligned_free(void *p)
static inline void aligned_free(void *p)
{
free(p);
}
@@ -79,7 +80,7 @@ static inline void _aligned_free(void *p)
name *array = malloc(sizeof(name)); \
if (array == NULL) return NULL; \
array->n = array->m = 0; \
array->a = _aligned_malloc(size * sizeof(type), alignment); \
array->a = aligned_malloc(size * sizeof(type), alignment); \
if (array->a == NULL) return NULL; \
array->m = size; \
return array; \
@@ -94,7 +95,7 @@ static inline void _aligned_free(void *p)
} \
static inline bool name##_resize_aligned(name *array, size_t size, size_t alignment) { \
if (size <= array->m) return true; \
type *ptr = _aligned_realloc(array->a, sizeof(type) * size, alignment); \
type *ptr = aligned_resize(array->a, sizeof(type) * array->m, sizeof(type) * size, alignment); \
if (ptr == NULL) return false; \
array->a = ptr; \
array->m = size; \
@@ -160,7 +161,7 @@ static inline void _aligned_free(void *p)
} \
static inline void name##_destroy_aligned(name *array) { \
if (array == NULL) return; \
if (array->a != NULL) _aligned_free(array->a); \
if (array->a != NULL) aligned_free(array->a); \
free(array); \
}
@@ -182,7 +183,7 @@ static inline void _aligned_free(void *p)
free_func(array->a[i]); \
} \
} \
_aligned_free(array->a); \
aligned_free(array->a); \
free(array); \
}

View File

@@ -8,7 +8,7 @@
#define ks_lt_index(a, b) ((a).value < (b).value)
#ifdef USE_SSE
#if defined(USE_SSE)
#include <emmintrin.h>
#endif
@@ -338,7 +338,7 @@
#ifdef USE_SSE
#if defined(USE_SSE)
/*
From https://github.com/herumi/fmath/blob/master/fastexp.cpp

View File

@@ -96,13 +96,13 @@ static greatest_test_res test_expansion_contains_phrase_option_with_languages(ch
static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) {
bool root = false;
va_list args;
if (num_languages > 0) {
va_list args;
va_start(args, num_languages);
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
va_end(args);
} else {
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL));
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
}
PASS();
}
@@ -110,13 +110,13 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha
static greatest_test_res test_root_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) {
bool root = true;
va_list args;
if (num_languages > 0) {
va_list args;
va_start(args, num_languages);
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
va_end(args);
} else {
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL));
CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args));
}
PASS();
}
@@ -132,6 +132,9 @@ TEST test_expansions(void) {
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs-elysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs elysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champselysees", options, 1, "fr"));
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));
CHECK_CALL(test_expansion_contains_with_languages("มงแตร", "มงแตร", options, 1, "th"));
@@ -182,6 +185,9 @@ TEST test_street_root_expansions(void) {
CHECK_CALL(test_root_expansion_contains("Center Street E", "center", options));
CHECK_CALL(test_root_expansion_contains("Ctr Street E", "center", options));
CHECK_CALL(test_root_expansion_contains_with_languages("W. UNION STREET", "union", options, 2, "en", "es"));
// Spanish
CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options));
PASS();

1
versions/base_data Normal file
View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -0,0 +1 @@
v1.0.0

1
versions/parser Normal file
View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -0,0 +1 @@
v1.0.0

1
versions/senzing/parser Normal file
View File

@@ -0,0 +1 @@
v1.0.0

View File

@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
@@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t])
# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])
DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)
AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
AC_CONFIG_FILES([Makefile
libpostal.pc
src/Makefile
test/Makefile])
src/libpostal_data
test/Makefile], [chmod +x src/libpostal_data])
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
@@ -64,6 +75,7 @@ AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf avail
# ------------------------------------------------------------------
# Checks for SSE2 build
# ------------------------------------------------------------------
AC_ARG_ENABLE([sse2],
AS_HELP_STRING(
[--disable-sse2],