Skip to content

Commit 6e1644d

Browse files
authored
Merge pull request #67 from rinigus/nominatim
Import data through Nominatim
2 parents 822fc52 + 5081ded commit 6e1644d

24 files changed

+1436
-3868
lines changed

.clang-format

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
# We'll use defaults from the GNU style, but with a 100-column line limit.
3+
BasedOnStyle: GNU
4+
ColumnLimit: 100
5+
---
6+
Language: Cpp
7+
AllowShortFunctionsOnASingleLine: Inline
8+
AlwaysBreakAfterDefinitionReturnType: None
9+
AlwaysBreakAfterReturnType: None
10+
SpaceBeforeParens: ControlStatements
11+
12+
AlignConsecutiveAssignments: Consecutive
13+
AlignConsecutiveDeclarations: Consecutive
14+
AlignConsecutiveDeclarations: Consecutive

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,7 @@
1515
*~
1616

1717
/*.pro.user
18+
build/
19+
.vscode
20+
*.code-workspace
21+
CMakeLists.txt.user

CMakeLists.txt

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,22 @@ project(geocoder-nlp
55
DESCRIPTION "Geocoder NLP")
66

77
set(CMAKE_INCLUDE_CURRENT_DIR ON)
8-
set(CMAKE_CXX_STANDARD 11)
8+
set(CMAKE_CXX_STANDARD 17)
99
set(CMAKE_CXX_STANDARD_REQUIRED True)
1010

1111
include(FindPkgConfig)
1212
include(FeatureSummary)
1313
include(GNUInstallDirs)
1414

1515
find_package(PkgConfig REQUIRED)
16+
find_package(nlohmann_json 3.2.0 REQUIRED)
17+
find_package(Boost 1.30 COMPONENTS program_options REQUIRED)
1618

1719
pkg_check_modules(MARISA marisa IMPORTED_TARGET)
1820
pkg_check_modules(KYOTOCABINET kyotocabinet IMPORTED_TARGET)
19-
pkg_check_modules(POSTAL postal IMPORTED_TARGET)
21+
pkg_check_modules(POSTAL libpostal IMPORTED_TARGET)
2022
pkg_check_modules(SQLITE3 sqlite3 IMPORTED_TARGET)
23+
pkg_check_modules(LIBPQXX libpqxx IMPORTED_TARGET)
2124

2225
set(SRC
2326
src/geocoder.cpp
@@ -31,31 +34,70 @@ set(HEAD
3134
include_directories(thirdparty/sqlite3pp/headeronly_src)
3235
include_directories(src)
3336

37+
# boost
38+
include_directories(${Boost_INCLUDE_DIR})
39+
40+
# importer
41+
set(IMPSRC
42+
importer/src/config.h
43+
importer/src/main.cpp
44+
importer/src/hierarchy.cpp
45+
importer/src/hierarchy.h
46+
importer/src/hierarchyitem.cpp
47+
importer/src/hierarchyitem.h
48+
importer/src/normalization.cpp
49+
importer/src/normalization.h
50+
importer/src/utils.cpp
51+
importer/src/utils.h
52+
)
53+
add_executable(geocoder-importer ${SRC} ${HEAD} ${IMPSRC})
54+
target_link_libraries(geocoder-importer
55+
PkgConfig::MARISA
56+
PkgConfig::KYOTOCABINET
57+
PkgConfig::POSTAL
58+
PkgConfig::SQLITE3
59+
PkgConfig::LIBPQXX
60+
nlohmann_json::nlohmann_json
61+
${Boost_LIBRARIES})
62+
3463
# demo codes
3564
add_executable(geocoder-nlp
3665
demo/geocoder-nlp.cpp
3766
${SRC}
3867
${HEAD})
3968

4069
target_link_libraries(geocoder-nlp
41-
-lmarisa -lkyotocabinet -lpostal -lsqlite3)
70+
PkgConfig::MARISA
71+
PkgConfig::KYOTOCABINET
72+
PkgConfig::POSTAL
73+
PkgConfig::SQLITE3)
4274

4375
add_executable(nearby-line
4476
demo/nearby-line.cpp
4577
${SRC}
4678
${HEAD})
4779

4880
target_link_libraries(nearby-line
49-
-lmarisa -lkyotocabinet -lpostal -lsqlite3)
81+
PkgConfig::MARISA
82+
PkgConfig::KYOTOCABINET
83+
PkgConfig::POSTAL
84+
PkgConfig::SQLITE3)
5085

5186
add_executable(nearby-point
5287
demo/nearby-point.cpp
5388
${SRC}
5489
${HEAD})
5590

5691
target_link_libraries(nearby-point
57-
-lmarisa -lkyotocabinet -lpostal -lsqlite3)
92+
PkgConfig::MARISA
93+
PkgConfig::KYOTOCABINET
94+
PkgConfig::POSTAL
95+
PkgConfig::SQLITE3)
5896

97+
# install
98+
install(TARGETS geocoder-importer
99+
DESTINATION ${CMAKE_INSTALL_BINDIR})
59100

101+
# summary
60102
feature_summary(WHAT ALL FATAL_ON_MISSING_REQUIRED_PACKAGES)
61103

Database.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Geocoder NLP database format
2+
3+
The geocoder database consists of several files which are expected to be in the
4+
same directory. All locations are described using a single coordinate to keep the
5+
files as small as possible.
6+
7+
The files composing a database are:
8+
9+
1. geonlp-primary.sqlite: SQLite database with location description and coordinate
10+
2. geonlp-normalized.trie: MARISA database with normalized strings
11+
3. geonlp-normalized-id.kch: Kyoto Cabinet database for linking MARISA and primary IDs
12+
13+
## geonlp-primary.sqlite
14+
15+
The SQLite database contains location descriptions and their organization into a hierarchy
16+
of objects.
17+
18+
Table `object_primary` keeps location description. In this table, objects are
19+
stored sequentially (in terms of their `id`) according to the positioning in the
20+
object hierarchy with the children stored after parents. Table `hierarchy` has a
21+
record for each item (`id` from `object_primary`) with the children consisting
22+
of parent ID (`prim_id`) and the ID of the last child (`last_subobject`).
23+
24+
Object types are stored separately in `type` table with the type ID used in
25+
`object_primary`.
26+
27+
Spatial queries are indexed using R-Tree with `box_id` used as a reference in
28+
`object_primary`. Namely, as all objects are stored as points, for storage
29+
efficiency, objects next to each other are set to have the same `box_id` and are
30+
found through `-rtree` tables.
31+
32+
Table `meta` keeps database format version and is used to check version
33+
compatibility.
34+
35+
## geonlp-normalized.trie
36+
37+
All normalized strings are stored in a MARISA database
38+
(https://github.com/s-yata/marisa-trie). Normalized strings are formed from
39+
`name` and other similar fields of `object_primary` table in
40+
`geonlp-primary.sqlite`. All strings are pushed into the MARISA database, which
41+
assigns an internal ID to each of the strings.
42+
43+
## geonlp-normalized-id.kch
44+
45+
Kyoto Cabinet (https://dbmx.net/kyotocabinet/) database for linking MARISA and
46+
primary IDs. Hash database variant is used where `key` is an ID provided by
47+
MARISA for a search string and value is an array of bytes consisting of
48+
`object_primary` IDs stored as `uint32_t` one after another. The array is stored
49+
using `std::string`.

Makefile

Lines changed: 0 additions & 100 deletions
This file was deleted.

README.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
# geocoder-nlp
1+
# Geocoder NLP
22

3-
This is a geocoder C++ library that uses libpostal to parse the user
3+
This is a geocoder C++ library that targets offline use by mobile
4+
applications. It is able to perform forward and reverse geocoding.
5+
For forward geocoding, it uses libpostal to parse the user
46
request, normalize the parsed result, and search for the match in
5-
geocoder database.
7+
geocoder database. In addition to traditional reverse geocoding, it is
8+
able to find points of interest close to the reference point or line.
69

710
The library includes a demo program showing how to use it. It's also used
811
as one of the geocoders in OSM Scout Server
@@ -29,7 +32,7 @@ libraries mentioned above.
2932
## Databases
3033

3134
At present, the datasets required for the geocoder to function are distributed
32-
as a part of OSM Scout Server datasets .
35+
as a part of OSM Scout Server datasets.
3336

3437
If you use the geocoder with the full libpostal installation, you don't need to
3538
get the libpostal datasets from that location, but can use the datasets
@@ -43,8 +46,10 @@ To use country-specific datasets, you would have to get:
4346
In addition, the prepared geocoder databases are available at
4447
geocoder/SELECT THE NEEDED ONES.
4548

49+
Database format is described in [separate document](Database.md).
50+
4651
## Acknowledgments
4752

48-
libpostal: https://github.com/openvenues/libpostal
53+
libpostal: Used for input parsing; https://github.com/openvenues/libpostal
4954

50-
libosmscout: http://libosmscout.sourceforge.net
55+
Nominatim: Used for data import; https://nominatim.org/

importer/Makefile

Lines changed: 0 additions & 74 deletions
This file was deleted.

0 commit comments

Comments
 (0)