Skip to content

Commit 5669466

Browse files
committed
Use normalized but non-expanded string in search
Related to #64
1 parent 9fa6fd1 commit 5669466

File tree

2 files changed

+28
-9
lines changed

2 files changed

+28
-9
lines changed

importer/src/main.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,22 @@ void normalize_libpostal(sqlite3pp::database& db, std::string address_expansion_
688688
continue;
689689
}
690690

691+
// insert normalized, but not expanded string
692+
{
693+
char *normalized = libpostal_normalize_string(charbuff.data(), LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS);
694+
sqlite3pp::command cmd(db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)");
695+
std::string s = normalized;
696+
cmd.binder() << d.id
697+
<< s;
698+
if (cmd.execute() != SQLITE_OK)
699+
{
700+
// std::cerr << "Error inserting: " << d.id << " " << s << std::endl;
701+
num_doubles_dropped++;
702+
}
703+
704+
free(normalized);
705+
}
706+
691707
char **expansions = libpostal_expand_address(charbuff.data(), options, &num_expansions);
692708

693709
if ( num_expansions > MAX_NUMBER_OF_EXPANSIONS )

src/postal.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <functional>
99
#include <cctype>
1010
#include <locale>
11+
#include <set>
1112

1213
#include <string.h>
1314

@@ -363,26 +364,29 @@ void Postal::expand(const Postal::ParseResult &input, std::vector<Postal::ParseR
363364

364365
std::vector< std::vector< std::string > > address_expansions;
365366
std::vector< std::string > address_keys;
366-
for (const auto i: input)
367+
for (const auto &i: input)
367368
{
368369
// in practice, it's only one element in ParseResult at this stage
369-
for (const std::string tonorm: i.second)
370+
for (const std::string &tonorm: i.second)
370371
{
371-
std::vector< std::string > norm;
372+
std::set< std::string > norm;
373+
// always add unexpanded result into address expansions
374+
// this will help with the partial entries as described in
375+
// issue #64 https://github.com/rinigus/geocoder-nlp/issues/64
376+
norm.insert(tonorm);
372377
// postal codes are kept as-is: no need to normalize and expand them
373-
if (i.first == ADDRESS_PARSER_LABEL_POSTAL_CODE || i.first == PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY)
374-
norm.push_back(tonorm);
375-
else
378+
if (i.first != ADDRESS_PARSER_LABEL_POSTAL_CODE && i.first != PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY)
376379
{
377380
charbuff.resize(tonorm.length() + 1);
378381
std::copy(tonorm.c_str(), tonorm.c_str() + tonorm.length() + 1, charbuff.begin());
379382
char **expansions = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions);
380383
for (size_t j = 0; j < num_expansions; j++)
381-
norm.push_back(expansions[j]);
384+
norm.insert(expansions[j]);
382385

383386
libpostal_expansion_array_destroy(expansions, num_expansions);
384387
}
385-
address_expansions.push_back(norm);
388+
address_expansions.push_back(std::vector< std::string >(norm.begin(),
389+
norm.end()));
386390
address_keys.push_back(i.first);
387391
}
388392
}
@@ -417,7 +421,6 @@ void Postal::expand_string(const std::string &input, std::vector<std::string> &e
417421
std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin());
418422

419423
char **expansions_cstr = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions);
420-
std::vector< std::string > norm;
421424
for (size_t j = 0; j < num_expansions; j++)
422425
expansions.push_back(expansions_cstr[j]);
423426

0 commit comments

Comments
 (0)