Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 72 additions & 15 deletions ydb/core/base/fulltext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,38 @@ namespace {
return length;
}

void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector<TString>& ngrams) {
TVector<wchar32> characters;

const unsigned char* ptr = (const unsigned char*)token.data();
const unsigned char* end = ptr + token.size();
wchar32 symbol;
size_t symbolBytes;
while (ptr < end) {
if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) {
Y_ASSERT(false); // should already be validated during tokenization
return;
}
characters.push_back(symbol);
ptr += symbolBytes;
}

for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) {
for (size_t start : xrange<size_t>(0, characters.size() - len + 1)) {
TVector<unsigned char> ngram(len * 4);
unsigned char* ptr = (unsigned char*)ngram.data();
for (size_t i : xrange(len)) {
WriteUTF8Char(characters[start + i], symbolBytes, ptr);
ptr += symbolBytes;
}
ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data());
if (edge) {
break; // only prefixes
}
}
}
}

bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
error = "tokenizer should be set";
Expand All @@ -151,21 +183,38 @@ namespace {
return false;
}

if (settings.use_filter_ngram()) {
error = "Unsupported use_filter_ngram setting";
return false;
}
if (settings.use_filter_edge_ngram()) {
error = "Unsupported use_filter_edge_ngram setting";
return false;
}
if (settings.has_filter_ngram_min_length()) {
error = "Unsupported filter_ngram_min_length setting";
return false;
}
if (settings.has_filter_ngram_max_length()) {
error = "Unsupported filter_ngram_max_length setting";
return false;
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
if (settings.use_filter_ngram() && settings.use_filter_edge_ngram()) {
error = "only one of use_filter_ngram or use_filter_edge_ngram should be set, not both";
return false;
}
if (!settings.has_filter_ngram_min_length()) {
error = "filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram";
return false;
}
if (!settings.has_filter_ngram_max_length()) {
error = "filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram";
return false;
}
if (!ValidateSettingInRange("filter_ngram_min_length", settings.filter_ngram_min_length(), 1, 20, error)) {
return false;
}
if (!ValidateSettingInRange("filter_ngram_max_length", settings.filter_ngram_max_length(), 1, 20, error)) {
return false;
}
if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) {
error = "Invalid filter_ngram_min_length: should be less or equal than filter_ngram_max_length";
return false;
}
} else {
if (settings.has_filter_ngram_min_length()) {
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length";
return false;
}
if (settings.has_filter_ngram_max_length()) {
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length";
return false;
}
}

if (settings.use_filter_length()) {
Expand Down Expand Up @@ -220,6 +269,14 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
}), tokens.end());
}

if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
TVector<TString> ngrams;
for (const auto& token : tokens) {
BuildNgrams(token, settings.filter_ngram_min_length(), settings.filter_ngram_max_length(), settings.use_filter_edge_ngram(), ngrams);
}
tokens.swap(ngrams);
}

return tokens;
}

Expand Down
13 changes: 13 additions & 0 deletions ydb/core/base/ut/fulltext_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,19 @@ Y_UNIT_TEST_SUITE(NFulltext) {
analyzers.clear_filter_length_min();
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"кот", "ест", "день"}));
}

// Checks the n-gram filters of Analyze() on a two-word Cyrillic text, so that
// n-gram lengths are verified to be counted in characters rather than bytes.
Y_UNIT_TEST(AnalyzeFilterNgram) {
    Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
    analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
    TString text = "это текст";

    // Without any n-gram filter the tokens are returned as is.
    UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"это", "текст"}));

    // Full n-grams: per token, all substrings of length 2, then all of length 3.
    analyzers.set_use_filter_ngram(true);
    analyzers.set_filter_ngram_min_length(2);
    analyzers.set_filter_ngram_max_length(3);
    UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"}));

    // Edge n-grams: per token, only the prefixes of length 2 and 3.
    analyzers.set_use_filter_ngram(false);
    analyzers.set_use_filter_edge_ngram(true);
    UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
}
}

}
Loading