89 changes: 73 additions & 16 deletions ydb/core/base/fulltext.cpp
@@ -135,6 +135,38 @@ namespace {
return length;
}

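// Builds all n-grams of token (counted in characters, not bytes) with lengths in
// [lengthMin, lengthMax]; with edge=true only the leading n-grams (prefixes) are kept.
// For example, "кот" with lengths 2..3 yields "ко", "от", "кот" (edge: "ко", "кот").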
void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector<TString>& ngrams) {
TVector<wchar32> characters;

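// Decode the UTF-8 token into code points so that n-gram boundaries fall on
// character boundaries rather than byte boundaries.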
const unsigned char* ptr = (const unsigned char*)token.data();
const unsigned char* end = ptr + token.size();
wchar32 symbol;
size_t symbolBytes;
while (ptr < end) {
if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) {
Y_ASSERT(false); // should already be validated during tokenization
return;
}
characters.push_back(symbol);
ptr += symbolBytes;
}

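// Reuse a single byte buffer sized for the whole token: every n-gram of the
// token fits into it, so no per-ngram allocation is needed.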
TVector<unsigned char> ngram(token.size());
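// For every allowed length, slide a window over the characters and re-encode
// the window back to UTF-8 bytes.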
for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) {
for (size_t start : xrange<size_t>(0, characters.size() - len + 1)) {
unsigned char* ptr = (unsigned char*)ngram.data();
for (size_t i : xrange(len)) {
WriteUTF8Char(characters[start + i], symbolBytes, ptr);
ptr += symbolBytes;
}
ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data());
if (edge) {
break; // only prefixes
}
}
}
}

bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
error = "tokenizer should be set";
@@ -151,21 +183,38 @@ namespace {
return false;
}

if (settings.use_filter_ngram()) {
error = "Unsupported use_filter_ngram setting";
return false;
}
if (settings.use_filter_edge_ngram()) {
error = "Unsupported use_filter_edge_ngram setting";
return false;
}
if (settings.has_filter_ngram_min_length()) {
error = "Unsupported filter_ngram_min_length setting";
return false;
}
if (settings.has_filter_ngram_max_length()) {
error = "Unsupported filter_ngram_max_length setting";
return false;
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
if (settings.use_filter_ngram() && settings.use_filter_edge_ngram()) {
error = "only one of use_filter_ngram or use_filter_edge_ngram should be set, not both";
return false;
}
if (!settings.has_filter_ngram_min_length()) {
error = "filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram";
return false;
}
if (!settings.has_filter_ngram_max_length()) {
error = "filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram";
return false;
}
if (!ValidateSettingInRange("filter_ngram_min_length", settings.filter_ngram_min_length(), 1, 20, error)) {
return false;
}
if (!ValidateSettingInRange("filter_ngram_max_length", settings.filter_ngram_max_length(), 1, 20, error)) {
return false;
}
if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) {
error = "Invalid filter_ngram_min_length: should be less than or equal to filter_ngram_max_length";
return false;
}
} else {
if (settings.has_filter_ngram_min_length()) {
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length";
return false;
}
if (settings.has_filter_ngram_max_length()) {
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length";
return false;
}
}

if (settings.use_filter_length()) {
@@ -180,7 +229,7 @@
return false;
}
if (settings.has_filter_length_min() && settings.has_filter_length_max() && settings.filter_length_min() > settings.filter_length_max()) {
error = "Invalid filter_length_min: should be less or equal than filter_length_max";
error = "Invalid filter_length_min: should be less than or equal to filter_length_max";
return false;
}
} else {
@@ -220,6 +269,14 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
}), tokens.end());
}

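// Replace each token with its n-grams (or, for edge n-grams, its prefixes only)
// within the configured length range.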
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
TVector<TString> ngrams;
for (const auto& token : tokens) {
BuildNgrams(token, settings.filter_ngram_min_length(), settings.filter_ngram_max_length(), settings.use_filter_edge_ngram(), ngrams);
}
tokens.swap(ngrams);
}

return tokens;
}

29 changes: 28 additions & 1 deletion ydb/core/base/ut/fulltext_ut.cpp
@@ -69,7 +69,7 @@ Y_UNIT_TEST_SUITE(NFulltext) {

columnAnalyzers->set_filter_length_max(3);
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less or equal than filter_length_max");
UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less than or equal to filter_length_max");

columnAnalyzers->set_filter_length_min(-5);
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
@@ -243,6 +243,33 @@ Y_UNIT_TEST_SUITE(NFulltext) {
analyzers.clear_filter_length_min();
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"кот", "ест", "день"}));
}

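// With use_filter_ngram / use_filter_edge_ngram the analyzer emits n-grams of each
// token instead of the tokens themselves (edge n-grams keep only the prefixes).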
Y_UNIT_TEST(AnalyzeFilterNgram) {
Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
TString text = "это текст";

UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"это", "текст"}));

analyzers.set_use_filter_ngram(true);
analyzers.set_filter_ngram_min_length(2);
analyzers.set_filter_ngram_max_length(3);
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"}));

analyzers.set_filter_ngram_min_length(4);
analyzers.set_filter_ngram_max_length(10);
UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{"слов", "лово", "слово"}));

analyzers.set_filter_ngram_min_length(10);
analyzers.set_filter_ngram_max_length(10);
UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{}));

analyzers.set_use_filter_ngram(false);
analyzers.set_use_filter_edge_ngram(true);
analyzers.set_filter_ngram_min_length(2);
analyzers.set_filter_ngram_max_length(3);
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
}
}

}