diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp
index cc34a914545d..dabd55a25850 100644
--- a/ydb/core/base/fulltext.cpp
+++ b/ydb/core/base/fulltext.cpp
@@ -135,6 +135,41 @@ namespace {
         return length;
     }
 
+    // Appends to `ngrams` every substring of `token` whose length in UTF-8 symbols
+    // is within [lengthMin, lengthMax]. Substrings are produced start position by
+    // start position, shortest first. When `edge` is set, only substrings that
+    // begin at the first symbol of the token (prefixes) are produced.
+    // If a symbol fails to decode, asserts and stops; n-grams appended so far are kept.
+    void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector<TString>& ngrams) {
+        const unsigned char* ngramBegin = reinterpret_cast<const unsigned char*>(token.data());
+        const unsigned char* const end = ngramBegin + token.size();
+        wchar32 symbol = 0;
+        size_t symbolBytes = 0;
+
+        while (ngramBegin < end) {
+            const unsigned char* ngramEnd = ngramBegin;
+            // Stop growing the window at lengthMax symbols: longer candidates can never be emitted.
+            for (size_t ngramLength = 1; ngramEnd < end && ngramLength <= lengthMax; ++ngramLength) {
+                if (SafeReadUTF8Char(symbol, symbolBytes, ngramEnd, end) != RECODE_OK) {
+                    Y_ASSERT(false); // should already be validated during tokenization
+                    return;
+                }
+                ngramEnd += symbolBytes;
+                if (ngramLength >= lengthMin) {
+                    ngrams.emplace_back(reinterpret_cast<const char*>(ngramBegin), ngramEnd - ngramBegin);
+                }
+            }
+            if (edge) {
+                break; // only prefixes
+            }
+            if (SafeReadUTF8Char(symbol, symbolBytes, ngramBegin, end) != RECODE_OK) {
+                Y_ASSERT(false); // should already be validated during tokenization
+                return;
+            }
+            ngramBegin += symbolBytes;
+        }
+    }
+
     bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
         if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
             error = "tokenizer should be set";
@@ -151,21 +186,38 @@ namespace {
             return false;
         }
 
-        if (settings.use_filter_ngram()) {
-            error = "Unsupported use_filter_ngram setting";
-            return false;
-        }
-        if (settings.use_filter_edge_ngram()) {
-            error = "Unsupported use_filter_edge_ngram setting";
-            return false;
-        }
-        if (settings.has_filter_ngram_min_length()) {
-            error = "Unsupported filter_ngram_min_length setting";
-            return false;
-        }
-        if (settings.has_filter_ngram_max_length()) {
-            error = "Unsupported filter_ngram_max_length setting";
-            return false;
+        if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
+            if (settings.use_filter_ngram() && settings.use_filter_edge_ngram()) {
+                error = "only one of use_filter_ngram or use_filter_edge_ngram should be set, not both";
+                return false;
+            }
+            if (!settings.has_filter_ngram_min_length()) {
+                error = "filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram";
+                return false;
+            }
+            if (!settings.has_filter_ngram_max_length()) {
+                error = "filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram";
+                return false;
+            }
+            if (!ValidateSettingInRange("filter_ngram_min_length", settings.filter_ngram_min_length(), 1, 20, error)) {
+                return false;
+            }
+            if (!ValidateSettingInRange("filter_ngram_max_length", settings.filter_ngram_max_length(), 1, 20, error)) {
+                return false;
+            }
+            if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) {
+                error = "Invalid filter_ngram_min_length: should be less than or equal to filter_ngram_max_length";
+                return false;
+            }
+        } else {
+            if (settings.has_filter_ngram_min_length()) {
+                error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length";
+                return false;
+            }
+            if (settings.has_filter_ngram_max_length()) {
+                error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length";
+                return false;
+            }
         }
 
         if (settings.use_filter_length()) {
@@ -180,7 +232,7 @@ namespace {
                 return false;
             }
             if (settings.has_filter_length_min() && settings.has_filter_length_max() && settings.filter_length_min() > settings.filter_length_max()) {
-                error = "Invalid filter_length_min: should be less or equal than filter_length_max";
+                error = "Invalid filter_length_min: should be less than or equal to filter_length_max";
                 return false;
             }
         } else {
@@ -220,6 +272,14 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
         }), tokens.end());
     }
 
+    if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
+        TVector<TString> ngrams;
+        for (const auto& token : tokens) {
+            BuildNgrams(token, settings.filter_ngram_min_length(), settings.filter_ngram_max_length(), settings.use_filter_edge_ngram(), ngrams);
+        }
+        tokens.swap(ngrams);
+    }
+
     return tokens;
 }
 
diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp
index eda860588f54..a9e9ab058854 100644
--- a/ydb/core/base/ut/fulltext_ut.cpp
+++ b/ydb/core/base/ut/fulltext_ut.cpp
@@ -69,7 +69,7 @@ Y_UNIT_TEST_SUITE(NFulltext) {
 
         columnAnalyzers->set_filter_length_max(3);
         UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
-        UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less or equal than filter_length_max");
+        UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less than or equal to filter_length_max");
 
         columnAnalyzers->set_filter_length_min(-5);
         UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
@@ -243,6 +243,33 @@ Y_UNIT_TEST_SUITE(NFulltext) {
         analyzers.clear_filter_length_min();
         UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"кот", "ест", "день"}));
     }
+
+    Y_UNIT_TEST(AnalyzeFilterNgram) {
+        Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
+        analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
+        TString text = "это текст";
+
+        UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"это", "текст"}));
+
+        analyzers.set_use_filter_ngram(true);
+        analyzers.set_filter_ngram_min_length(2);
+        analyzers.set_filter_ngram_max_length(3);
+        UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "то", "те", "тек", "ек", "екс", "кс", "кст", "ст"}));
+
+        analyzers.set_filter_ngram_min_length(4);
+        analyzers.set_filter_ngram_max_length(10);
+        UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{"слов", "слово", "лово"}));
+
+        analyzers.set_filter_ngram_min_length(10);
+        analyzers.set_filter_ngram_max_length(10);
+        UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{}));
+
+        analyzers.set_use_filter_ngram(false);
+        analyzers.set_use_filter_edge_ngram(true);
+        analyzers.set_filter_ngram_min_length(2);
+        analyzers.set_filter_ngram_max_length(3);
+        UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
+    }
 }
 
 }