From abebbd3dc6ec87118af5612800fcb7b9af281ed6 Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 13:38:47 +0300 Subject: [PATCH 1/6] Support fulltext ngrams --- ydb/core/base/fulltext.cpp | 87 ++++++++++++++++++++++++++------ ydb/core/base/ut/fulltext_ut.cpp | 13 +++++ 2 files changed, 85 insertions(+), 15 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index cc34a914545d..0ef6a429a018 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -135,6 +135,38 @@ namespace { return length; } + void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector& ngrams) { + TVector characters; + + const unsigned char* ptr = (const unsigned char*)token.data(); + const unsigned char* end = ptr + token.size(); + wchar32 symbol; + size_t symbolBytes; + while (ptr < end) { + if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) { + Y_ASSERT(false); // should already be validated during tokenization + return; + } + characters.push_back(symbol); + ptr += symbolBytes; + } + + for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) { + for (size_t start : xrange(0, characters.size() - len + 1)) { + TVector ngram(len * 4); + unsigned char* ptr = (unsigned char*)ngram.data(); + for (size_t i : xrange(len)) { + WriteUTF8Char(characters[start + i], symbolBytes, ptr); + ptr += symbolBytes; + } + ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data()); + if (edge) { + break; // only prefixes + } + } + } + } + bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) { if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) { error = "tokenizer should be set"; @@ -151,21 +183,38 @@ namespace { return false; } - if (settings.use_filter_ngram()) { - error = "Unsupported use_filter_ngram setting"; - return false; - } - if (settings.use_filter_edge_ngram()) { - error = "Unsupported use_filter_edge_ngram setting"; - return false; - } - if (settings.has_filter_ngram_min_length()) { - error = "Unsupported filter_ngram_min_length setting"; - return false; - } - if (settings.has_filter_ngram_max_length()) { - error = "Unsupported filter_ngram_max_length setting"; - return false; + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { + if (settings.use_filter_ngram() && settings.use_filter_edge_ngram()) { + error = "only one of use_filter_ngram or use_filter_edge_ngram should be set, not both"; + return false; + } + if (!settings.has_filter_ngram_min_length()) { + error = "filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram"; + return false; + } + if (!settings.has_filter_ngram_max_length()) { + error = "filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram"; + return false; + } + if (!ValidateSettingInRange("filter_ngram_min_length", settings.filter_ngram_min_length(), 1, 20, error)) { + return false; + } + if (!ValidateSettingInRange("filter_ngram_max_length", settings.filter_ngram_max_length(), 1, 20, error)) { + return false; + } + if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) { + error = "Invalid filter_ngram_min_length: should be less or equal than filter_ngram_max_length"; + return false; + } + } else { + if (settings.has_filter_ngram_min_length()) { + error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length"; + return false; + } + if (settings.has_filter_ngram_max_length()) { + error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length"; + return false; + } } if (settings.use_filter_length()) { @@ -220,6 +269,14 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet }), tokens.end()); } + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { + TVector ngrams; + for (const auto& token : tokens) { + BuildNgrams(token, settings.filter_ngram_min_length(), settings.filter_ngram_max_length(), settings.use_filter_edge_ngram(), ngrams); + } + tokens.swap(ngrams); + } + return tokens; } diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index eda860588f54..b3a840df3b47 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -243,6 +243,19 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.clear_filter_length_min(); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"кот", "ест", "день"})); } + + Y_UNIT_TEST(AnalyzeFilterNgram) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + TString text = "это текст"; + + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"это", "текст"})); + + analyzers.set_use_filter_ngram(true); + analyzers.set_filter_ngram_min_length(2); + analyzers.set_filter_ngram_max_length(3); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"})); + } } } From 4189c2d10fc0b81961fd8478e8811077d08dfcf4 Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 13:46:13 +0300 Subject: [PATCH 2/6] + edge test --- ydb/core/base/ut/fulltext_ut.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index b3a840df3b47..076940bc55a0 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -255,6 +255,12 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.set_filter_ngram_min_length(2); analyzers.set_filter_ngram_max_length(3); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"})); + + analyzers.set_use_filter_ngram(false); + analyzers.set_use_filter_edge_ngram(true); + analyzers.set_filter_ngram_min_length(2); + analyzers.set_filter_ngram_max_length(3); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "это", "те", "тек"})); } } From 4f6ecbd44ec5221fc548f6045d4421ad6c5fc8ff Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 13:55:18 +0300 Subject: [PATCH 3/6] fixes --- ydb/core/base/fulltext.cpp | 6 +++--- ydb/core/base/ut/fulltext_ut.cpp | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 0ef6a429a018..af2085996321 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -151,9 +151,9 @@ namespace { ptr += symbolBytes; } + TVector ngram(token.size()); for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) { for (size_t start : xrange(0, characters.size() - len + 1)) { - TVector ngram(len * 4); unsigned char* ptr = (unsigned char*)ngram.data(); for (size_t i : xrange(len)) { WriteUTF8Char(characters[start + i], symbolBytes, ptr); @@ -203,7 +203,7 @@ namespace { return false; } if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) { - error = "Invalid filter_ngram_min_length: should be less or equal than filter_ngram_max_length"; + error = "Invalid filter_ngram_min_length: should be less than or equal to filter_ngram_max_length"; return false; } } else { @@ -229,7 +229,7 @@ namespace { return false; } if (settings.has_filter_length_min() && settings.has_filter_length_max() && settings.filter_length_min() > settings.filter_length_max()) { - error = "Invalid filter_length_min: should be less or equal than filter_length_max"; + error = "Invalid filter_length_min: should be less than or equal to filter_length_max"; return false; } } else { diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index 076940bc55a0..fe6a855169f0 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -69,7 +69,7 @@ Y_UNIT_TEST_SUITE(NFulltext) { columnAnalyzers->set_filter_length_max(3); UNIT_ASSERT_C(!ValidateSettings(settings, error), error); - UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less or equal than filter_length_max"); + UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less than or equal to filter_length_max"); columnAnalyzers->set_filter_length_min(-5); UNIT_ASSERT_C(!ValidateSettings(settings, error), error); @@ -256,6 +256,14 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.set_filter_ngram_max_length(3); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"})); + analyzers.set_filter_ngram_min_length(4); + analyzers.set_filter_ngram_max_length(10); + UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector{"слов", "лово", "слово"})); + + analyzers.set_filter_ngram_min_length(10); + analyzers.set_filter_ngram_max_length(10); + UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector{})); + analyzers.set_use_filter_ngram(false); analyzers.set_use_filter_edge_ngram(true); analyzers.set_filter_ngram_min_length(2); From f61f5f863ff5844d418dbff86184fa45214fce17 Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 14:39:21 +0300 Subject: [PATCH 4/6] safe write --- ydb/core/base/fulltext.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index af2085996321..3242432614c8 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -156,7 +156,10 @@ namespace { for (size_t start : xrange(0, characters.size() - len + 1)) { unsigned char* ptr = (unsigned char*)ngram.data(); for (size_t i : xrange(len)) { - WriteUTF8Char(characters[start + i], symbolBytes, ptr); + if (SafeWriteUTF8Char(characters[start + i], symbolBytes, ptr, ngram.end()) != RECODE_OK) { + Y_ASSERT(false); // should fit + return; + } ptr += symbolBytes; } ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data()); From ddbf9918c9c40c29d68f91adae4f8ca11ee996a9 Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 17:31:57 +0300 Subject: [PATCH 5/6] cr: do not allocate --- ydb/core/base/fulltext.cpp | 47 +++++++++++++++----------------- ydb/core/base/ut/fulltext_ut.cpp | 4 +-- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 3242432614c8..81169ee25f36 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -136,37 +136,34 @@ namespace { } void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector& ngrams) { - TVector characters; - - const unsigned char* ptr = (const unsigned char*)token.data(); - const unsigned char* end = ptr + token.size(); + const unsigned char* ngram_begin_ptr = (const unsigned char*)token.data(); + const unsigned char* end = ngram_begin_ptr + token.size(); wchar32 symbol; size_t symbolBytes; - while (ptr < end) { - if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) { - Y_ASSERT(false); // should already be validated during tokenization - return; - } - characters.push_back(symbol); - ptr += symbolBytes; - } - TVector ngram(token.size()); - for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) { - for (size_t start : xrange(0, characters.size() - len + 1)) { - unsigned char* ptr = (unsigned char*)ngram.data(); - for (size_t i : xrange(len)) { - if (SafeWriteUTF8Char(characters[start + i], symbolBytes, ptr, ngram.end()) != RECODE_OK) { - Y_ASSERT(false); // should fit - return; - } - ptr += symbolBytes; + while (ngram_begin_ptr < end) { + const unsigned char* ngram_end_ptr = ngram_begin_ptr; + size_t ngram_length = 0; + while (ngram_end_ptr < end) { + if (SafeReadUTF8Char(symbol, symbolBytes, ngram_end_ptr, end) != RECODE_OK) { + Y_ASSERT(false); // should already be validated during tokenization + return; } - ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data()); - if (edge) { - break; // only prefixes + ngram_length++; + ngram_end_ptr += symbolBytes; + + if (lengthMin <= ngram_length && ngram_length <= lengthMax) { + ngrams.emplace_back((const char*)ngram_begin_ptr, ngram_end_ptr - ngram_begin_ptr); } } + if (edge) { + break; // only prefixes + } + if (SafeReadUTF8Char(symbol, symbolBytes, ngram_begin_ptr, end) != RECODE_OK) { + Y_ASSERT(false); // should already be validated during tokenization + return; + } + ngram_begin_ptr += symbolBytes; } } diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index fe6a855169f0..a9e9ab058854 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -254,11 +254,11 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.set_use_filter_ngram(true); analyzers.set_filter_ngram_min_length(2); analyzers.set_filter_ngram_max_length(3); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"})); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "это", "то", "те", "тек", "ек", "екс", "кс", "кст", "ст"})); analyzers.set_filter_ngram_min_length(4); analyzers.set_filter_ngram_max_length(10); - UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector{"слов", "лово", "слово"})); + UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector{"слов", "слово", "лово"})); analyzers.set_filter_ngram_min_length(10); analyzers.set_filter_ngram_max_length(10); From f10023174585e1f3e64dac76a090d0c070f4c2a5 Mon Sep 17 00:00:00 2001 From: kungasc Date: Fri, 31 Oct 2025 17:33:23 +0300 Subject: [PATCH 6/6] fix empty line --- ydb/core/base/fulltext.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 81169ee25f36..dabd55a25850 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -151,7 +151,6 @@ namespace { } ngram_length++; ngram_end_ptr += symbolBytes; - if (lengthMin <= ngram_length && ngram_length <= lengthMax) { ngrams.emplace_back((const char*)ngram_begin_ptr, ngram_end_ptr - ngram_begin_ptr); }