Skip to content

Commit d797251

Browse files
authored
Support Utf8 as a fulltext column (#27892)
1 parent b9b3351 commit d797251

File tree

12 files changed: 99 additions and 35 deletions.

ydb/core/base/table_index.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ TClusterId SetPostingParentFlag(TClusterId parent);
8585
}
8686

8787
namespace NFulltext {
88-
// TODO: support utf-8 in fulltext index
89-
inline constexpr auto TokenType = Ydb::Type::STRING;
90-
inline constexpr const char* TokenTypeName = "String";
91-
9288
inline constexpr const char* TokenColumn = "__ydb_token";
9389
}
9490

ydb/core/kqp/opt/kqp_type_ann.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1800,8 +1800,8 @@ TStatus AnnotateFulltextAnalyze(const TExprNode::TPtr& node, TExprContext& ctx)
18001800
return TStatus::Error;
18011801
}
18021802

1803-
// Return type: List<String>
1804-
auto stringType = ctx.MakeType<TDataExprType>(EDataSlot::String);
1803+
// Return type: List<String or Utf8>
1804+
auto stringType = ctx.MakeType<TDataExprType>(textDataType->GetSlot());
18051805
auto listType = ctx.MakeType<TListExprType>(stringType);
18061806
node->SetTypeAnn(listType);
18071807

ydb/core/kqp/runtime/kqp_program_builder.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ TRuntimeNode TKqpProgramBuilder::KqpIndexLookupJoin(const TRuntimeNode& input, c
356356

357357
TRuntimeNode TKqpProgramBuilder::FulltextAnalyze(TRuntimeNode text, TRuntimeNode settings)
358358
{
359-
// Validate text argument - should be a string or optional string
359+
// Validate text argument - should be a String or Utf8 or optional String or Utf8
360360
const auto& textType = text.GetStaticType();
361361
const TDataType* textDataType = nullptr;
362362

@@ -370,17 +370,18 @@ TRuntimeNode TKqpProgramBuilder::FulltextAnalyze(TRuntimeNode text, TRuntimeNode
370370
textDataType = static_cast<const TDataType*>(textType);
371371
}
372372

373-
MKQL_ENSURE(textDataType->GetSchemeType() == NUdf::TDataType<char*>::Id, "Expected string for text.");
373+
MKQL_ENSURE(textDataType->GetSchemeType() == NScheme::NTypeIds::String
374+
|| textDataType->GetSchemeType() == NScheme::NTypeIds::Utf8, "Expected String or Utf8 for text column.");
375+
376+
// Return type: List<String or Utf8>
377+
auto stringType = TDataType::Create(textDataType->GetSchemeType(), Env);
378+
auto listType = TListType::Create(stringType, Env);
374379

375380
// Validate settings argument - should be a string (serialized proto)
376381
const auto& settingsType = settings.GetStaticType();
377382
MKQL_ENSURE(settingsType->IsData(), "Expected data type for settings.");
378383
const auto& settingsTypeData = static_cast<const TDataType&>(*settingsType);
379-
MKQL_ENSURE(settingsTypeData.GetSchemeType() == NUdf::TDataType<char*>::Id, "Expected string for settings.");
380-
381-
// Return type: List<String>
382-
auto stringType = TDataType::Create(NUdf::TDataType<char*>::Id, Env);
383-
auto listType = TListType::Create(stringType, Env);
384+
MKQL_ENSURE(settingsTypeData.GetSchemeType() == NScheme::NTypeIds::String, "Expected string for settings.");
384385

385386
TCallableBuilder callableBuilder(Env, __func__, listType);
386387
callableBuilder.Add(text);

ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,6 +1949,66 @@ Y_UNIT_TEST(NoIndexImplTableUpdates) {
19491949
auto index = ReadIndex(db);
19501950
CompareYson(R"([])", NYdb::FormatResultSetYson(index));
19511951
}
1952+
1953+
Y_UNIT_TEST(Utf8) {
1954+
auto kikimr = Kikimr();
1955+
auto db = kikimr.GetQueryClient();
1956+
1957+
{ // CreateTexts
1958+
TString query = R"sql(
1959+
CREATE TABLE `/Root/Texts` (
1960+
Key Uint64,
1961+
Text Utf8,
1962+
PRIMARY KEY (Key)
1963+
);
1964+
)sql";
1965+
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
1966+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
1967+
}
1968+
{ // UpsertTexts
1969+
TString query = R"sql(
1970+
UPSERT INTO `/Root/Texts` (Key, Text) VALUES
1971+
(100, "Мышь спит"),
1972+
(200, "Собака ест")
1973+
)sql";
1974+
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
1975+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
1976+
}
1977+
AddIndex(db);
1978+
{ // UpsertRow
1979+
TString query = R"sql(
1980+
UPSERT INTO `/Root/Texts` (Key, Text) VALUES
1981+
(150, "Кошка ест мышь")
1982+
)sql";
1983+
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
1984+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
1985+
}
1986+
1987+
auto index = ReadIndex(db);
1988+
CompareYson(R"([
1989+
[[150u];"ест"];
1990+
[[200u];"ест"];
1991+
[[150u];"кошка"];
1992+
[[100u];"мышь"];
1993+
[[150u];"мышь"];
1994+
[[200u];"собака"];
1995+
[[100u];"спит"]
1996+
])", NYdb::FormatResultSetYson(index));
1997+
1998+
{
1999+
TString query = R"sql(
2000+
SELECT Key FROM `/Root/Texts/fulltext_idx/indexImplTable`
2001+
WHERE __ydb_token = "ест"
2002+
ORDER BY Key
2003+
)sql";
2004+
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
2005+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
2006+
CompareYson(R"([
2007+
[[150u]];
2008+
[[200u]]
2009+
])", NYdb::FormatResultSetYson(result.GetResultSet(0)));
2010+
}
2011+
}
19522012

19532013
}
19542014

ydb/core/kqp/ut/scheme/kqp_scheme_fulltext_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ Y_UNIT_TEST_SUITE(KqpSchemeFulltext) {
407407
)";
408408
auto result = ExecuteSchemeQuery(kikimr, query, UseQueryClient);
409409
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::BAD_REQUEST, result.GetIssues().ToString());
410-
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Error: Fulltext column 'Text' expected type 'String' but got Uint64");
410+
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Error: Fulltext column 'Text' expected type 'String' or 'Utf8' but got Uint64");
411411
}
412412
}
413413

ydb/core/tx/datashard/build_index/fulltext.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IA
9292
};
9393
{
9494
Ydb::Type type;
95-
type.set_type_id(TokenType);
95+
NScheme::ProtoFromTypeInfo(types.at(TextColumn), type);
9696
uploadTypes->emplace_back(TokenColumn, type);
9797
}
9898
for (const auto& column : table.KeyColumnIds) {

ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ Y_UNIT_TEST_SUITE(TTxDataShardBuildFulltextIndexScan) {
123123
options.Shards(1);
124124
options.AllowSystemColumnNames(true);
125125
options.Columns({
126-
{TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
126+
{TokenColumn, "String", true, true},
127127
{"key", "Uint32", true, true},
128128
{"data", "String", false, false},
129129
});
@@ -266,7 +266,7 @@ __ydb_token = yellow, key = 3, data = three
266266
options.Shards(1);
267267
options.AllowSystemColumnNames(true);
268268
options.Columns({
269-
{TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
269+
{TokenColumn, "String", true, true},
270270
{"key", "Uint32", true, true},
271271
{"text", "String", false, false},
272272
{"data", "String", false, false},
@@ -331,7 +331,7 @@ __ydb_token = yellow, key = 3, text = yellow apple, data = three
331331
options.Shards(1);
332332
options.AllowSystemColumnNames(true);
333333
options.Columns({
334-
{TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
334+
{TokenColumn, "String", true, true},
335335
{"key", "Uint32", true, true},
336336
{"text", "String", true, true},
337337
{"subkey", "Uint32", true, true},

ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa
200200
indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
201201
}
202202
const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()};
203-
auto implTableDesc = CalcFulltextImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexTableDesc);
203+
auto implTableDesc = CalcFulltextImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexTableDesc, indexDesc.GetFulltextIndexDescription());
204204
implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true);
205205
result.push_back(createImplTable(std::move(implTableDesc)));
206206
break;

ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
358358
userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0);
359359
}
360360
const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()};
361-
result.push_back(createIndexImplTable(CalcFulltextImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userIndexDesc)));
361+
result.push_back(createIndexImplTable(CalcFulltextImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userIndexDesc, indexDescription.GetFulltextIndexDescription())));
362362
break;
363363
}
364364
default:

ydb/core/tx/schemeshard/schemeshard_schema.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@ namespace NKikimr::NSchemeShard {
1717

1818
inline constexpr auto ClusterIdTypeId = NScheme::NTypeIds::Uint64;
1919

20-
// TODO: support utf-8 in fulltext index
21-
inline constexpr auto TokenTypeId = NScheme::NTypeIds::String;
22-
2320
struct Schema : NIceDb::Schema {
2421
struct Paths : Table<1> {
2522
struct Id : Column<1, NScheme::NTypeIds::Uint64> { using Type = TLocalPathId; };

Commit comments: 0