Skip to content

Commit 17a77ae

Browse files
authored
do not report RED storage issue when schemeshard is down (#28080)
1 parent 0a4065a commit 17a77ae

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
841841
TabletRequests.TabletStates[RootSchemeShardId].Database = DomainPath;
842842
TabletRequests.TabletStates[RootSchemeShardId].Type = TTabletTypes::SchemeShard;
843843
DescribeByPath[DomainPath] = RequestDescribe(RootSchemeShardId, DomainPath);
844+
DatabaseState[DomainPath].SchemeShardId = RootSchemeShardId;
844845
}
845846

846847
if (BsControllerId) {
@@ -3412,7 +3413,11 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
34123413

34133414
void FillStorage(TDatabaseState& databaseState, Ydb::Monitoring::StorageStatus& storageStatus, TSelfCheckContext context) {
34143415
if (HaveAllBSControllerInfo() && databaseState.StoragePools.empty()) {
3415-
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There are no storage pools", ETags::StorageState);
3416+
if (TabletRequests.TabletStates[databaseState.SchemeShardId].IsUnresponsive) {
3417+
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREY, "Could not get data on storage", ETags::StorageState);
3418+
} else {
3419+
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There are no storage pools", ETags::StorageState);
3420+
}
34163421
} else {
34173422
if (HaveAllBSControllerInfo()) {
34183423
for (const ui64 poolId : databaseState.StoragePools) {

ydb/core/health_check/health_check_ut.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
704704
Filters.emplace_back([=](auto&& location) { return location.storage().pool().group().pile().name() == name || location.compute().pile().name() == name; });
705705
return *this;
706706
}
707+
708+
/// Adds a filter that matches issues whose compute tablet type equals `type`
/// (e.g. "SchemeShard"); returns *this to allow chained filter construction.
TLocationFilter& TabletType(const TString& type) {
    Filters.emplace_back([type](auto&& location) {
        return location.compute().tablet().type() == type;
    });
    return *this;
}
707712
};
708713

709714
void CheckHcResultHasIssuesWithStatus(const Ydb::Monitoring::SelfCheckResult& result, const TString& type,
@@ -2287,6 +2292,51 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
22872292
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1, TLocationFilter().Pool("static").Pile("pile0"));
22882293
}
22892294

2295+
// Verifies health-check behavior when the SchemeShard never answers describe
// requests: the SchemeShard system tablet is reported RED, but the storage
// section must be GREY ("could not get data") rather than a false RED outage.
Y_UNIT_TEST(TestNoSchemeShardResponse) {
    TPortManager portManager;
    const ui16 msgBusPort = portManager.GetPort(2134);
    const ui16 grpcPort = portManager.GetPort(2135);

    auto serverSettings = TServerSettings(msgBusPort)
        .SetNodeCount(1)
        .SetDynamicNodeCount(1)
        .SetUseRealThreads(false)
        .SetDomainName("Root");
    TServer server(serverSettings);
    server.EnableGRpc(grpcPort);
    TClient client(serverSettings);
    TTestActorRuntime& runtime = *server.GetRuntime();

    // Partition node ids into static/dynamic ranges so the health check can
    // classify nodes correctly.
    auto& nameserviceConfig = runtime.GetAppData().DynamicNameserviceConfig;
    nameserviceConfig->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
    nameserviceConfig->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
    nameserviceConfig->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);

    // Simulate an unresponsive SchemeShard by swallowing all describe requests.
    TBlockEvents<TEvSchemeShard::TEvDescribeScheme> describeBlocker(runtime);

    const TActorId edgeActor = runtime.AllocateEdgeActor();
    TAutoPtr<IEventHandle> handle;

    auto* selfCheckRequest = new NHealthCheck::TEvSelfCheckRequest;
    selfCheckRequest->Request.set_return_verbose_status(true);
    selfCheckRequest->Database = "/Root";
    runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), edgeActor, selfCheckRequest, 0));
    const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;

    Ctest << result.ShortDebugString() << Endl;

    UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::EMERGENCY);
    // The unresponsive SchemeShard itself must surface as a RED system tablet.
    CheckHcResultHasIssuesWithStatus(result, "SYSTEM_TABLET", Ydb::Monitoring::StatusFlag::RED, 1,
                                     TLocationFilter().TabletType("SchemeShard"));

    UNIT_ASSERT_VALUES_EQUAL(result.database_status_size(), 1);
    const auto& databaseStatus = result.database_status(0);

    UNIT_ASSERT_VALUES_EQUAL(databaseStatus.name(), "/Root");
    UNIT_ASSERT_VALUES_EQUAL(databaseStatus.overall(), Ydb::Monitoring::StatusFlag::RED);

    UNIT_ASSERT_VALUES_EQUAL(databaseStatus.compute().overall(), Ydb::Monitoring::StatusFlag::RED);
    // Storage is GREY (no data), not RED: the pools are unknown, not broken.
    UNIT_ASSERT_VALUES_EQUAL(databaseStatus.storage().overall(), Ydb::Monitoring::StatusFlag::GREY);
}
2339+
22902340
Y_UNIT_TEST(ShardsLimit999) {
22912341
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
22922342
}

0 commit comments

Comments
 (0)