Skip to content

Commit 626e2a7

Browse files
authored
feat: add troubleshoot_kubernetes_list_top_cpu_consumed_by_workload (#43)
1 parent 026a8df commit 626e2a7

File tree

5 files changed

+204
-2
lines changed

5 files changed

+204
-2
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
6161
| `troubleshoot_kubernetes_list_count_pods_per_cluster` | `tool_troubleshoot_kubernetes_list_count_pods_per_cluster.go` | List the count of running Kubernetes Pods grouped by cluster and namespace. | `promql.exec` | "List the count of running Kubernetes Pods in cluster 'production'" |
6262
| `troubleshoot_kubernetes_list_underutilized_pods_by_cpu_quota` | `tool_troubleshoot_kubernetes_list_underutilized_pods_by_cpu_quota.go` | List Kubernetes pods with CPU usage below 25% of the quota limit. | `promql.exec` | "Show the top 10 underutilized pods by CPU quota in cluster 'production'" |
6363
| `troubleshoot_kubernetes_list_underutilized_pods_by_memory_quota` | `tool_troubleshoot_kubernetes_list_underutilized_pods_by_memory_quota.go` | List Kubernetes pods with memory usage below 25% of the limit. | `promql.exec` | "Show the top 10 underutilized pods by memory quota in cluster 'production'" |
64+
| `troubleshoot_kubernetes_list_top_cpu_consumed_by_workload` | `tool_troubleshoot_kubernetes_list_top_cpu_consumed_by_workload.go` | Identifies the Kubernetes workloads (all containers) consuming the most CPU (in cores). | `promql.exec` | "Show the top 10 workloads consuming the most CPU in cluster 'production'" |
6465

6566
Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
6667
Note that if you add more tools you need to also update this file to reflect that.

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,14 @@ The server dynamically filters the available tools based on the permissions asso
166166
- **`troubleshoot_kubernetes_list_underutilized_pods_by_memory_quota`**
167167
- **Description**: List Kubernetes pods with memory usage below 25% of the limit.
168168
- **Required Permission**: `promql.exec`
169-
- **Sample Prompt**: "Show the top 10 underutilized pods by memory quota in cluster 'production'"
169+
- **Sample Prompt**: "Show the top 10 underutilized pods by memory quota in cluster 'production'"
170170

171-
## Requirements
171+
- **`troubleshoot_kubernetes_list_top_cpu_consumed_by_workload`**
172+
- **Description**: Identifies the Kubernetes workloads (all containers) consuming the most CPU (in cores).
173+
- **Required Permission**: `promql.exec`
174+
- **Sample Prompt**: "Show the top 10 workloads consuming the most CPU in cluster 'production'"
172175

176+
## Requirements
173177
- [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).
174178

175179
## Configuration

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
106106
tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient),
107107
tools.NewTroubleshootKubernetesListCountPodsPerCluster(sysdigClient),
108108
tools.NewTroubleshootKubernetesListUnderutilizedPodsByCPUQuota(sysdigClient),
109+
tools.NewTroubleshootKubernetesListTopCPUConsumedByWorkload(sysdigClient),
109110
tools.NewTroubleshootKubernetesListUnderutilizedPodsByMemoryQuota(sysdigClient),
110111
)
111112
return handler
internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_cpu_consumed_by_workload.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package tools
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"strings"
9+
10+
"github.com/mark3labs/mcp-go/mcp"
11+
"github.com/mark3labs/mcp-go/server"
12+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
13+
)
14+
15+
type TroubleshootKubernetesListTopCPUConsumedByWorkload struct {
16+
SysdigClient sysdig.ExtendedClientWithResponsesInterface
17+
}
18+
19+
func NewTroubleshootKubernetesListTopCPUConsumedByWorkload(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopCPUConsumedByWorkload {
20+
return &TroubleshootKubernetesListTopCPUConsumedByWorkload{
21+
SysdigClient: sysdigClient,
22+
}
23+
}
24+
25+
func (t *TroubleshootKubernetesListTopCPUConsumedByWorkload) RegisterInServer(s *server.MCPServer) {
26+
tool := mcp.NewTool("troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
27+
mcp.WithDescription("Identifies the Kubernetes workloads (all containers) consuming the most CPU (in cores)."),
28+
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
29+
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
30+
mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")),
31+
mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")),
32+
mcp.WithNumber("limit",
33+
mcp.Description("Maximum number of workloads to return."),
34+
mcp.DefaultNumber(20),
35+
),
36+
mcp.WithOutputSchema[map[string]any](),
37+
WithRequiredPermissions("promql.exec"),
38+
)
39+
s.AddTool(tool, t.handle)
40+
}
41+
42+
func (t *TroubleshootKubernetesListTopCPUConsumedByWorkload) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
43+
clusterName := mcp.ParseString(request, "cluster_name", "")
44+
namespaceName := mcp.ParseString(request, "namespace_name", "")
45+
workloadType := mcp.ParseString(request, "workload_type", "")
46+
workloadName := mcp.ParseString(request, "workload_name", "")
47+
limit := mcp.ParseInt(request, "limit", 20)
48+
49+
query := buildTopCPUConsumedByWorkloadQuery(clusterName, namespaceName, workloadType, workloadName, limit)
50+
51+
params := &sysdig.GetQueryV1Params{
52+
Query: query,
53+
}
54+
55+
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
56+
if err != nil {
57+
return mcp.NewToolResultErrorFromErr("failed to get top cpu consumed by workload", err), nil
58+
}
59+
60+
if httpResp.StatusCode != 200 {
61+
bodyBytes, _ := io.ReadAll(httpResp.Body)
62+
return mcp.NewToolResultErrorf("failed to get top cpu consumed by workload: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
63+
}
64+
65+
var queryResponse sysdig.QueryResponseV1
66+
if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
67+
return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
68+
}
69+
70+
return mcp.NewToolResultJSON(queryResponse)
71+
}
72+
73+
// buildTopCPUConsumedByWorkloadQuery returns the PromQL query that selects the
// top `limit` workloads by summed sysdig_container_cpu_cores_used, grouped by
// cluster, namespace, workload type and workload name.
//
// Each non-empty argument becomes an exact-match label matcher; empty
// arguments are skipped, and when all are empty no selector braces are emitted.
// Values are quoted with %q so that quotes, backslashes and control characters
// are escaped instead of terminating the string literal, which would otherwise
// corrupt the query or let a caller inject extra matchers.
func buildTopCPUConsumedByWorkloadQuery(clusterName, namespaceName, workloadType, workloadName string, limit int) string {
	// Order matters: matchers are emitted in this order.
	matchers := [...]struct{ label, value string }{
		{"kube_cluster_name", clusterName},
		{"kube_namespace_name", namespaceName},
		{"kube_workload_type", workloadType},
		{"kube_workload_name", workloadName},
	}

	filters := make([]string, 0, len(matchers))
	for _, m := range matchers {
		if m.value == "" {
			continue
		}
		filters = append(filters, fmt.Sprintf("%s=%q", m.label, m.value))
	}

	selector := ""
	if len(filters) > 0 {
		selector = "{" + strings.Join(filters, ",") + "}"
	}

	return fmt.Sprintf("topk(%d, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name)(sysdig_container_cpu_cores_used%s))", limit, selector)
}
internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_cpu_consumed_by_workload_test.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package tools_test
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"io"
7+
"net/http"
8+
9+
"github.com/mark3labs/mcp-go/mcp"
10+
"github.com/mark3labs/mcp-go/server"
11+
. "github.com/onsi/ginkgo/v2"
12+
. "github.com/onsi/gomega"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
14+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
15+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16+
"go.uber.org/mock/gomock"
17+
)
18+
19+
var _ = Describe("TroubleshootKubernetesListTopCPUConsumedByWorkload Tool", func() {
20+
var (
21+
tool *tools.TroubleshootKubernetesListTopCPUConsumedByWorkload
22+
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
23+
mcpServer *server.MCPServer
24+
ctrl *gomock.Controller
25+
)
26+
27+
BeforeEach(func() {
28+
ctrl = gomock.NewController(GinkgoT())
29+
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
30+
tool = tools.NewTroubleshootKubernetesListTopCPUConsumedByWorkload(mockSysdig)
31+
mcpServer = server.NewMCPServer("test", "test")
32+
tool.RegisterInServer(mcpServer)
33+
})
34+
35+
It("should register successfully in the server", func() {
36+
Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_cpu_consumed_by_workload")).NotTo(BeNil())
37+
})
38+
39+
When("listing top cpu consumed by workload", func() {
40+
DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
41+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
42+
StatusCode: http.StatusOK,
43+
Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
44+
}, nil)
45+
46+
serverTool := mcpServer.GetTool(toolName)
47+
result, err := serverTool.Handler(ctx, request)
48+
Expect(err).NotTo(HaveOccurred())
49+
50+
resultData, ok := result.Content[0].(mcp.TextContent)
51+
Expect(ok).To(BeTrue())
52+
Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
53+
},
54+
Entry(nil,
55+
"troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
56+
mcp.CallToolRequest{
57+
Params: mcp.CallToolParams{
58+
Name: "troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
59+
Arguments: map[string]any{},
60+
},
61+
},
62+
sysdig.GetQueryV1Params{
63+
Query: `topk(20, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name)(sysdig_container_cpu_cores_used))`,
64+
},
65+
),
66+
Entry(nil,
67+
"troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
68+
mcp.CallToolRequest{
69+
Params: mcp.CallToolParams{
70+
Name: "troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
71+
Arguments: map[string]any{
72+
"cluster_name": "prod",
73+
"namespace_name": "default",
74+
"limit": 10,
75+
},
76+
},
77+
},
78+
sysdig.GetQueryV1Params{
79+
Query: `topk(10, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name)(sysdig_container_cpu_cores_used{kube_cluster_name="prod",kube_namespace_name="default"}))`,
80+
},
81+
),
82+
Entry(nil,
83+
"troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
84+
mcp.CallToolRequest{
85+
Params: mcp.CallToolParams{
86+
Name: "troubleshoot_kubernetes_list_top_cpu_consumed_by_workload",
87+
Arguments: map[string]any{
88+
"cluster_name": "prod",
89+
"namespace_name": "default",
90+
"workload_name": "api",
91+
"workload_type": "deployment",
92+
"limit": 5,
93+
},
94+
},
95+
},
96+
sysdig.GetQueryV1Params{
97+
Query: `topk(5, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name)(sysdig_container_cpu_cores_used{kube_cluster_name="prod",kube_namespace_name="default",kube_workload_type="deployment",kube_workload_name="api"}))`,
98+
},
99+
),
100+
)
101+
})
102+
})

0 commit comments

Comments
 (0)