Skip to content

Commit 5efa2ff

Browse files
authored
feat: handle k8s metrics semantic convention updates (#1248)
Handle OpenTelemetry semantic convention versions for metrics based on the ScopeVersion field. Related to these [changes](https://opentelemetry.io/blog/2025/kubeletstats-receiver-metrics-deprecation/). Old (switched to v0.137.0): <img width="818" height="317" alt="image" src="https://github.com/user-attachments/assets/ceea52c6-ad06-4295-afae-a44f21b2e962" /> New (able to handle multiple versions): <img width="568" height="329" alt="image" src="https://github.com/user-attachments/assets/d2e282b2-cfd7-490a-a64d-502881a360a2" /> Ref: HDX-2322, HDX-2562
1 parent c90a93e commit 5efa2ff

File tree

10 files changed

+841
-4
lines changed

10 files changed

+841
-4
lines changed

.changeset/heavy-dryers-occur.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"@hyperdx/common-utils": patch
3+
"@hyperdx/app": patch
4+
---
5+
6+
feat: handle k8s metrics semantic convention updates

packages/api/src/clickhouse/__tests__/__snapshots__/renderChartConfig.test.ts.snap

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,86 @@
11
// Jest Snapshot v1, https://goo.gl/fbAQLP
22

3+
exports[`renderChartConfig K8s Semantic Convention Migrations with metricNameSql should handle gauge metric with metricNameSql and groupBy 1`] = `
4+
Array [
5+
Object {
6+
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
7+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
8+
"avg(toFloat64OrDefault(toString(LastValue)))": 45,
9+
},
10+
Object {
11+
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
12+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
13+
"avg(toFloat64OrDefault(toString(LastValue)))": 50,
14+
},
15+
Object {
16+
"__hdx_time_bucket": "2022-01-05T00:02:00Z",
17+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
18+
"avg(toFloat64OrDefault(toString(LastValue)))": 55,
19+
},
20+
Object {
21+
"__hdx_time_bucket": "2022-01-05T00:03:00Z",
22+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
23+
"avg(toFloat64OrDefault(toString(LastValue)))": 60,
24+
},
25+
Object {
26+
"__hdx_time_bucket": "2022-01-05T00:04:00Z",
27+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
28+
"avg(toFloat64OrDefault(toString(LastValue)))": 65,
29+
},
30+
Object {
31+
"__hdx_time_bucket": "2022-01-05T00:05:00Z",
32+
"arrayElement(ResourceAttributes, 'k8s.pod.name')": "test-pod",
33+
"avg(toFloat64OrDefault(toString(LastValue)))": 70,
34+
},
35+
]
36+
`;
37+
38+
exports[`renderChartConfig K8s Semantic Convention Migrations with metricNameSql should handle metrics without metricNameSql (backward compatibility) 1`] = `
39+
Array [
40+
Object {
41+
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
42+
"avg(toFloat64OrDefault(toString(LastValue)))": 45,
43+
},
44+
Object {
45+
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
46+
"avg(toFloat64OrDefault(toString(LastValue)))": 50,
47+
},
48+
Object {
49+
"__hdx_time_bucket": "2022-01-05T00:02:00Z",
50+
"avg(toFloat64OrDefault(toString(LastValue)))": 55,
51+
},
52+
]
53+
`;
54+
55+
exports[`renderChartConfig K8s Semantic Convention Migrations with metricNameSql should query k8s.pod.cpu.utilization gauge metric using metricNameSql to handle both old and new conventions 1`] = `
56+
Array [
57+
Object {
58+
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
59+
"avg(toFloat64OrDefault(toString(LastValue)))": 45,
60+
},
61+
Object {
62+
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
63+
"avg(toFloat64OrDefault(toString(LastValue)))": 50,
64+
},
65+
Object {
66+
"__hdx_time_bucket": "2022-01-05T00:02:00Z",
67+
"avg(toFloat64OrDefault(toString(LastValue)))": 55,
68+
},
69+
Object {
70+
"__hdx_time_bucket": "2022-01-05T00:03:00Z",
71+
"avg(toFloat64OrDefault(toString(LastValue)))": 60,
72+
},
73+
Object {
74+
"__hdx_time_bucket": "2022-01-05T00:04:00Z",
75+
"avg(toFloat64OrDefault(toString(LastValue)))": 65,
76+
},
77+
Object {
78+
"__hdx_time_bucket": "2022-01-05T00:05:00Z",
79+
"avg(toFloat64OrDefault(toString(LastValue)))": 70,
80+
},
81+
]
82+
`;
83+
384
exports[`renderChartConfig Query Events - Logs simple select + group by query logs 1`] = `
485
Array [
586
Object {

packages/api/src/clickhouse/__tests__/renderChartConfig.test.ts

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,4 +1380,145 @@ describe('renderChartConfig', () => {
13801380
expect(res).toMatchSnapshot();
13811381
});
13821382
});
1383+
1384+
describe('K8s Semantic Convention Migrations with metricNameSql', () => {
  // SQL expression that picks the metric name from the row's ScopeVersion:
  // the renamed metric at/after 0.125.0, the legacy name otherwise.
  const versionAwareMetricNameSql =
    "if(greaterOrEquals(ScopeVersion, '0.125.0'), 'k8s.pod.cpu.usage', 'k8s.pod.cpu.utilization')";

  /**
   * Builds one gauge row for the shared test pod.
   * Every row carries the same service name and resource attributes; only
   * the metric name, value, timestamp and scope version vary.
   */
  const toGaugeRow = (
    metricName: string,
    value: number,
    timestamp: number,
    scopeVersion: string,
  ) => ({
    MetricName: metricName,
    ServiceName: 'k8s-monitor',
    ResourceAttributes: {
      'k8s.pod.name': 'test-pod',
      'k8s.namespace.name': 'default',
    },
    Value: value,
    TimeUnix: new Date(timestamp),
    ScopeVersion: scopeVersion,
  });

  beforeEach(async () => {
    // Minutes 0-2: old semantic convention (ScopeVersion < 0.125.0),
    // reported under the legacy metric name.
    const legacyRows = [
      toGaugeRow('k8s.pod.cpu.utilization', 45, now, '0.124.0'),
      toGaugeRow('k8s.pod.cpu.utilization', 50, now + ms('1m'), '0.124.0'),
      toGaugeRow('k8s.pod.cpu.utilization', 55, now + ms('2m'), '0.124.0'),
    ];

    // Minutes 3-5: new semantic convention (ScopeVersion >= 0.125.0),
    // reported under the renamed metric.
    const renamedRows = [
      toGaugeRow('k8s.pod.cpu.usage', 60, now + ms('3m'), '0.125.0'),
      toGaugeRow('k8s.pod.cpu.usage', 65, now + ms('4m'), '0.125.0'),
      toGaugeRow('k8s.pod.cpu.usage', 70, now + ms('5m'), '0.126.0'),
    ];

    await bulkInsertMetricsGauge(legacyRows.concat(renamedRows));
  });

  it('should query k8s.pod.cpu.utilization gauge metric using metricNameSql to handle both old and new conventions', async () => {
    const chartQuery = await renderChartConfig(
      {
        select: [
          {
            aggFn: 'avg',
            metricName: 'k8s.pod.cpu.utilization',
            metricNameSql: versionAwareMetricNameSql,
            metricType: MetricsDataType.Gauge,
            valueExpression: 'Value',
          },
        ],
        from: metricSource.from,
        where: '',
        metricTables: TEST_METRIC_TABLES,
        dateRange: [new Date(now), new Date(now + ms('10m'))],
        granularity: '1 minute',
        timestampValueExpression: metricSource.timestampValueExpression,
        connection: connection.id,
      },
      metadata,
    );

    // Rows from both the legacy and the renamed metric are expected here.
    const rows = await queryData(chartQuery);
    expect(rows.length).toBeGreaterThan(0);
    expect(rows).toMatchSnapshot();

    // The generated SQL must embed the version-dependent name expression.
    for (const fragment of [
      'if(greaterOrEquals(ScopeVersion',
      'k8s.pod.cpu.usage',
      'k8s.pod.cpu.utilization',
    ]) {
      expect(chartQuery.sql).toContain(fragment);
    }
  });

  it('should handle gauge metric with metricNameSql and groupBy', async () => {
    const chartQuery = await renderChartConfig(
      {
        select: [
          {
            aggFn: 'avg',
            metricName: 'k8s.pod.cpu.utilization',
            metricNameSql: versionAwareMetricNameSql,
            metricType: MetricsDataType.Gauge,
            valueExpression: 'Value',
          },
        ],
        from: metricSource.from,
        where: '',
        metricTables: TEST_METRIC_TABLES,
        dateRange: [new Date(now), new Date(now + ms('10m'))],
        granularity: '1 minute',
        groupBy: `ResourceAttributes['k8s.pod.name']`,
        timestampValueExpression: metricSource.timestampValueExpression,
        connection: connection.id,
      },
      metadata,
    );

    const rows = await queryData(chartQuery);
    expect(rows.length).toBeGreaterThan(0);
    expect(rows).toMatchSnapshot();
  });

  it('should handle metrics without metricNameSql (backward compatibility)', async () => {
    // The select deliberately omits metricNameSql, so only the legacy
    // metric name is queried.
    const chartQuery = await renderChartConfig(
      {
        select: [
          {
            aggFn: 'avg',
            metricName: 'k8s.pod.cpu.utilization',
            metricType: MetricsDataType.Gauge,
            valueExpression: 'Value',
          },
        ],
        from: metricSource.from,
        where: '',
        metricTables: TEST_METRIC_TABLES,
        dateRange: [new Date(now), new Date(now + ms('10m'))],
        granularity: '1 minute',
        timestampValueExpression: metricSource.timestampValueExpression,
        connection: connection.id,
      },
      metadata,
    );

    // Only rows stored under k8s.pod.cpu.utilization should come back.
    const rows = await queryData(chartQuery);
    expect(rows).toMatchSnapshot();

    // Without metricNameSql the filter is a plain equality on MetricName.
    expect(chartQuery.sql).toContain("MetricName = 'k8s.pod.cpu.utilization'");
    expect(chartQuery.sql).not.toContain('if(greaterOrEquals(ScopeVersion');
  });
});
13831524
});

packages/app/src/ChartUtils.tsx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import {
2020
} from '@hyperdx/common-utils/dist/types';
2121
import { SegmentedControl, Select as MSelect } from '@mantine/core';
2222

23+
import { getMetricNameSql } from './otelSemanticConventions';
2324
import {
2425
AggFn,
2526
ChartSeries,
@@ -707,6 +708,10 @@ export const convertV1ChartConfigToV2 = (
707708
const [metricName, rawMetricDataType] = field
708709
.split(' - ')
709710
.map(s => s.trim());
711+
712+
// Check if this metric name needs version-based SQL transformation
713+
const metricNameSql = getMetricNameSql(metricName);
714+
710715
const metricDataType = z
711716
.nativeEnum(MetricsDataTypeV2)
712717
.parse(rawMetricDataType?.toLowerCase());
@@ -715,6 +720,7 @@ export const convertV1ChartConfigToV2 = (
715720
metricType: metricDataType,
716721
valueExpression: field,
717722
metricName,
723+
metricNameSql,
718724
aggConditionLanguage: 'lucene',
719725
aggCondition: s.where,
720726
};
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import { getMetricNameSql } from '../otelSemanticConventions';
2+
3+
describe('otelSemanticConventions', () => {
  describe('getMetricNameSql', () => {
    // Legacy metric name -> exact SQL expression expected for it.
    it.each([
      [
        'k8s.pod.cpu.utilization',
        "if(greaterOrEquals(ScopeVersion, '0.125.0'), 'k8s.pod.cpu.usage', 'k8s.pod.cpu.utilization')",
      ],
      [
        'k8s.node.cpu.utilization',
        "if(greaterOrEquals(ScopeVersion, '0.125.0'), 'k8s.node.cpu.usage', 'k8s.node.cpu.utilization')",
      ],
      [
        'container.cpu.utilization',
        "if(greaterOrEquals(ScopeVersion, '0.125.0'), 'container.cpu.usage', 'container.cpu.utilization')",
      ],
    ])('should return SQL for %s migration', (legacyName, expectedSql) => {
      expect(getMetricNameSql(legacyName)).toBe(expectedSql);
    });

    // Inputs with no migration entry. The last row is a *new* metric name:
    // querying by the new name directly must not be transformed.
    it.each([
      ['non-migrated metrics', 'some.other.metric'],
      ['empty string', ''],
      ['new metric names', 'k8s.pod.cpu.usage'],
    ])('should return undefined for %s', (_label, metricName) => {
      expect(getMetricNameSql(metricName)).toBeUndefined();
    });
  });
});
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/**
2+
* OpenTelemetry Semantic Conventions utilities
3+
* Handles transformations between different versions of OTel semantic conventions
4+
*/
5+
6+
/**
 * Metric renames introduced by newer OTel semantic conventions, keyed by the
 * old (pre-rename) metric name.
 *
 * Each entry records the old name, the name that replaced it, and the
 * ScopeVersion threshold used to choose between the two.
 */
const METRIC_NAME_MIGRATIONS: Record<
  string,
  {
    oldName: string;
    newName: string;
    versionThreshold: string;
  }
> = {
  'k8s.pod.cpu.utilization': {
    oldName: 'k8s.pod.cpu.utilization',
    newName: 'k8s.pod.cpu.usage',
    versionThreshold: '0.125.0',
  },
  'k8s.node.cpu.utilization': {
    oldName: 'k8s.node.cpu.utilization',
    newName: 'k8s.node.cpu.usage',
    versionThreshold: '0.125.0',
  },
  'container.cpu.utilization': {
    oldName: 'container.cpu.utilization',
    newName: 'container.cpu.usage',
    versionThreshold: '0.125.0',
  },
};

/**
 * Generates a SQL expression that selects the metric name based on ScopeVersion.
 *
 * For a metric with a registered migration the result has the form
 * `if(greaterOrEquals(ScopeVersion, '<threshold>'), '<newName>', '<oldName>')`.
 *
 * NOTE(review): the threshold is emitted as a quoted string literal. If
 * ScopeVersion is a string column, `greaterOrEquals` presumably compares
 * lexicographically, in which case a version such as '0.99.0' would sort at or
 * above '0.125.0'. Confirm against the schema and consider a numeric,
 * component-wise version comparison.
 *
 * @param metricName - The metric name to check for migrations
 * @returns SQL expression if a migration exists for exactly this name,
 *   undefined otherwise (including for the new names themselves)
 */
export function getMetricNameSql(metricName: string): string | undefined {
  // Only own keys count as migrations. A bare `table[metricName]` lookup also
  // resolves inherited Object.prototype members ('constructor', 'toString',
  // '__proto__', ...), which are truthy and would yield SQL full of
  // 'undefined' literals.
  const migration = Object.prototype.hasOwnProperty.call(
    METRIC_NAME_MIGRATIONS,
    metricName,
  )
    ? METRIC_NAME_MIGRATIONS[metricName]
    : undefined;

  if (!migration) {
    return undefined;
  }

  return `if(greaterOrEquals(ScopeVersion, '${migration.versionThreshold}'), '${migration.newName}', '${migration.oldName}')`;
}

0 commit comments

Comments
 (0)