
Commit 6aa4040

🌱 Don't assume there is only one replica in e2e tests (#2379)
* Don't assume there is only one replica in e2e tests

  For the upgrade e2e tests, don't assume there is only one replica. Get the
  number of replicas from the deployment and wait for the deployment to have
  that many available. Use the lease to determine the leader pod and reference
  that.

  Note that the name formats of the leases for operator-controller and
  catalogd are quite different; this doesn't change that, as it may have an
  impact on the upgrade test itself.

  Signed-off-by: Todd Short <tshort@redhat.com>
  Assisted-by: Claude Code

* Update metrics test to scrape all pods

  Signed-off-by: Todd Short <tshort@redhat.com>

---------

Signed-off-by: Todd Short <tshort@redhat.com>
1 parent c1197be commit 6aa4040

File tree

2 files changed: +124 −30 lines


test/e2e/metrics_test.go

Lines changed: 70 additions & 26 deletions
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"io"
 	"os/exec"
+	"strings"
 	"testing"
 	"time"
 
@@ -33,16 +34,17 @@ func TestOperatorControllerMetricsExportedEndpoint(t *testing.T) {
 	client := testutil.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=operator-controller-controller-manager")
-	metricsURL := fmt.Sprintf("https://operator-controller-service.%s.svc.cluster.local:8443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client,
 		curlNamespace,
+		componentNamespace,
 		"operator-controller-metrics-reader",
 		"operator-controller-metrics-binding",
 		"operator-controller-metrics-reader",
 		"oper-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=operator-controller",
+		operatorControllerMetricsPort,
 	)
 
 	config.run(t)
@@ -53,42 +55,47 @@ func TestCatalogdMetricsExportedEndpoint(t *testing.T) {
 	client := testutil.FindK8sClient(t)
 	curlNamespace := createRandomNamespace(t, client)
 	componentNamespace := getComponentNamespace(t, client, "control-plane=catalogd-controller-manager")
-	metricsURL := fmt.Sprintf("https://catalogd-service.%s.svc.cluster.local:7443/metrics", componentNamespace)
 
 	config := NewMetricsTestConfig(
 		client,
 		curlNamespace,
+		componentNamespace,
 		"catalogd-metrics-reader",
 		"catalogd-metrics-binding",
 		"catalogd-metrics-reader",
 		"catalogd-curl-metrics",
-		metricsURL,
+		"app.kubernetes.io/name=catalogd",
+		catalogdMetricsPort,
 	)
 
 	config.run(t)
 }
 
 // MetricsTestConfig holds the necessary configurations for testing metrics endpoints.
 type MetricsTestConfig struct {
-	client         string
-	namespace      string
-	clusterRole    string
-	clusterBinding string
-	serviceAccount string
-	curlPodName    string
-	metricsURL     string
+	client             string
+	namespace          string
+	componentNamespace string
+	clusterRole        string
+	clusterBinding     string
+	serviceAccount     string
+	curlPodName        string
+	componentSelector  string
+	metricsPort        int
 }
 
 // NewMetricsTestConfig initializes a new MetricsTestConfig.
-func NewMetricsTestConfig(client, namespace, clusterRole, clusterBinding, serviceAccount, curlPodName, metricsURL string) *MetricsTestConfig {
+func NewMetricsTestConfig(client, namespace, componentNamespace, clusterRole, clusterBinding, serviceAccount, curlPodName, componentSelector string, metricsPort int) *MetricsTestConfig {
 	return &MetricsTestConfig{
-		client:         client,
-		namespace:      namespace,
-		clusterRole:    clusterRole,
-		clusterBinding: clusterBinding,
-		serviceAccount: serviceAccount,
-		curlPodName:    curlPodName,
-		metricsURL:     metricsURL,
+		client:             client,
+		namespace:          namespace,
+		componentNamespace: componentNamespace,
+		clusterRole:        clusterRole,
+		clusterBinding:     clusterBinding,
+		serviceAccount:     serviceAccount,
+		curlPodName:        curlPodName,
+		componentSelector:  componentSelector,
+		metricsPort:        metricsPort,
 	}
 }
 
@@ -154,19 +161,33 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) {
 	require.NoError(t, err, "Error creating curl pod: %s", string(output))
 }
 
-// validate verifies if is possible to access the metrics
+// validate verifies if is possible to access the metrics from all pods
 func (c *MetricsTestConfig) validate(t *testing.T, token string) {
 	t.Log("Waiting for the curl pod to be ready")
 	waitCmd := exec.Command(c.client, "wait", "--for=condition=Ready", "pod", c.curlPodName, "--namespace", c.namespace, "--timeout=60s")
 	waitOutput, waitErr := waitCmd.CombinedOutput()
 	require.NoError(t, waitErr, "Error waiting for curl pod to be ready: %s", string(waitOutput))
 
-	t.Log("Validating the metrics endpoint")
-	curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
-		"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, c.metricsURL)
-	output, err := curlCmd.CombinedOutput()
-	require.NoError(t, err, "Error calling metrics endpoint: %s", string(output))
-	require.Contains(t, string(output), "200 OK", "Metrics endpoint did not return 200 OK")
+	// Get all pod IPs for the component
+	podIPs := c.getComponentPodIPs(t)
+	require.NotEmpty(t, podIPs, "No pod IPs found for component")
+	t.Logf("Found %d pod(s) to scrape metrics from", len(podIPs))
+
+	// Validate metrics endpoint for each pod
+	for i, podIP := range podIPs {
+		// Build metrics URL with pod FQDN: <pod-ip-with-dashes>.<namespace>.pod.cluster.local
+		// Convert IP dots to dashes (e.g., 10.244.0.11 -> 10-244-0-11)
+		podIPDashes := strings.ReplaceAll(podIP, ".", "-")
+		metricsURL := fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics", podIPDashes, c.componentNamespace, c.metricsPort)
+		t.Logf("Validating metrics endpoint for pod %d/%d: %s", i+1, len(podIPs), metricsURL)
+
+		curlCmd := exec.Command(c.client, "exec", c.curlPodName, "--namespace", c.namespace, "--",
+			"curl", "-v", "-k", "-H", "Authorization: Bearer "+token, metricsURL)
+		output, err := curlCmd.CombinedOutput()
+		require.NoError(t, err, "Error calling metrics endpoint %s: %s", metricsURL, string(output))
+		require.Contains(t, string(output), "200 OK", "Metrics endpoint %s did not return 200 OK", metricsURL)
+		t.Logf("Successfully scraped metrics from pod %d/%d", i+1, len(podIPs))
+	}
 }
 
 // cleanup removes the created resources. Uses a context with timeout to prevent hangs.
@@ -243,6 +264,29 @@ func getComponentNamespace(t *testing.T, client, selector string) string {
 	return namespace
 }
 
+// getComponentPodIPs returns the IP addresses of all pods matching the component selector
+func (c *MetricsTestConfig) getComponentPodIPs(t *testing.T) []string {
+	cmd := exec.Command(c.client, "get", "pods",
+		"--namespace="+c.componentNamespace,
+		"--selector="+c.componentSelector,
+		"--output=jsonpath={.items[*].status.podIP}")
+	output, err := cmd.CombinedOutput()
+	require.NoError(t, err, "Error getting pod IPs: %s", string(output))
+
+	podIPsStr := string(bytes.TrimSpace(output))
+	if podIPsStr == "" {
+		return []string{}
+	}
+
+	// Split space-separated IPs
+	fields := bytes.Fields([]byte(podIPsStr))
+	ips := make([]string, len(fields))
+	for i, field := range fields {
+		ips[i] = string(field)
+	}
+	return ips
+}
+
 func stdoutAndCombined(cmd *exec.Cmd) ([]byte, []byte, error) {
 	var outOnly, outAndErr bytes.Buffer
 	allWriter := io.MultiWriter(&outOnly, &outAndErr)
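
The per-pod URLs above lean on the standard Kubernetes pod DNS convention: a pod with IP 10.244.0.11 in namespace ns resolves as 10-244-0-11.ns.pod.cluster.local. A minimal, self-contained sketch of the URL construction used in the validate loop (the helper name podMetricsURL and the example namespace olmv1-system are ours, for illustration only):

package main

import (
	"fmt"
	"strings"
)

// podMetricsURL mirrors the URL construction in validate above:
// Kubernetes resolves <ip-with-dashes>.<namespace>.pod.cluster.local
// to the pod's IP, so each replica can be scraped individually.
func podMetricsURL(podIP, namespace string, port int) string {
	return fmt.Sprintf("https://%s.%s.pod.cluster.local:%d/metrics",
		strings.ReplaceAll(podIP, ".", "-"), namespace, port)
}

func main() {
	// Example values only; the test discovers the namespace and IPs at runtime.
	fmt.Println(podMetricsURL("10.244.0.11", "olmv1-system", 8443))
	// https://10-244-0-11.olmv1-system.pod.cluster.local:8443/metrics
}

Scraping by pod DNS name rather than through the Service matters here: the Service would load-balance each request to an arbitrary replica, so with multiple replicas some pods could go unverified.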

test/upgrade-e2e/post_upgrade_test.go

Lines changed: 54 additions & 4 deletions
@@ -11,6 +11,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
+	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
 	apimeta "k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -195,13 +196,14 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
 
 // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
 // has reached the desired number of replicas and that the number pods matches that number
-// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
+// i.e. no old pods remain. It will return a pointer to the leader pod. This is only necessary
 // to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
 func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
 	deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()
 
-	t.Log("Checking that the deployment is updated")
+	t.Log("Checking that the deployment is updated and available")
 	var desiredNumReplicas int32
+	var deploymentNamespace string
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		var managerDeployments appsv1.DeploymentList
 		require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
@@ -214,16 +216,64 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
 			managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas &&
 			managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas,
 		)
+
+		// Check that the deployment has the Available condition set to True
+		var availableCond *appsv1.DeploymentCondition
+		for i := range managerDeployment.Status.Conditions {
+			if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable {
+				availableCond = &managerDeployment.Status.Conditions[i]
+				break
+			}
+		}
+		require.NotNil(ct, availableCond, "Available condition not found")
+		require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True")
+
 		desiredNumReplicas = *managerDeployment.Spec.Replicas
+		deploymentNamespace = managerDeployment.Namespace
 	}, time.Minute, time.Second)
 
 	var managerPods corev1.PodList
 	t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
-		require.Len(ct, managerPods.Items, 1)
+		require.Len(ct, managerPods.Items, int(desiredNumReplicas))
 	}, time.Minute, time.Second)
-	return &managerPods.Items[0]
+
+	// Find the leader pod by checking the lease
+	t.Log("Finding the leader pod")
+	// Map component labels to their leader election lease names
+	leaseNames := map[string]string{
+		"catalogd":            "catalogd-operator-lock",
+		"operator-controller": "9c4404e7.operatorframework.io",
+	}
+
+	leaseName, ok := leaseNames[controlPlaneLabel]
+	if !ok {
+		t.Fatalf("Unknown control plane component: %s", controlPlaneLabel)
+	}
+
+	var leaderPod *corev1.Pod
+	require.EventuallyWithT(t, func(ct *assert.CollectT) {
+		var lease coordinationv1.Lease
+		require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: leaseName, Namespace: deploymentNamespace}, &lease))
+		require.NotNil(ct, lease.Spec.HolderIdentity)
+
+		leaderIdentity := *lease.Spec.HolderIdentity
+		// The lease holder identity format is: <pod-name>_<leader-election-id-suffix>
+		// Extract just the pod name by splitting on '_'
+		podName := strings.Split(leaderIdentity, "_")[0]
+
+		// Find the pod with matching name
+		for i := range managerPods.Items {
+			if managerPods.Items[i].Name == podName {
+				leaderPod = &managerPods.Items[i]
+				break
+			}
+		}
+		require.NotNil(ct, leaderPod, "leader pod not found with identity: %s (pod name: %s)", leaderIdentity, podName)
+	}, time.Minute, time.Second)
+
+	return leaderPod
 }
 
 func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {
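
The leader lookup above relies on the controller-runtime leader-election convention that the lease's holder identity has the form <pod-name>_<random-suffix>. A standalone sketch of that extraction (holderToPodName is a hypothetical helper name and the holder string is a made-up example; the test inlines the same strings.Split call):

package main

import (
	"fmt"
	"strings"
)

// holderToPodName extracts the pod name from a leader-election holder
// identity of the form <pod-name>_<suffix>, as written by controller-runtime.
func holderToPodName(holderIdentity string) string {
	return strings.Split(holderIdentity, "_")[0]
}

func main() {
	// Example identity only; real values come from lease.Spec.HolderIdentity.
	holder := "operator-controller-controller-manager-7f9c6d5b4-x2k9p_2f4c1a7b"
	fmt.Println(holderToPodName(holder))
	// operator-controller-controller-manager-7f9c6d5b4-x2k9p
}

Polling the lease inside require.EventuallyWithT also covers the window right after the upgrade when leadership may still be changing hands between replicas.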
