Skip to content

Commit 6815127

Browse files
committed
Add message to event and update status condition when job fails
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
1 parent 9cdb0e0 commit 6815127

File tree

6 files changed

+106
-74
lines changed

6 files changed

+106
-74
lines changed

e2e/framework/controller/deployment.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ func NewDeployment(name string, opt ...DeploymentOption) *appsv1.Deployment {
5151
}, {
5252
Name: "SYSTEM_UPGRADE_CONTROLLER_LEADER_ELECT",
5353
Value: "true",
54+
}, {
55+
Name: "SYSTEM_UPGRADE_CONTROLLER_DEBUG",
56+
Value: "true",
5457
}, {
5558
Name: "SYSTEM_UPGRADE_CONTROLLER_NAMESPACE",
5659
ValueFrom: &corev1.EnvVarSource{

e2e/suite/channel_resolve_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ var _ = Describe("Resolve channel", func() {
6565
Expect(err).To(HaveOccurred())
6666
Expect(latest).To(BeEmpty())
6767
})
68+
AfterEach(CollectLogsOnFailure(e2e))
6869
})
6970
})

e2e/suite/job_generate_test.go

Lines changed: 85 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,17 @@ import (
88

99
. "github.com/onsi/ginkgo/v2"
1010
. "github.com/onsi/gomega"
11+
format "github.com/onsi/gomega/format"
1112
batchv1 "k8s.io/api/batch/v1"
1213
v1 "k8s.io/api/core/v1"
14+
"k8s.io/utils/pointer"
1315
"k8s.io/utils/ptr"
1416

1517
"github.com/rancher/system-upgrade-controller/e2e/framework"
1618
upgradeapiv1 "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io/v1"
1719
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+
21+
_ "k8s.io/kubernetes/test/utils/format"
1822
)
1923

2024
var _ = Describe("Job Generation", func() {
@@ -137,29 +141,7 @@ var _ = Describe("Job Generation", func() {
137141
Expect(jobs[0].Spec.Template.Spec.InitContainers[0].Args).To(ContainElement(ContainSubstring("!upgrade.cattle.io/controller")))
138142
Expect(jobs[0].Spec.Template.Spec.InitContainers[0].Args).To(ContainElement(ContainSubstring("component notin (sonobuoy)")))
139143
})
140-
AfterEach(func() {
141-
if CurrentSpecReport().Failed() {
142-
podList, _ := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
143-
for _, pod := range podList.Items {
144-
containerNames := []string{}
145-
for _, container := range pod.Spec.InitContainers {
146-
containerNames = append(containerNames, container.Name)
147-
}
148-
for _, container := range pod.Spec.Containers {
149-
containerNames = append(containerNames, container.Name)
150-
}
151-
for _, container := range containerNames {
152-
reportName := fmt.Sprintf("podlogs-%s-%s", pod.Name, container)
153-
logs := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).GetLogs(pod.Name, &v1.PodLogOptions{Container: container})
154-
if logStreamer, err := logs.Stream(context.Background()); err == nil {
155-
if podLogs, err := io.ReadAll(logStreamer); err == nil {
156-
AddReportEntry(reportName, string(podLogs))
157-
}
158-
}
159-
}
160-
}
161-
}
162-
})
144+
AfterEach(CollectLogsOnFailure(e2e))
163145
})
164146

165147
When("fails because of invalid time window", func() {
@@ -206,29 +188,7 @@ var _ = Describe("Job Generation", func() {
206188
Expect(jobs[0].Status.Active).To(BeNumerically("==", 0))
207189
Expect(jobs[0].Status.Failed).To(BeNumerically("==", 0))
208190
})
209-
AfterEach(func() {
210-
if CurrentSpecReport().Failed() {
211-
podList, _ := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
212-
for _, pod := range podList.Items {
213-
containerNames := []string{}
214-
for _, container := range pod.Spec.InitContainers {
215-
containerNames = append(containerNames, container.Name)
216-
}
217-
for _, container := range pod.Spec.Containers {
218-
containerNames = append(containerNames, container.Name)
219-
}
220-
for _, container := range containerNames {
221-
reportName := fmt.Sprintf("podlogs-%s-%s", pod.Name, container)
222-
logs := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).GetLogs(pod.Name, &v1.PodLogOptions{Container: container})
223-
if logStreamer, err := logs.Stream(context.Background()); err == nil {
224-
if podLogs, err := io.ReadAll(logStreamer); err == nil {
225-
AddReportEntry(reportName, string(podLogs))
226-
}
227-
}
228-
}
229-
}
230-
}
231-
})
191+
AfterEach(CollectLogsOnFailure(e2e))
232192
})
233193

234194
When("fails because of invalid post complete delay", func() {
@@ -275,32 +235,10 @@ var _ = Describe("Job Generation", func() {
275235
Expect(jobs[0].Status.Active).To(BeNumerically("==", 0))
276236
Expect(jobs[0].Status.Failed).To(BeNumerically("==", 0))
277237
})
278-
AfterEach(func() {
279-
if CurrentSpecReport().Failed() {
280-
podList, _ := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
281-
for _, pod := range podList.Items {
282-
containerNames := []string{}
283-
for _, container := range pod.Spec.InitContainers {
284-
containerNames = append(containerNames, container.Name)
285-
}
286-
for _, container := range pod.Spec.Containers {
287-
containerNames = append(containerNames, container.Name)
288-
}
289-
for _, container := range containerNames {
290-
reportName := fmt.Sprintf("podlogs-%s-%s", pod.Name, container)
291-
logs := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).GetLogs(pod.Name, &v1.PodLogOptions{Container: container})
292-
if logStreamer, err := logs.Stream(context.Background()); err == nil {
293-
if podLogs, err := io.ReadAll(logStreamer); err == nil {
294-
AddReportEntry(reportName, string(podLogs))
295-
}
296-
}
297-
}
298-
}
299-
}
300-
})
238+
AfterEach(CollectLogsOnFailure(e2e))
301239
})
302240

303-
When("updated secret should not change hash", func() {
241+
When("updated secret does not change hash", func() {
304242
var (
305243
err error
306244
plan *upgradeapiv1.Plan
@@ -347,5 +285,82 @@ var _ = Describe("Job Generation", func() {
347285
It("hash should be equal", func() {
348286
Expect(plan.Status.LatestHash).Should(Equal(hash))
349287
})
288+
AfterEach(CollectLogsOnFailure(e2e))
289+
})
290+
291+
When("job failure message is reflected in plan status condition", func() {
292+
var (
293+
err error
294+
plan *upgradeapiv1.Plan
295+
jobs []batchv1.Job
296+
)
297+
BeforeEach(func() {
298+
plan = e2e.NewPlan("job-deadline-", "library/alpine:3.18", []string{"sh", "-c"}, "sleep 3600")
299+
plan.Spec.JobActiveDeadlineSecs = pointer.Int64(15)
300+
plan.Spec.Version = "latest"
301+
plan.Spec.Concurrency = 1
302+
plan.Spec.ServiceAccountName = e2e.Namespace.Name
303+
plan.Spec.NodeSelector = &metav1.LabelSelector{
304+
MatchExpressions: []metav1.LabelSelectorRequirement{{
305+
Key: "node-role.kubernetes.io/control-plane",
306+
Operator: metav1.LabelSelectorOpDoesNotExist,
307+
}},
308+
}
309+
plan, err = e2e.CreatePlan(plan)
310+
Expect(err).ToNot(HaveOccurred())
311+
312+
plan, err = e2e.WaitForPlanCondition(plan.Name, upgradeapiv1.PlanLatestResolved, 30*time.Second)
313+
Expect(err).ToNot(HaveOccurred())
314+
})
315+
It("message should contain deadline reason and message", func() {
316+
jobs, err = e2e.WaitForPlanJobs(plan, 1, 120*time.Second)
317+
Expect(err).ToNot(HaveOccurred())
318+
Expect(jobs).To(HaveLen(1))
319+
Expect(jobs[0].Status.Succeeded).To(BeNumerically("==", 0))
320+
Expect(jobs[0].Status.Active).To(BeNumerically("==", 0))
321+
Expect(jobs[0].Status.Failed).To(BeNumerically(">=", 1))
322+
323+
Eventually(e2e.GetPlan).
324+
WithArguments(plan.Name, metav1.GetOptions{}).
325+
WithTimeout(45 * time.Second).
326+
Should(SatisfyAll(
327+
WithTransform(upgradeapiv1.PlanComplete.IsTrue, BeFalse()),
328+
WithTransform(upgradeapiv1.PlanComplete.GetReason, Equal("JobFailed")),
329+
WithTransform(upgradeapiv1.PlanComplete.GetMessage, ContainSubstring("DeadlineExceeded: Job was active longer than specified deadline")),
330+
))
331+
})
332+
AfterEach(CollectLogsOnFailure(e2e))
350333
})
351334
})
335+
336+
func CollectLogsOnFailure(e2e *framework.Client) func() {
337+
return func() {
338+
if CurrentSpecReport().Failed() {
339+
planList, _ := e2e.UpgradeClientSet.UpgradeV1().Plans(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
340+
AddReportEntry("plans", format.Object(planList, 0))
341+
342+
jobList, _ := e2e.ClientSet.BatchV1().Jobs(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
343+
AddReportEntry("jobs", format.Object(jobList, 0))
344+
345+
podList, _ := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).List(context.Background(), metav1.ListOptions{})
346+
for _, pod := range podList.Items {
347+
containerNames := []string{}
348+
for _, container := range pod.Spec.InitContainers {
349+
containerNames = append(containerNames, container.Name)
350+
}
351+
for _, container := range pod.Spec.Containers {
352+
containerNames = append(containerNames, container.Name)
353+
}
354+
for _, container := range containerNames {
355+
reportName := fmt.Sprintf("podlogs-%s-%s", pod.Name, container)
356+
logs := e2e.ClientSet.CoreV1().Pods(e2e.Namespace.Name).GetLogs(pod.Name, &v1.PodLogOptions{Container: container})
357+
if logStreamer, err := logs.Stream(context.Background()); err == nil {
358+
if podLogs, err := io.ReadAll(logStreamer); err == nil {
359+
AddReportEntry(reportName, string(podLogs))
360+
}
361+
}
362+
}
363+
}
364+
}
365+
}
366+
}

e2e/suite/plan_create_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,6 @@ var _ = Describe("Plan Creation", func() {
2222
It("should return an error if upgrade in nil", func() {
2323
Expect(err).To(HaveOccurred())
2424
})
25+
AfterEach(CollectLogsOnFailure(e2e))
2526
})
2627
})

e2e/suite/plan_resolve_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ var _ = Describe("Plan Resolution", func() {
3333
Expect(plan.Status.LatestVersion).To(BeEmpty())
3434
Expect(plan.Status.LatestHash).To(BeEmpty())
3535
})
36+
AfterEach(CollectLogsOnFailure(e2e))
3637
})
3738

3839
When("has version", func() {
@@ -56,6 +57,7 @@ var _ = Describe("Plan Resolution", func() {
5657
Expect(plan.Status.LatestVersion).To(Equal(plan.Spec.Version))
5758
Expect(plan.Status.LatestHash).ToNot(BeEmpty())
5859
})
60+
AfterEach(CollectLogsOnFailure(e2e))
5961
})
6062

6163
When("has version with semver+metadata", func() {
@@ -82,6 +84,7 @@ var _ = Describe("Plan Resolution", func() {
8284
It("should munge the semver", func() {
8385
Expect(plan.Status.LatestVersion).ToNot(ContainSubstring(`+`))
8486
})
87+
AfterEach(CollectLogsOnFailure(e2e))
8588
})
8689

8790
When("has channel", func() {
@@ -114,6 +117,7 @@ var _ = Describe("Plan Resolution", func() {
114117
Expect(plan.Status.LatestVersion).To(Equal(channelTag))
115118
Expect(plan.Status.LatestHash).ToNot(BeEmpty())
116119
})
120+
AfterEach(CollectLogsOnFailure(e2e))
117121
})
118122

119123
When("has channel with semver+metadata", func() {
@@ -148,5 +152,6 @@ var _ = Describe("Plan Resolution", func() {
148152
It("should munge the semver", func() {
149153
Expect(plan.Status.LatestVersion).ToNot(ContainSubstring(`+`))
150154
})
155+
AfterEach(CollectLogsOnFailure(e2e))
151156
})
152157
})

pkg/upgrade/handle_batch.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,20 @@ package upgrade
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"sort"
78
"strconv"
89
"time"
910

1011
upgradeapi "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io"
12+
upgradeapiv1 "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io/v1"
1113
upgradejob "github.com/rancher/system-upgrade-controller/pkg/upgrade/job"
1214
batchctlv1 "github.com/rancher/wrangler/v3/pkg/generated/controllers/batch/v1"
1315
"github.com/sirupsen/logrus"
1416
batchv1 "k8s.io/api/batch/v1"
1517
corev1 "k8s.io/api/core/v1"
16-
"k8s.io/apimachinery/pkg/api/errors"
18+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1719
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1820
"k8s.io/apimachinery/pkg/labels"
1921
)
@@ -50,7 +52,7 @@ func (ctl *Controller) handleJobs(ctx context.Context) error {
5052
// get the plan being applied
5153
plan, err := plans.Get(obj.Namespace, planName, metav1.GetOptions{})
5254
switch {
53-
case errors.IsNotFound(err):
55+
case apierrors.IsNotFound(err):
5456
// plan is gone, delete
5557
return obj, deleteJob(jobs, obj, metav1.DeletePropagationBackground)
5658
case err != nil:
@@ -73,7 +75,7 @@ func (ctl *Controller) handleJobs(ctx context.Context) error {
7375
// get the node that the plan is being applied to
7476
node, err := nodes.Cache().Get(nodeName)
7577
switch {
76-
case errors.IsNotFound(err):
78+
case apierrors.IsNotFound(err):
7779
// node is gone, delete
7880
return obj, deleteJob(jobs, obj, metav1.DeletePropagationBackground)
7981
case err != nil:
@@ -85,7 +87,12 @@ func (ctl *Controller) handleJobs(ctx context.Context) error {
8587
if failedTime.IsZero() {
8688
return obj, fmt.Errorf("condition %q missing field %q", upgradejob.ConditionFailed, "LastTransitionTime")
8789
}
88-
ctl.recorder.Eventf(plan, corev1.EventTypeWarning, "JobFailed", "Job failed on Node %s", node.Name)
90+
message := fmt.Sprintf("Job failed on Node %s: %s: %s", nodeName, upgradejob.ConditionFailed.GetReason(obj), upgradejob.ConditionFailed.GetMessage(obj))
91+
ctl.recorder.Eventf(plan, corev1.EventTypeWarning, "JobFailed", message)
92+
upgradeapiv1.PlanComplete.SetError(plan, "JobFailed", errors.New(message))
93+
if plan, err = plans.UpdateStatus(plan); err != nil {
94+
return obj, err
95+
}
8996
return obj, enqueueOrDelete(jobs, obj, failedTime)
9097
}
9198
// if the job has completed tag the node then enqueue-or-delete depending on the TTL window

0 commit comments

Comments
 (0)