Skip to content

Commit eaf8c71

Browse files
cjeradRoger Torrentsgeneros
andauthored
fix: show actual event kinds in Queue mode (#725)
* feat: emit pod events on drain * feat: make log format select-able Co-authored-by: Roger Torrentsgeneros <rtorrents@newrelic.com> Co-authored-by: Jerad C <jeradc@amazon.com>
1 parent 31c98b9 commit eaf8c71

23 files changed

+266
-70
lines changed

cmd/node-termination-handler.go

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,14 @@ func main() {
8383
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
8484
}
8585

86+
log.Info().Msgf("Using log format version %d", nthConfig.LogFormatVersion)
87+
if err = logging.SetFormatVersion(nthConfig.LogFormatVersion); err != nil {
88+
log.Warn().Err(err).Send()
89+
}
90+
if err = observability.SetReasonForKindVersion(nthConfig.LogFormatVersion); err != nil {
91+
log.Warn().Err(err).Send()
92+
}
93+
8694
err = webhook.ValidateWebhookConfig(nthConfig)
8795
if err != nil {
8896
nthConfig.Print()
@@ -196,13 +204,13 @@ func main() {
196204

197205
for _, fn := range monitoringFns {
198206
go func(monitor monitor.Monitor) {
199-
log.Info().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
207+
logging.VersionedMsgs.MonitoringStarted(monitor.Kind())
200208
var previousErr error
201209
var duplicateErrCount int
202210
for range time.Tick(time.Second * 2) {
203211
err := monitor.Monitor()
204212
if err != nil {
205-
log.Warn().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
213+
logging.VersionedMsgs.ProblemMonitoringForEvents(monitor.Kind(), err)
206214
metrics.ErrorEventsInc(monitor.Kind())
207215
recorder.Emit(nthConfig.NodeName, observability.Warning, observability.MonitorErrReason, observability.MonitorErrMsgFmt, monitor.Kind())
208216
if previousErr != nil && err.Error() == previousErr.Error() {
@@ -239,16 +247,10 @@ func main() {
239247
for event, ok := interruptionEventStore.GetActiveEvent(); ok; event, ok = interruptionEventStore.GetActiveEvent() {
240248
select {
241249
case interruptionEventStore.Workers <- 1:
242-
log.Info().
243-
Str("event-id", event.EventID).
244-
Str("kind", event.Kind).
245-
Str("node-name", event.NodeName).
246-
Str("instance-id", event.InstanceID).
247-
Str("provider-id", event.ProviderID).
248-
Msg("Requesting instance drain")
250+
logging.VersionedMsgs.RequestingInstanceDrain(event)
249251
event.InProgress = true
250252
wg.Add(1)
251-
recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind), event.Description)
253+
recorder.Emit(event.NodeName, observability.Normal, observability.GetReasonForKind(event.Kind, event.Monitor), event.Description)
252254
go drainOrCordonIfNecessary(interruptionEventStore, event, *node, nthConfig, nodeMetadata, metrics, recorder, &wg)
253255
default:
254256
log.Warn().Msg("all workers busy, waiting")

config/helm/aws-node-termination-handler/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
7070
| `extraEnv` | Additional environment variables for the _aws-node-termination-handler_ container. | `[]` |
7171
| `probes` | The Kubernetes liveness probe configuration. | _See values.yaml_ |
7272
| `logLevel` | Sets the log level (`info`,`debug`, or `error`) | `info` |
73+
| `logFormatVersion` | Sets the log format version. Available versions: 1, 2. Version 1 refers to the format that has been used through v1.17.3. Version 2 offers more detail for the "event kind" and "reason", especially when operating in Queue Processor mode. | `1` |
7374
| `jsonLogging` | If `true`, use JSON-formatted logs instead of human readable logs. | `false` |
7475
| `enablePrometheusServer` | If `true`, start an http server exposing `/metrics` endpoint for _Prometheus_. | `false` |
7576
| `prometheusServerPort` | Replaces the default HTTP port for exposing _Prometheus_ metrics. | `9092` |

config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ spec:
8181
value: {{ .Values.logLevel | quote }}
8282
- name: JSON_LOGGING
8383
value: {{ .Values.jsonLogging | quote }}
84+
- name: LOG_FORMAT_VERSION
85+
value: {{ .Values.logFormatVersion | quote }}
8486
- name: ENABLE_PROMETHEUS_SERVER
8587
value: {{ .Values.enablePrometheusServer | quote }}
8688
- name: PROMETHEUS_SERVER_PORT

config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ spec:
8181
value: {{ .Values.logLevel | quote }}
8282
- name: JSON_LOGGING
8383
value: {{ .Values.jsonLogging | quote }}
84+
- name: LOG_FORMAT_VERSION
85+
value: {{ .Values.logFormatVersion | quote }}
8486
- name: ENABLE_PROMETHEUS_SERVER
8587
value: {{ .Values.enablePrometheusServer | quote }}
8688
- name: PROMETHEUS_SERVER_PORT

config/helm/aws-node-termination-handler/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ spec:
7878
value: {{ .Values.logLevel | quote }}
7979
- name: JSON_LOGGING
8080
value: {{ .Values.jsonLogging | quote }}
81+
- name: LOG_FORMAT_VERSION
82+
value: {{ .Values.logFormatVersion | quote }}
8183
- name: ENABLE_PROMETHEUS_SERVER
8284
value: {{ .Values.enablePrometheusServer | quote }}
8385
- name: PROMETHEUS_SERVER_PORT

config/helm/aws-node-termination-handler/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ probes:
6666
# Set the log level
6767
logLevel: info
6868

69+
# Set the log format version
70+
logFormatVersion: 1
71+
6972
# Log messages in JSON format
7073
jsonLogging: false
7174

pkg/config/config.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ const (
7979
jsonLoggingDefault = false
8080
logLevelConfigKey = "LOG_LEVEL"
8181
logLevelDefault = "INFO"
82+
logFormatVersionKey = "LOG_FORMAT_VERSION"
83+
logFormatVersionDefault = 1
84+
MinSupportedLogFormatVersion = 1
85+
MaxSupportedLogFormatVersion = 2
8286
uptimeFromFileConfigKey = "UPTIME_FROM_FILE"
8387
uptimeFromFileDefault = ""
8488
workersConfigKey = "WORKERS"
@@ -138,6 +142,7 @@ type Config struct {
138142
ExcludeFromLoadBalancers bool
139143
JsonLogging bool
140144
LogLevel string
145+
LogFormatVersion int
141146
UptimeFromFile string
142147
EnablePrometheus bool
143148
PrometheusPort int
@@ -197,6 +202,7 @@ func ParseCliArgs() (config Config, err error) {
197202
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
198203
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
199204
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
205+
flag.IntVar(&config.LogFormatVersion, "log-format-version", getIntEnv(logFormatVersionKey, logFormatVersionDefault), "Sets the log format version.")
200206
flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")
201207
flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.")
202208
flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.")
@@ -242,6 +248,15 @@ func ParseCliArgs() (config Config, err error) {
242248
return config, fmt.Errorf("invalid log-level passed: %s Should be one of: info, debug, error", config.LogLevel)
243249
}
244250

251+
if config.LogFormatVersion < MinSupportedLogFormatVersion {
252+
log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MinSupportedLogFormatVersion)
253+
config.LogFormatVersion = MinSupportedLogFormatVersion
254+
}
255+
if config.LogFormatVersion > MaxSupportedLogFormatVersion {
256+
log.Warn().Msgf("Log format version %d is not supported, using format version %d", config.LogFormatVersion, MaxSupportedLogFormatVersion)
257+
config.LogFormatVersion = MaxSupportedLogFormatVersion
258+
}
259+
245260
if config.NodeName == "" {
246261
panic("You must provide a node-name to the CLI or NODE_NAME environment variable.")
247262
}

pkg/logging/versioned.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License"). You may
4+
// not use this file except in compliance with the License. A copy of the
5+
// License is located at
6+
//
7+
// http://aws.amazon.com/apache2.0/
8+
//
9+
// or in the "license" file accompanying this file. This file is distributed
10+
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
// express or implied. See the License for the specific language governing
12+
// permissions and limitations under the License.
13+
14+
package logging
15+
16+
import (
17+
"fmt"
18+
19+
"github.com/aws/aws-node-termination-handler/pkg/monitor"
20+
"github.com/rs/zerolog/log"
21+
)
22+
23+
type versionedMsgsV1 struct{}
24+
25+
func (versionedMsgsV1) MonitoringStarted(monitorKind string) {
26+
log.Info().Str("event_type", monitorKind).Msg("Started monitoring for events")
27+
}
28+
29+
func (versionedMsgsV1) ProblemMonitoringForEvents(monitorKind string, err error) {
30+
log.Warn().Str("event_type", monitorKind).Err(err).Msg("There was a problem monitoring for events")
31+
}
32+
33+
func (versionedMsgsV1) RequestingInstanceDrain(event *monitor.InterruptionEvent) {
34+
log.Info().
35+
Str("event-id", event.EventID).
36+
Str("kind", event.Kind).
37+
Str("node-name", event.NodeName).
38+
Str("instance-id", event.InstanceID).
39+
Str("provider-id", event.ProviderID).
40+
Msg("Requesting instance drain")
41+
}
42+
43+
func (versionedMsgsV1) SendingInterruptionEventToChannel(_ string) {
44+
log.Debug().Msg("Sending SQS_TERMINATE interruption event to the interruption channel")
45+
}
46+
47+
type versionedMsgsV2 struct{}
48+
49+
func (versionedMsgsV2) MonitoringStarted(monitorKind string) {
50+
log.Info().Str("monitor_type", monitorKind).Msg("Started monitoring for events")
51+
}
52+
53+
func (versionedMsgsV2) ProblemMonitoringForEvents(monitorKind string, err error) {
54+
log.Warn().Str("monitor_type", monitorKind).Err(err).Msg("There was a problem monitoring for events")
55+
}
56+
57+
func (versionedMsgsV2) RequestingInstanceDrain(event *monitor.InterruptionEvent) {
58+
log.Info().
59+
Str("event-id", event.EventID).
60+
Str("kind", event.Kind).
61+
Str("monitor", event.Monitor).
62+
Str("node-name", event.NodeName).
63+
Str("instance-id", event.InstanceID).
64+
Str("provider-id", event.ProviderID).
65+
Msg("Requesting instance drain")
66+
}
67+
68+
func (versionedMsgsV2) SendingInterruptionEventToChannel(eventKind string) {
69+
log.Debug().Msgf("Sending %s interruption event to the interruption channel", eventKind)
70+
}
71+
72+
var VersionedMsgs interface {
73+
MonitoringStarted(monitorKind string)
74+
ProblemMonitoringForEvents(monitorKind string, err error)
75+
RequestingInstanceDrain(event *monitor.InterruptionEvent)
76+
SendingInterruptionEventToChannel(eventKind string)
77+
} = versionedMsgsV1{}
78+
79+
func SetFormatVersion(version int) error {
80+
switch version {
81+
case 1:
82+
VersionedMsgs = versionedMsgsV1{}
83+
return nil
84+
case 2:
85+
VersionedMsgs = versionedMsgsV2{}
86+
return nil
87+
default:
88+
VersionedMsgs = versionedMsgsV1{}
89+
return fmt.Errorf("Unrecognized log format version: %d, using version 1", version)
90+
}
91+
}

pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,8 @@ import (
2323
"github.com/aws/aws-node-termination-handler/pkg/node"
2424
)
2525

26-
const (
27-
// RebalanceRecommendationKind is a const to define a Rebalance Recommendation kind of event
28-
RebalanceRecommendationKind = "REBALANCE_RECOMMENDATION"
29-
)
26+
// RebalanceRecommentadionMonitorKind is a const to define this monitor kind
27+
const RebalanceRecommendationMonitorKind = "REBALANCE_RECOMMENDATION_MONITOR"
3028

3129
// RebalanceRecommendationMonitor is a struct definition which facilitates monitoring of rebalance recommendations from IMDS
3230
type RebalanceRecommendationMonitor struct {
@@ -50,15 +48,15 @@ func (m RebalanceRecommendationMonitor) Monitor() error {
5048
if err != nil {
5149
return err
5250
}
53-
if interruptionEvent != nil && interruptionEvent.Kind == RebalanceRecommendationKind {
51+
if interruptionEvent != nil && interruptionEvent.Kind == monitor.RebalanceRecommendationKind {
5452
m.InterruptionChan <- *interruptionEvent
5553
}
5654
return nil
5755
}
5856

59-
// Kind denotes the kind of event that is processed
57+
// Kind denotes the kind of monitor
6058
func (m RebalanceRecommendationMonitor) Kind() string {
61-
return RebalanceRecommendationKind
59+
return RebalanceRecommendationMonitorKind
6260
}
6361

6462
// checkForRebalanceRecommendation Checks EC2 instance metadata for a rebalance recommendation
@@ -86,7 +84,8 @@ func (m RebalanceRecommendationMonitor) checkForRebalanceRecommendation() (*moni
8684

8785
return &monitor.InterruptionEvent{
8886
EventID: fmt.Sprintf("rebalance-recommendation-%x", hash.Sum(nil)),
89-
Kind: RebalanceRecommendationKind,
87+
Kind: monitor.RebalanceRecommendationKind,
88+
Monitor: RebalanceRecommendationMonitorKind,
9089
StartTime: noticeTime,
9190
NodeName: nodeName,
9291
Description: fmt.Sprintf("Rebalance recommendation received. Instance will be cordoned at %s \n", rebalanceRecommendation.NoticeTime),

pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ func TestMonitor_Success(t *testing.T) {
5555

5656
go func() {
5757
result := <-drainChan
58-
h.Equals(t, rebalancerecommendation.RebalanceRecommendationKind, result.Kind)
58+
h.Equals(t, monitor.RebalanceRecommendationKind, result.Kind)
59+
h.Equals(t, rebalancerecommendation.RebalanceRecommendationMonitorKind, result.Monitor)
5960
h.Equals(t, expFormattedTime, result.StartTime.String())
6061
h.Assert(t, strings.Contains(result.Description, startTime),
6162
"Expected description to contain: "+startTime+" but is actually: "+result.Description)

0 commit comments

Comments
 (0)