diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index c12fa4e1..c19dc237 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -125,7 +125,7 @@ func main() { wait.PollImmediateUntil(2*time.Second, func() (done bool, err error) { err = handleRebootUncordon(nthConfig.NodeName, interruptionEventStore, *node) if err != nil { - log.Log().Err(err).Msgf("Unable to complete the uncordon after reboot workflow on startup, retrying") + log.Err(err).Msgf("Unable to complete the uncordon after reboot workflow on startup, retrying") } return false, nil }, stopCh) @@ -152,7 +152,7 @@ func main() { if nthConfig.EnableSQSTerminationDraining { creds, err := nthConfig.AWSSession.Config.Credentials.Get() if err != nil { - log.Warn().Err(err).Msg("Unable to get AWS credentials") + log.Err(err).Msg("Unable to get AWS credentials") } log.Debug().Msgf("AWS Credentials retrieved from provider: %s", creds.ProviderName) @@ -171,13 +171,13 @@ func main() { for _, fn := range monitoringFns { go func(monitor monitor.Monitor) { - log.Log().Str("event_type", monitor.Kind()).Msg("Started monitoring for events") + log.Info().Str("event_type", monitor.Kind()).Msg("Started monitoring for events") var previousErr error var duplicateErrCount int for range time.Tick(time.Second * 2) { err := monitor.Monitor() if err != nil { - log.Log().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events") + log.Warn().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events") metrics.ErrorEventsInc(monitor.Kind()) if previousErr != nil && err.Error() == previousErr.Error() { duplicateErrCount++ @@ -186,7 +186,7 @@ func main() { previousErr = err } if duplicateErrCount >= duplicateErrThreshold { - log.Log().Msg("Stopping NTH - Duplicate Error Threshold hit.") + log.Warn().Msg("Stopping NTH - Duplicate Error Threshold hit.") panic(fmt.Sprintf("%v", err)) } } @@ -195,11 +195,11 @@ func main() { } go watchForInterruptionEvents(interruptionChan, interruptionEventStore) - log.Log().Msg("Started watching for interruption events") - log.Log().Msg("Kubernetes AWS Node Termination Handler has started successfully!") + log.Info().Msg("Started watching for interruption events") + log.Info().Msg("Kubernetes AWS Node Termination Handler has started successfully!") go watchForCancellationEvents(cancelChan, interruptionEventStore, node, metrics) - log.Log().Msg("Started watching for event cancellations") + log.Info().Msg("Started watching for event cancellations") var wg sync.WaitGroup @@ -222,7 +222,7 @@ func main() { } } } - log.Log().Msg("AWS Node Termination Handler is shutting down") + log.Info().Msg("AWS Node Termination Handler is shutting down") wg.Wait() log.Debug().Msg("all event processors finished") } @@ -260,17 +260,17 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int nodeName := interruptionEvent.NodeName interruptionEventStore.CancelInterruptionEvent(interruptionEvent.EventID) if interruptionEventStore.ShouldUncordonNode(nodeName) { - log.Log().Msg("Uncordoning the node due to a cancellation event") + log.Info().Msg("Uncordoning the node due to a cancellation event") err := node.Uncordon(nodeName) if err != nil { - log.Log().Err(err).Msg("Uncordoning the node failed") + log.Err(err).Msg("Uncordoning the node failed") } metrics.NodeActionsInc("uncordon", nodeName, err) node.RemoveNTHLabels(nodeName) node.RemoveNTHTaints(nodeName) } else { - log.Log().Msg("Another interruption 
event is active, not uncordoning the node") + log.Info().Msg("Another interruption event is active, not uncordoning the node") } } } @@ -280,13 +280,13 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto nodeName := drainEvent.NodeName nodeLabels, err := node.GetNodeLabels(nodeName) if err != nil { - log.Warn().Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) + log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName) } drainEvent.NodeLabels = nodeLabels if drainEvent.PreDrainTask != nil { err := drainEvent.PreDrainTask(*drainEvent, node) if err != nil { - log.Log().Err(err).Msg("There was a problem executing the pre-drain task") + log.Err(err).Msg("There was a problem executing the pre-drain task") } metrics.NodeActionsInc("pre-drain", nodeName, err) } @@ -295,21 +295,21 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto err := node.Cordon(nodeName) if err != nil { if errors.IsNotFound(err) { - log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName) + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) } else { - log.Log().Err(err).Msg("There was a problem while trying to cordon the node") + log.Err(err).Msg("There was a problem while trying to cordon the node") os.Exit(1) } } else { - log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned") + log.Info().Str("node_name", nodeName).Msg("Node successfully cordoned") podNameList, err := node.FetchPodNameList(nodeName) if err != nil { - log.Log().Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) + log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName) } drainEvent.Pods = podNameList err = node.LogPods(podNameList, nodeName) if err != nil { - log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node") + log.Err(err).Msg("There was a problem while trying to log all pod names on the node") } metrics.NodeActionsInc("cordon", nodeName, err) } @@ -317,13 +317,13 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto err := node.CordonAndDrain(nodeName) if err != nil { if errors.IsNotFound(err) { - log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName) + log.Err(err).Msgf("node '%s' not found in the cluster", nodeName) } else { - log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node") + log.Err(err).Msg("There was a problem while trying to cordon and drain the node") os.Exit(1) } } else { - log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained") + log.Info().Str("node_name", nodeName).Msg("Node successfully cordoned and drained") metrics.NodeActionsInc("cordon-and-drain", nodeName, err) } } diff --git a/pkg/config/config.go b/pkg/config/config.go index c19246ca..e6450e00 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -201,9 +201,9 @@ func ParseCliArgs() (config Config, err error) { } if isConfigProvided("pod-termination-grace-period", podTerminationGracePeriodConfigKey) && isConfigProvided("grace-period", gracePeriodConfigKey) { - log.Log().Msg("Deprecated argument \"grace-period\" and the replacement argument \"pod-termination-grace-period\" was provided. Using the newer argument \"pod-termination-grace-period\"") + log.Warn().Msg("Deprecated argument \"grace-period\" and the replacement argument \"pod-termination-grace-period\" was provided. 
Using the newer argument \"pod-termination-grace-period\"") } else if isConfigProvided("grace-period", gracePeriodConfigKey) { - log.Log().Msg("Deprecated argument \"grace-period\" was provided. This argument will eventually be removed. Please switch to \"pod-termination-grace-period\" instead.") + log.Warn().Msg("Deprecated argument \"grace-period\" was provided. This argument will eventually be removed. Please switch to \"pod-termination-grace-period\" instead.") config.PodTerminationGracePeriod = gracePeriod } @@ -239,7 +239,7 @@ func (c Config) Print() { func (c Config) PrintJsonConfigArgs() { // manually setting fields instead of using log.Log().Interface() to use snake_case instead of PascalCase // intentionally did not log webhook configuration as there may be secrets - log.Log(). + log.Info(). Bool("dry_run", c.DryRun). Str("node_name", c.NodeName). Str("metadata_url", c.MetadataURL). @@ -277,7 +277,7 @@ func (c Config) PrintHumanConfigArgs() { webhookURLDisplay = "" } // intentionally did not log webhook configuration as there may be secrets - log.Log().Msgf( + log.Info().Msgf( "aws-node-termination-handler arguments: \n"+ "\tdry-run: %t,\n"+ "\tnode-name: %s,\n"+ diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go index 77097915..c83ec757 100644 --- a/pkg/ec2metadata/ec2metadata.go +++ b/pkg/ec2metadata/ec2metadata.go @@ -223,7 +223,7 @@ func (e *Service) Request(contextPath string) (*http.Response, error) { if err != nil { e.v2Token = "" e.tokenTTL = -1 - log.Log().Err(err).Msg("Unable to retrieve an IMDSv2 token, continuing with IMDSv1") + log.Warn().Msgf("Unable to retrieve an IMDSv2 token, continuing with IMDSv1, %v", err) } else { e.v2Token = token e.tokenTTL = ttl @@ -267,7 +267,7 @@ func (e *Service) getV2Token() (string, int, error) { httpReq := func() (*http.Response, error) { return e.httpClient.Do(req) } - log.Log().Msg("Trying to get token from IMDSv2") + log.Debug().Msg("Trying to get token from IMDSv2") resp, err := retry(1, 2*time.Second, httpReq) if err != nil { return "", -1, err @@ -284,7 +284,7 @@ func (e *Service) getV2Token() (string, int, error) { if err != nil { return "", -1, fmt.Errorf("IMDS v2 Token TTL header not sent in response: %w", err) } - log.Log().Msg("Got token from IMDSv2") + log.Debug().Msg("Got token from IMDSv2") return string(token), ttl, nil } @@ -307,8 +307,7 @@ func retry(attempts int, sleep time.Duration, httpReq func() (*http.Response, er jitter := time.Duration(rand.Int63n(int64(sleep))) sleep = sleep + jitter/2 - log.Log().Msgf("Request failed. Attempts remaining: %d", attempts) - log.Log().Msgf("Sleep for %s seconds", sleep) + log.Warn().Msgf("Request failed. 
Attempts remaining: %d, sleeping for %s seconds", attempts, sleep) time.Sleep(sleep) return retry(attempts, 2*sleep, httpReq) } @@ -322,12 +321,12 @@ func (e *Service) GetNodeMetadata() NodeMetadata { var metadata NodeMetadata identityDoc, err := e.GetMetadataInfo(IdentityDocPath) if err != nil { - log.Log().Err(err).Msg("Unable to fetch metadata from IMDS") + log.Err(err).Msg("Unable to fetch metadata from IMDS") return metadata } err = json.NewDecoder(strings.NewReader(identityDoc)).Decode(&metadata) if err != nil { - log.Log().Msg("Unable to fetch instance identity document from ec2 metadata") + log.Warn().Msg("Unable to fetch instance identity document from ec2 metadata") metadata.InstanceID, _ = e.GetMetadataInfo(InstanceIDPath) metadata.InstanceType, _ = e.GetMetadataInfo(InstanceTypePath) metadata.LocalIP, _ = e.GetMetadataInfo(LocalIPPath) @@ -340,7 +339,7 @@ func (e *Service) GetNodeMetadata() NodeMetadata { metadata.PublicIP, _ = e.GetMetadataInfo(PublicIPPath) metadata.LocalHostname, _ = e.GetMetadataInfo(LocalHostnamePath) - log.Log().Interface("metadata", metadata).Msg("Startup Metadata Retrieved") + log.Info().Interface("metadata", metadata).Msg("Startup Metadata Retrieved") return metadata } diff --git a/pkg/interruptioneventstore/interruption-event-store.go b/pkg/interruptioneventstore/interruption-event-store.go index ffafa284..fe9d73a1 100644 --- a/pkg/interruptioneventstore/interruption-event-store.go +++ b/pkg/interruptioneventstore/interruption-event-store.go @@ -61,7 +61,7 @@ func (s *Store) AddInterruptionEvent(interruptionEvent *monitor.InterruptionEven s.Lock() defer s.Unlock() - log.Log().Interface("event", interruptionEvent).Msg("Adding new event to the event store") + log.Info().Interface("event", interruptionEvent).Msg("Adding new event to the event store") s.interruptionEventStore[interruptionEvent.EventID] = interruptionEvent if _, ignored := s.ignoredEvents[interruptionEvent.EventID]; !ignored { s.atLeastOneEvent = true diff --git a/pkg/monitor/scheduledevent/scheduled-event-monitor.go b/pkg/monitor/scheduledevent/scheduled-event-monitor.go index 58dcbd28..4d43e96d 100644 --- a/pkg/monitor/scheduledevent/scheduled-event-monitor.go +++ b/pkg/monitor/scheduledevent/scheduled-event-monitor.go @@ -96,7 +96,7 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption notAfter, err = time.Parse(scheduledEventDateFormat, scheduledEvent.NotAfter) if err != nil { notAfter = notBefore - log.Log().Err(err).Msg("Unable to parse scheduled event end time, continuing") + log.Err(err).Msg("Unable to parse scheduled event end time, continuing") } } events = append(events, monitor.InterruptionEvent{ @@ -128,7 +128,7 @@ func uncordonAfterRebootPreDrain(interruptionEvent monitor.InterruptionEvent, n // if the node is already marked as unschedulable, then don't do anything unschedulable, err := n.IsUnschedulable(nodeName) if err == nil && unschedulable { - log.Log().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.") + log.Debug().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.") return nil } else if err != nil { return fmt.Errorf("Encountered an error while checking if the node is unschedulable. 
Not setting an uncordon label: %w", err) @@ -137,7 +137,7 @@ func uncordonAfterRebootPreDrain(interruptionEvent monitor.InterruptionEvent, n if err != nil { return fmt.Errorf("Unable to mark the node for uncordon: %w", err) } - log.Log().Msg("Successfully applied uncordon after reboot action label to node.") + log.Info().Msg("Successfully applied uncordon after reboot action label to node.") return nil } diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index c3960764..cdca89c7 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -105,7 +105,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event EventBridgeEvent, me interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID) if err != nil { - log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID) + log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID) } return nil } diff --git a/pkg/monitor/sqsevent/rebalance-recommendation-event.go b/pkg/monitor/sqsevent/rebalance-recommendation-event.go index e6a8876e..ad06f58c 100644 --- a/pkg/monitor/sqsevent/rebalance-recommendation-event.go +++ b/pkg/monitor/sqsevent/rebalance-recommendation-event.go @@ -78,7 +78,7 @@ func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event EventBridge interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { err := n.TaintRebalanceRecommendation(interruptionEvent.NodeName, interruptionEvent.EventID) if err != nil { - log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID) + log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID) } return nil } diff --git a/pkg/monitor/sqsevent/spot-itn-event.go b/pkg/monitor/sqsevent/spot-itn-event.go index e77ae9a5..d5b25fec 100644 --- a/pkg/monitor/sqsevent/spot-itn-event.go +++ b/pkg/monitor/sqsevent/spot-itn-event.go @@ -79,7 +79,7 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event EventBridgeEvent interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { err := n.TaintSpotItn(interruptionEvent.NodeName, interruptionEvent.EventID) if err != nil { - log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID) + log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID) } return nil } diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 6f1b66df..74e3f69e 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -68,16 +68,16 @@ func (m SQSMonitor) Monitor() error { switch { case errors.Is(err, ErrNodeStateNotRunning): // If the node is no longer running, just log and delete the message. If message deletion fails, count it as an error. 
- log.Warn().Err(err).Msg("dropping event for an already terminated node") + log.Err(err).Msg("dropping event for an already terminated node") errs := m.deleteMessages([]*sqs.Message{message}) if len(errs) > 0 { - log.Warn().Err(errs[0]).Msg("error deleting event for already terminated node") + log.Err(errs[0]).Msg("error deleting event for already terminated node") failedEvents++ } case err != nil: // Log errors and record as failed events - log.Warn().Err(err).Msg("ignoring event due to error") + log.Err(err).Msg("ignoring event due to error") failedEvents++ case err == nil && interruptionEvent != nil && interruptionEvent.Kind == SQSTerminateKind: @@ -176,7 +176,7 @@ func (m SQSMonitor) deleteMessages(messages []*sqs.Message) []error { if err != nil { errs = append(errs, err) } - log.Log().Msgf("SQS Deleted Message: %s", message) + log.Debug().Msgf("SQS Deleted Message: %s", message) } return errs } diff --git a/pkg/node/node.go b/pkg/node/node.go index 15023fbd..dba6e840 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -91,16 +91,16 @@ func NewWithValues(nthConfig config.Config, drainHelper *drain.Helper, uptime up // CordonAndDrain will cordon the node and evict pods based on the config func (n Node) CordonAndDrain(nodeName string) error { if n.nthConfig.DryRun { - log.Log().Str("node_name", nodeName).Msg("Node would have been cordoned and drained, but dry-run flag was set") + log.Info().Str("node_name", nodeName).Msg("Node would have been cordoned and drained, but dry-run flag was set") return nil } - log.Log().Msg("Cordoning the node") + log.Info().Msg("Cordoning the node") err := n.Cordon(nodeName) if err != nil { return err } // Delete all pods on the node - log.Log().Msg("Draining the node") + log.Info().Msg("Draining the node") node, err := n.fetchKubernetesNode(nodeName) if err != nil { return err @@ -115,7 +115,7 @@ func (n Node) CordonAndDrain(nodeName string) error { // Cordon will add a NoSchedule on the node func (n Node) Cordon(nodeName string) error { if n.nthConfig.DryRun { - log.Log().Str("node_name", nodeName).Msg("Node would have been cordoned, but dry-run flag was set") + log.Info().Str("node_name", nodeName).Msg("Node would have been cordoned, but dry-run flag was set") return nil } node, err := n.fetchKubernetesNode(nodeName) @@ -132,7 +132,7 @@ func (n Node) Cordon(nodeName string) error { // Uncordon will remove the NoSchedule on the node func (n Node) Uncordon(nodeName string) error { if n.nthConfig.DryRun { - log.Log().Str("node_name", nodeName).Msg("Node would have been uncordoned, but dry-run flag was set") + log.Info().Str("node_name", nodeName).Msg("Node would have been uncordoned, but dry-run flag was set") return nil } node, err := n.fetchKubernetesNode(nodeName) @@ -149,7 +149,7 @@ func (n Node) Uncordon(nodeName string) error { // IsUnschedulable checks if the node is marked as unschedulable func (n Node) IsUnschedulable(nodeName string) (bool, error) { if n.nthConfig.DryRun { - log.Log().Msg("IsUnschedulable returning false since dry-run is set") + log.Info().Msg("IsUnschedulable returning false since dry-run is set") return false, nil } node, err := n.fetchKubernetesNode(nodeName) @@ -187,7 +187,7 @@ func (n Node) GetEventID(nodeName string) (string, error) { } val, ok := node.Labels[EventIDLabelKey] if n.nthConfig.DryRun && !ok { - log.Log().Msgf("Would have returned Error: 'Event ID Label %s was not found on the node', but dry-run flag was set", EventIDLabelKey) + log.Warn().Msgf("Would have returned Error: 'Event ID Label %s was not found on 
the node', but dry-run flag was set", EventIDLabelKey) return "", nil } if !ok { @@ -237,7 +237,7 @@ func (n Node) addLabel(nodeName string, key string, value string) error { return err } if n.nthConfig.DryRun { - log.Log().Msgf("Would have added label (%s=%s) to node %s, but dry-run flag was set", key, value, nodeName) + log.Info().Msgf("Would have added label (%s=%s) to node %s, but dry-run flag was set", key, value, nodeName) return nil } _, err = n.drainHelper.Client.CoreV1().Nodes().Patch(node.Name, types.StrategicMergePatchType, payloadBytes) @@ -268,7 +268,7 @@ func (n Node) removeLabel(nodeName string, key string) error { return err } if n.nthConfig.DryRun { - log.Log().Msgf("Would have removed label with key %s from node %s, but dry-run flag was set", key, nodeName) + log.Info().Msgf("Would have removed label with key %s from node %s, but dry-run flag was set", key, nodeName) return nil } _, err = n.drainHelper.Client.CoreV1().Nodes().Patch(node.Name, types.JSONPatchType, payload) @@ -281,7 +281,7 @@ func (n Node) removeLabel(nodeName string, key string) error { // GetNodeLabels will fetch node labels for a given nodeName func (n Node) GetNodeLabels(nodeName string) (map[string]string, error) { if n.nthConfig.DryRun { - log.Log().Str("node_name", nodeName).Msg("Node labels would have been fetched, but dry-run flag was set") + log.Info().Str("node_name", nodeName).Msg("Node labels would have been fetched, but dry-run flag was set") return nil, nil } node, err := n.fetchKubernetesNode(nodeName) @@ -351,7 +351,7 @@ func (n Node) LogPods(podList []string, nodeName string) error { for _, pod := range podList { podNamesArr = podNamesArr.Str(pod) } - log.Log().Array("pod_names", podNamesArr).Str("node_name", nodeName).Msg("Pods on node") + log.Info().Array("pod_names", podNamesArr).Str("node_name", nodeName).Msg("Pods on node") return nil } @@ -431,7 +431,7 @@ func (n Node) UncordonIfRebooted(nodeName string) error { } timeVal, ok := k8sNode.Labels[ActionLabelTimeKey] if !ok { - log.Log().Msgf("There was no %s label found requiring action label handling", ActionLabelTimeKey) + log.Debug().Msgf("There was no %s label found requiring action label handling", ActionLabelTimeKey) return nil } timeValNum, err := strconv.ParseInt(timeVal, 10, 64) @@ -446,7 +446,7 @@ func (n Node) UncordonIfRebooted(nodeName string) error { return err } if secondsSinceLabel < uptime { - log.Log().Msg("The system has not restarted yet.") + log.Debug().Msg("The system has not restarted yet.") return nil } err = n.Uncordon(nodeName) @@ -463,9 +463,9 @@ func (n Node) UncordonIfRebooted(nodeName string) error { return err } - log.Log().Msgf("Successfully completed action %s.", UncordonAfterRebootLabelVal) + log.Info().Msgf("Successfully completed action %s.", UncordonAfterRebootLabelVal) default: - log.Log().Msg("There are no label actions to handle.") + log.Debug().Msg("There are no label actions to handle.") } return nil } @@ -484,7 +484,7 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { listOptions := metav1.ListOptions{LabelSelector: labels.Set(labelSelector.MatchLabels).String()} matchingNodes, err := n.drainHelper.Client.CoreV1().Nodes().List(listOptions) if err != nil || len(matchingNodes.Items) == 0 { - log.Warn().Err(err).Msgf("Error when trying to list Nodes w/ label, falling back to direct Get lookup of node") + log.Err(err).Msgf("Error when trying to list Nodes w/ label, falling back to direct Get lookup of node") return n.drainHelper.Client.CoreV1().Nodes().Get(nodeName, 
metav1.GetOptions{}) } return &matchingNodes.Items[0], nil @@ -532,7 +532,7 @@ func jsonPatchEscape(value string) string { func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effect corev1.TaintEffect) error { if nth.nthConfig.DryRun { - log.Log().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName) + log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName) return nil } @@ -546,11 +546,12 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e // Get the newest version of the node. freshNode, err = client.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{}) if err != nil || freshNode == nil { - log.Log(). + nodeErr := fmt.Errorf("failed to get node %v: %w", node.Name, err) + log.Err(nodeErr). Str("taint_key", taintKey). Str("node_name", node.Name). Msg("Error while adding taint on node") - return fmt.Errorf("failed to get node %v: %v", node.Name, err) + return nodeErr } } @@ -570,13 +571,13 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e } if err != nil { - log.Log(). + log.Err(err). Str("taint_key", taintKey). Str("node_name", node.Name). Msg("Error while adding taint on node") return err } - log.Log(). + log.Warn(). Str("taint_key", taintKey). Str("node_name", node.Name). Msg("Successfully added taint on node") @@ -587,7 +588,7 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e func addTaintToSpec(node *corev1.Node, taintKey string, taintValue string, effect corev1.TaintEffect) bool { for _, taint := range node.Spec.Taints { if taint.Key == taintKey { - log.Log(). + log.Debug(). Str("taint_key", taintKey). Interface("taint", taint). Str("node_name", node.Name). @@ -619,7 +620,7 @@ func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string newTaints := make([]corev1.Taint, 0) for _, taint := range freshNode.Spec.Taints { if taint.Key == taintKey { - log.Log(). + log.Info(). Interface("taint", taint). Str("node_name", node.Name). Msg("Releasing taint on node") @@ -646,13 +647,13 @@ func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string } if err != nil { - log.Log(). + log.Err(err). Str("taint_key", taintKey). Str("node_name", node.Name). Msg("Error while releasing taint on node") return false, err } - log.Log(). + log.Info(). Str("taint_key", taintKey). Str("node_name", node.Name). Msg("Successfully released taint on node") diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index f2fd90a8..210b5e8d 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -42,9 +42,8 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv if nthConfig.WebhookTemplateFile != "" { content, err := ioutil.ReadFile(nthConfig.WebhookTemplateFile) if err != nil { - log.Log(). + log.Err(err). Str("webhook_template_file", nthConfig.WebhookTemplateFile). - Err(err). 
Msg("Webhook Error: Could not read template file") return } @@ -56,7 +55,7 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv webhookTemplate, err := template.New("message").Parse(webhookTemplateContent) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Template parsing failed") + log.Err(err).Msg("Webhook Error: Template parsing failed") return } @@ -70,20 +69,20 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv var byteBuffer bytes.Buffer err = webhookTemplate.Execute(&byteBuffer, combined) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Template execution failed") + log.Err(err).Msg("Webhook Error: Template execution failed") return } request, err := http.NewRequest("POST", nthConfig.WebhookURL, &byteBuffer) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Http NewRequest failed") + log.Err(err).Msg("Webhook Error: Http NewRequest failed") return } headerMap := make(map[string]interface{}) err = json.Unmarshal([]byte(nthConfig.WebhookHeaders), &headerMap) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Header Unmarshal failed") + log.Err(err).Msg("Webhook Error: Header Unmarshal failed") return } for key, value := range headerMap { @@ -108,18 +107,18 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv } response, err := client.Do(request) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Client Do failed") + log.Err(err).Msg("Webhook Error: Client Do failed") return } defer response.Body.Close() if response.StatusCode < 200 || response.StatusCode > 299 { - log.Log().Int("status_code", response.StatusCode).Msg("Webhook Error: Received Non-Successful Status Code") + log.Warn().Int("status_code", response.StatusCode).Msg("Webhook Error: Received Non-Successful Status Code") return } - log.Log().Msg("Webhook Success: Notification Sent!") + log.Info().Msg("Webhook Success: Notification Sent!") } // ValidateWebhookConfig will check if the template provided in nthConfig with parse and execute diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 5c5076f2..b0de42d8 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -46,7 +46,7 @@ func parseScheduledEventTime(inputTime string) time.Time { func getExpectedMessage(event *monitor.InterruptionEvent) string { webhookTemplate, err := template.New("").Parse(testWebhookTemplate) if err != nil { - log.Log().Err(err).Msg("Webhook Error: Template parsing failed") + log.Err(err).Msg("Webhook Error: Template parsing failed") return "" } diff --git a/test/e2e/prometheus-metrics-test b/test/e2e/prometheus-metrics-test index 9e42da83..5070e628 100755 --- a/test/e2e/prometheus-metrics-test +++ b/test/e2e/prometheus-metrics-test @@ -143,45 +143,49 @@ PORT_FORWARD_PID=$! trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR echo "✅ Port-forwarded pod $POD_NAME" -sleep 30 +sleep 10 -METRICS_RESPONSE=$(curl -L localhost:7000/metrics) -echo "✅ Fetched /metrics." +for i in $(seq 1 10); do + METRICS_RESPONSE=$(curl -L localhost:7000/metrics) + echo "✅ Fetched /metrics." -if [[ $METRICS_RESPONSE == *"cordon-and-drain"* ]]; then - echo "✅ Metric cordon-and-drain!" -else - echo "❌ Failed checking metric for cordon-and-drain" - EXIT_STATUS=3 -fi + if [[ $METRICS_RESPONSE == *"cordon-and-drain"* ]]; then + echo "✅ Metric cordon-and-drain!" 
+ else + echo "❌ Failed checking metric for cordon-and-drain" + EXIT_STATUS=3 + fi -if [[ $METRICS_RESPONSE == *"pre-drain"* ]]; then - echo "✅ Metric pre-drain!" -else - echo "❌ Failed checking metric for pre-drain" - EXIT_STATUS=3 -fi + if [[ $METRICS_RESPONSE == *"pre-drain"* ]]; then + echo "✅ Metric pre-drain!" + else + echo "❌ Failed checking metric for pre-drain" + EXIT_STATUS=3 + fi -if [[ $METRICS_RESPONSE == *"runtime_go_gc"* ]]; then - echo "✅ Metric runtime_go_gc!" -else - echo "❌ Failed checking runtime_go_gc metric" - EXIT_STATUS=3 -fi + if [[ $METRICS_RESPONSE == *"runtime_go_gc"* ]]; then + echo "✅ Metric runtime_go_gc!" + else + echo "❌ Failed checking runtime_go_gc metric" + EXIT_STATUS=3 + fi -if [[ $METRICS_RESPONSE == *"runtime_go_goroutines"* ]]; then - echo "✅ Metric runtime_go_goroutines!" -else - echo "❌ Failed checking runtime_go_goroutines metric" - EXIT_STATUS=3 -fi + if [[ $METRICS_RESPONSE == *"runtime_go_goroutines"* ]]; then + echo "✅ Metric runtime_go_goroutines!" + else + echo "❌ Failed checking runtime_go_goroutines metric" + EXIT_STATUS=3 + fi -if [[ $METRICS_RESPONSE == *"runtime_go_mem"* ]]; then - echo "✅ Metric runtime_go_mem!" -else - echo "❌ Failed checking runtime_go_mem metric" - EXIT_STATUS=3 -fi + if [[ $METRICS_RESPONSE == *"runtime_go_mem"* ]]; then + echo "✅ Metric runtime_go_mem!" + else + echo "❌ Failed checking runtime_go_mem metric" + EXIT_STATUS=3 + fi + sleep 10 + [[ $EXIT_STATUS -ne 0 ]] || break +done exit $EXIT_STATUS
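
Reviewer note, not part of the patch: the minimal sketch below illustrates the zerolog call patterns this diff standardizes on, replacing unleveled log.Log() events with leveled log.Debug/Info/Warn/Err ones so they can be filtered by severity. It assumes the real github.com/rs/zerolog package; the node name and messages are examples only.

package main

import (
	"errors"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

func main() {
	// Filter out everything below Info; Debug events are dropped.
	zerolog.SetGlobalLevel(zerolog.InfoLevel)

	// Old pattern: log.Log() writes an event with no level attached.
	log.Log().Msg("unleveled message (pattern the patch removes)")

	// New pattern: pick the level that matches the event's severity.
	log.Debug().Msg("suppressed at InfoLevel")
	log.Info().Str("node_name", "ip-10-0-0-1.example").Msg("Node successfully cordoned")
	log.Warn().Msgf("Request failed. Attempts remaining: %d", 2)

	// log.Err(err) logs at error level when err is non-nil (info when nil),
	// which is why chains like log.Log().Err(err) collapse to log.Err(err).
	err := errors.New("connection refused")
	log.Err(err).Msg("Unable to fetch metadata from IMDS")
}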
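
A second sketch, also illustrative only, of the error-wrapping change in addTaint: the error is built once with %w, logged at error level, and the same wrapped value is returned, so callers can still match the underlying cause. The getNode helper and errNodeNotFound sentinel are hypothetical stand-ins for the Kubernetes client call and its not-found error.

package main

import (
	"errors"
	"fmt"

	"github.com/rs/zerolog/log"
)

// errNodeNotFound is a hypothetical sentinel standing in for the error
// returned by the Kubernetes client when a node does not exist.
var errNodeNotFound = errors.New("node not found")

// getNode is a hypothetical stand-in for client.CoreV1().Nodes().Get(...).
func getNode(name string) error { return errNodeNotFound }

func refreshNode(name string) error {
	if err := getNode(name); err != nil {
		// Wrap with %w so the cause survives, log once at error level, and
		// return the same wrapped error instead of formatting it twice.
		nodeErr := fmt.Errorf("failed to get node %v: %w", name, err)
		log.Err(nodeErr).Str("node_name", name).Msg("Error while adding taint on node")
		return nodeErr
	}
	return nil
}

func main() {
	err := refreshNode("ip-10-0-0-1.example")
	fmt.Println(errors.Is(err, errNodeNotFound)) // true, because %w preserves the chain
}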