fix undefined log levels #402

Merged
merged 4 commits on Apr 12, 2021
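For context: the "undefined log levels" in the title are calls to zerolog's log.Log(), which emit an event with no level field at all, so the resulting entries cannot be filtered by severity. This PR replaces them with leveled constructors (Info, Warn, Debug, Err). A minimal sketch, not part of this PR, showing the difference, assuming the zerolog package the handler already uses:

```go
package main

import (
	"errors"

	"github.com/rs/zerolog/log"
)

func main() {
	// log.Log() writes an entry with no "level" field; this is what the PR removes.
	log.Log().Msg("Started watching for interruption events")

	// Leveled constructors attach an explicit severity to the entry.
	log.Info().Msg("Started watching for interruption events") // "level":"info"
	log.Warn().Msg("Request failed, retrying")                 // "level":"warn"
	log.Debug().Msg("Trying to get token from IMDSv2")         // "level":"debug"

	// log.Err(err) is shorthand that logs at error level when err is non-nil
	// (and at info level when err is nil).
	log.Err(errors.New("IMDS unreachable")).Msg("Unable to fetch metadata from IMDS")
}
```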
44 changes: 22 additions & 22 deletions cmd/node-termination-handler.go
@@ -125,7 +125,7 @@ func main() {
wait.PollImmediateUntil(2*time.Second, func() (done bool, err error) {
err = handleRebootUncordon(nthConfig.NodeName, interruptionEventStore, *node)
if err != nil {
log.Log().Err(err).Msgf("Unable to complete the uncordon after reboot workflow on startup, retrying")
log.Err(err).Msgf("Unable to complete the uncordon after reboot workflow on startup, retrying")
}
return false, nil
}, stopCh)
@@ -152,7 +152,7 @@ func main() {
if nthConfig.EnableSQSTerminationDraining {
creds, err := nthConfig.AWSSession.Config.Credentials.Get()
if err != nil {
log.Warn().Err(err).Msg("Unable to get AWS credentials")
log.Err(err).Msg("Unable to get AWS credentials")
}
log.Debug().Msgf("AWS Credentials retrieved from provider: %s", creds.ProviderName)

@@ -171,13 +171,13 @@ func main() {

for _, fn := range monitoringFns {
go func(monitor monitor.Monitor) {
log.Log().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
log.Info().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
var previousErr error
var duplicateErrCount int
for range time.Tick(time.Second * 2) {
err := monitor.Monitor()
if err != nil {
log.Log().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
log.Warn().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
metrics.ErrorEventsInc(monitor.Kind())
if previousErr != nil && err.Error() == previousErr.Error() {
duplicateErrCount++
@@ -186,7 +186,7 @@ func main() {
previousErr = err
}
if duplicateErrCount >= duplicateErrThreshold {
log.Log().Msg("Stopping NTH - Duplicate Error Threshold hit.")
log.Warn().Msg("Stopping NTH - Duplicate Error Threshold hit.")
Contributor:

log.Panic since we're panicking?

Contributor Author:

We could, but IMHO it's clearer when they're separate. I don't think I'd expect a log call to panic if I was reading the code and didn't know about that quirk.

panic(fmt.Sprintf("%v", err))
}
}
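On the log.Panic suggestion in the thread above: zerolog does provide a Panic level that writes the entry and then calls panic with the message, while the PR keeps the log and the panic as two explicit steps. A hedged sketch of the two styles, illustrative only and not the handler's code:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/rs/zerolog/log"
)

// stopExplicit mirrors the approach kept in this PR: a leveled log entry,
// then an explicit panic that is visible at the call site.
func stopExplicit(err error) {
	log.Warn().Msg("Stopping NTH - Duplicate Error Threshold hit.")
	panic(fmt.Sprintf("%v", err))
}

// stopViaLogger is the reviewer's alternative: zerolog's Panic level writes
// the entry at level=panic and then panics with the message itself.
func stopViaLogger(err error) {
	log.Panic().Err(err).Msg("Stopping NTH - Duplicate Error Threshold hit.")
}

func main() {
	defer func() {
		// Recover only so this sketch exits cleanly after demonstrating one path.
		if r := recover(); r != nil {
			log.Info().Msgf("recovered in sketch: %v", r)
		}
	}()
	stopExplicit(errors.New("duplicate error threshold hit"))
	// stopViaLogger(errors.New("duplicate error threshold hit")) // alternative
}
```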
@@ -195,11 +195,11 @@ func main() {
}

go watchForInterruptionEvents(interruptionChan, interruptionEventStore)
log.Log().Msg("Started watching for interruption events")
log.Log().Msg("Kubernetes AWS Node Termination Handler has started successfully!")
log.Info().Msg("Started watching for interruption events")
log.Info().Msg("Kubernetes AWS Node Termination Handler has started successfully!")

go watchForCancellationEvents(cancelChan, interruptionEventStore, node, metrics)
log.Log().Msg("Started watching for event cancellations")
log.Info().Msg("Started watching for event cancellations")

var wg sync.WaitGroup

@@ -222,7 +222,7 @@ func main() {
}
}
}
log.Log().Msg("AWS Node Termination Handler is shutting down")
log.Info().Msg("AWS Node Termination Handler is shutting down")
wg.Wait()
log.Debug().Msg("all event processors finished")
}
@@ -260,17 +260,17 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int
nodeName := interruptionEvent.NodeName
interruptionEventStore.CancelInterruptionEvent(interruptionEvent.EventID)
if interruptionEventStore.ShouldUncordonNode(nodeName) {
log.Log().Msg("Uncordoning the node due to a cancellation event")
log.Info().Msg("Uncordoning the node due to a cancellation event")
err := node.Uncordon(nodeName)
if err != nil {
log.Log().Err(err).Msg("Uncordoning the node failed")
log.Err(err).Msg("Uncordoning the node failed")
}
metrics.NodeActionsInc("uncordon", nodeName, err)

node.RemoveNTHLabels(nodeName)
node.RemoveNTHTaints(nodeName)
} else {
log.Log().Msg("Another interruption event is active, not uncordoning the node")
log.Info().Msg("Another interruption event is active, not uncordoning the node")
}
}
}
@@ -280,13 +280,13 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
nodeName := drainEvent.NodeName
nodeLabels, err := node.GetNodeLabels(nodeName)
if err != nil {
log.Warn().Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName)
log.Err(err).Msgf("Unable to fetch node labels for node '%s' ", nodeName)
}
drainEvent.NodeLabels = nodeLabels
if drainEvent.PreDrainTask != nil {
err := drainEvent.PreDrainTask(*drainEvent, node)
if err != nil {
log.Log().Err(err).Msg("There was a problem executing the pre-drain task")
log.Err(err).Msg("There was a problem executing the pre-drain task")
}
metrics.NodeActionsInc("pre-drain", nodeName, err)
}
@@ -295,35 +295,35 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
err := node.Cordon(nodeName)
if err != nil {
if errors.IsNotFound(err) {
log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName)
log.Err(err).Msgf("node '%s' not found in the cluster", nodeName)
} else {
log.Log().Err(err).Msg("There was a problem while trying to cordon the node")
log.Err(err).Msg("There was a problem while trying to cordon the node")
os.Exit(1)
}
} else {
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned")
log.Info().Str("node_name", nodeName).Msg("Node successfully cordoned")
podNameList, err := node.FetchPodNameList(nodeName)
if err != nil {
log.Log().Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName)
log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName)
}
drainEvent.Pods = podNameList
err = node.LogPods(podNameList, nodeName)
if err != nil {
log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node")
log.Err(err).Msg("There was a problem while trying to log all pod names on the node")
}
metrics.NodeActionsInc("cordon", nodeName, err)
}
} else {
err := node.CordonAndDrain(nodeName)
if err != nil {
if errors.IsNotFound(err) {
log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName)
log.Err(err).Msgf("node '%s' not found in the cluster", nodeName)
} else {
log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node")
log.Err(err).Msg("There was a problem while trying to cordon and drain the node")
os.Exit(1)
}
} else {
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
log.Info().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
}
}
8 changes: 4 additions & 4 deletions pkg/config/config.go
@@ -201,9 +201,9 @@ func ParseCliArgs() (config Config, err error) {
}

if isConfigProvided("pod-termination-grace-period", podTerminationGracePeriodConfigKey) && isConfigProvided("grace-period", gracePeriodConfigKey) {
log.Log().Msg("Deprecated argument \"grace-period\" and the replacement argument \"pod-termination-grace-period\" was provided. Using the newer argument \"pod-termination-grace-period\"")
log.Warn().Msg("Deprecated argument \"grace-period\" and the replacement argument \"pod-termination-grace-period\" was provided. Using the newer argument \"pod-termination-grace-period\"")
} else if isConfigProvided("grace-period", gracePeriodConfigKey) {
log.Log().Msg("Deprecated argument \"grace-period\" was provided. This argument will eventually be removed. Please switch to \"pod-termination-grace-period\" instead.")
log.Warn().Msg("Deprecated argument \"grace-period\" was provided. This argument will eventually be removed. Please switch to \"pod-termination-grace-period\" instead.")
config.PodTerminationGracePeriod = gracePeriod
}

@@ -239,7 +239,7 @@ func (c Config) Print() {
func (c Config) PrintJsonConfigArgs() {
// manually setting fields instead of using log.Log().Interface() to use snake_case instead of PascalCase
// intentionally did not log webhook configuration as there may be secrets
log.Log().
log.Info().
Bool("dry_run", c.DryRun).
Str("node_name", c.NodeName).
Str("metadata_url", c.MetadataURL).
@@ -277,7 +277,7 @@ func (c Config) PrintHumanConfigArgs() {
webhookURLDisplay = "<provided-not-displayed>"
}
// intentionally did not log webhook configuration as there may be secrets
log.Log().Msgf(
log.Info().Msgf(
"aws-node-termination-handler arguments: \n"+
"\tdry-run: %t,\n"+
"\tnode-name: %s,\n"+
15 changes: 7 additions & 8 deletions pkg/ec2metadata/ec2metadata.go
@@ -223,7 +223,7 @@ func (e *Service) Request(contextPath string) (*http.Response, error) {
if err != nil {
e.v2Token = ""
e.tokenTTL = -1
log.Log().Err(err).Msg("Unable to retrieve an IMDSv2 token, continuing with IMDSv1")
log.Warn().Msgf("Unable to retrieve an IMDSv2 token, continuing with IMDSv1, %v", err)
} else {
e.v2Token = token
e.tokenTTL = ttl
@@ -267,7 +267,7 @@ func (e *Service) getV2Token() (string, int, error) {
httpReq := func() (*http.Response, error) {
return e.httpClient.Do(req)
}
log.Log().Msg("Trying to get token from IMDSv2")
log.Debug().Msg("Trying to get token from IMDSv2")
resp, err := retry(1, 2*time.Second, httpReq)
if err != nil {
return "", -1, err
@@ -284,7 +284,7 @@ func (e *Service) getV2Token() (string, int, error) {
if err != nil {
return "", -1, fmt.Errorf("IMDS v2 Token TTL header not sent in response: %w", err)
}
log.Log().Msg("Got token from IMDSv2")
log.Debug().Msg("Got token from IMDSv2")
return string(token), ttl, nil
}

@@ -307,8 +307,7 @@ func retry(attempts int, sleep time.Duration, httpReq func() (*http.Response, er
jitter := time.Duration(rand.Int63n(int64(sleep)))
sleep = sleep + jitter/2

log.Log().Msgf("Request failed. Attempts remaining: %d", attempts)
log.Log().Msgf("Sleep for %s seconds", sleep)
log.Warn().Msgf("Request failed. Attempts remaining: %d, sleeping for %s seconds", attempts, sleep)
time.Sleep(sleep)
return retry(attempts, 2*sleep, httpReq)
}
@@ -322,12 +321,12 @@ func (e *Service) GetNodeMetadata() NodeMetadata {
var metadata NodeMetadata
identityDoc, err := e.GetMetadataInfo(IdentityDocPath)
if err != nil {
log.Log().Err(err).Msg("Unable to fetch metadata from IMDS")
log.Err(err).Msg("Unable to fetch metadata from IMDS")
return metadata
}
err = json.NewDecoder(strings.NewReader(identityDoc)).Decode(&metadata)
if err != nil {
log.Log().Msg("Unable to fetch instance identity document from ec2 metadata")
log.Warn().Msg("Unable to fetch instance identity document from ec2 metadata")
Contributor:

log.Err(err)?

  • Both comments are nits for sure, but I'm thinking that if there's a log inside if err != nil, it should be error level for consistency.

Contributor Author:

I don't think we can make a strict rule like that. We have a fallback for this error, so I think it can stay a Warn.

Contributor:

Not sure I agree that all logging within err != nil should be error level, but we should come up with a consistent understanding of what these levels mean.

Contributor:

Agreed the rule can't be that strict, but if having a fallback doesn't classify as an error, then what about:

ERR Unable to retrieve an IMDSv2 token, continuing with IMDSv1

v1 is the fallback in this case? Are there any formal docs for log levels, similar to semantic versioning? Agreed we need some kind of consistency.

Contributor Author:

Yeah, I changed that one to WARN.

Contributor Author:

I don't think there's anything formal for log levels. I think we probably log way too much in NTH, so maybe we should consider what actually makes sense to log, so that someone can view the logs, be confident things are working, and have enough information to debug certain error scenarios.

metadata.InstanceID, _ = e.GetMetadataInfo(InstanceIDPath)
metadata.InstanceType, _ = e.GetMetadataInfo(InstanceTypePath)
metadata.LocalIP, _ = e.GetMetadataInfo(LocalIPPath)
@@ -340,7 +339,7 @@ func (e *Service) GetNodeMetadata() NodeMetadata {
metadata.PublicIP, _ = e.GetMetadataInfo(PublicIPPath)
metadata.LocalHostname, _ = e.GetMetadataInfo(LocalHostnamePath)

log.Log().Interface("metadata", metadata).Msg("Startup Metadata Retrieved")
log.Info().Interface("metadata", metadata).Msg("Startup Metadata Retrieved")

return metadata
}
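The level convention the thread above lands on: warn when the failure has a working fallback, error when it does not. A small illustrative sketch under that assumption; fetchIMDSv2Token and fetchIMDSv1 are hypothetical placeholders, not NTH functions:

```go
package main

import (
	"errors"

	"github.com/rs/zerolog/log"
)

// Hypothetical placeholders standing in for the IMDS calls discussed above.
func fetchIMDSv2Token() (string, error)       { return "", errors.New("401 from IMDS") }
func fetchIMDSv1(path string) (string, error) { return "i-0123456789abcdef0", nil }

func main() {
	if _, err := fetchIMDSv2Token(); err != nil {
		// A fallback (IMDSv1) exists, so this failure stays at warn level.
		log.Warn().Msgf("Unable to retrieve an IMDSv2 token, continuing with IMDSv1, %v", err)
	}

	instanceID, err := fetchIMDSv1("instance-id")
	if err != nil {
		// No fallback left: log at error level and surface the failure.
		log.Err(err).Msg("Unable to fetch metadata from IMDS")
		return
	}
	log.Info().Str("instance_id", instanceID).Msg("Startup Metadata Retrieved")
}
```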
2 changes: 1 addition & 1 deletion pkg/interruptioneventstore/interruption-event-store.go
@@ -61,7 +61,7 @@ func (s *Store) AddInterruptionEvent(interruptionEvent *monitor.InterruptionEven

s.Lock()
defer s.Unlock()
log.Log().Interface("event", interruptionEvent).Msg("Adding new event to the event store")
log.Info().Interface("event", interruptionEvent).Msg("Adding new event to the event store")
s.interruptionEventStore[interruptionEvent.EventID] = interruptionEvent
if _, ignored := s.ignoredEvents[interruptionEvent.EventID]; !ignored {
s.atLeastOneEvent = true
6 changes: 3 additions & 3 deletions pkg/monitor/scheduledevent/scheduled-event-monitor.go
@@ -96,7 +96,7 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption
notAfter, err = time.Parse(scheduledEventDateFormat, scheduledEvent.NotAfter)
if err != nil {
notAfter = notBefore
log.Log().Err(err).Msg("Unable to parse scheduled event end time, continuing")
log.Err(err).Msg("Unable to parse scheduled event end time, continuing")
}
}
events = append(events, monitor.InterruptionEvent{
@@ -128,7 +128,7 @@ func uncordonAfterRebootPreDrain(interruptionEvent monitor.InterruptionEvent, n
// if the node is already marked as unschedulable, then don't do anything
unschedulable, err := n.IsUnschedulable(nodeName)
if err == nil && unschedulable {
log.Log().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.")
log.Debug().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.")
return nil
} else if err != nil {
return fmt.Errorf("Encountered an error while checking if the node is unschedulable. Not setting an uncordon label: %w", err)
@@ -137,7 +137,7 @@
if err != nil {
return fmt.Errorf("Unable to mark the node for uncordon: %w", err)
}
log.Log().Msg("Successfully applied uncordon after reboot action label to node.")
log.Info().Msg("Successfully applied uncordon after reboot action label to node.")
return nil
}

2 changes: 1 addition & 1 deletion pkg/monitor/sqsevent/asg-lifecycle-event.go
@@ -105,7 +105,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event EventBridgeEvent, me
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID)
if err != nil {
log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID)
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID)
}
return nil
}
2 changes: 1 addition & 1 deletion pkg/monitor/sqsevent/rebalance-recommendation-event.go
@@ -78,7 +78,7 @@ func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event EventBridge
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
err := n.TaintRebalanceRecommendation(interruptionEvent.NodeName, interruptionEvent.EventID)
if err != nil {
log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID)
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID)
}
return nil
}
2 changes: 1 addition & 1 deletion pkg/monitor/sqsevent/spot-itn-event.go
@@ -79,7 +79,7 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event EventBridgeEvent
interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
err := n.TaintSpotItn(interruptionEvent.NodeName, interruptionEvent.EventID)
if err != nil {
log.Warn().Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID)
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID)
}
return nil
}
8 changes: 4 additions & 4 deletions pkg/monitor/sqsevent/sqs-monitor.go
@@ -68,16 +68,16 @@ func (m SQSMonitor) Monitor() error {
switch {
case errors.Is(err, ErrNodeStateNotRunning):
// If the node is no longer running, just log and delete the message. If message deletion fails, count it as an error.
log.Warn().Err(err).Msg("dropping event for an already terminated node")
log.Err(err).Msg("dropping event for an already terminated node")
errs := m.deleteMessages([]*sqs.Message{message})
if len(errs) > 0 {
log.Warn().Err(errs[0]).Msg("error deleting event for already terminated node")
log.Err(errs[0]).Msg("error deleting event for already terminated node")
failedEvents++
}

case err != nil:
// Log errors and record as failed events
log.Warn().Err(err).Msg("ignoring event due to error")
log.Err(err).Msg("ignoring event due to error")
failedEvents++

case err == nil && interruptionEvent != nil && interruptionEvent.Kind == SQSTerminateKind:
@@ -176,7 +176,7 @@ func (m SQSMonitor) deleteMessages(messages []*sqs.Message) []error {
if err != nil {
errs = append(errs, err)
}
log.Log().Msgf("SQS Deleted Message: %s", message)
log.Debug().Msgf("SQS Deleted Message: %s", message)
}
return errs
}