From d4d57a8432d5aa4623bb628bfad3c9db990645c8 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 3 Aug 2023 13:20:32 -0500 Subject: [PATCH 1/8] Changed drainHelper output stream to Stdout --- pkg/node/node.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index ffd04bb5..04874fa7 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -17,6 +17,7 @@ import ( "context" "encoding/json" "fmt" + "os" "strconv" "strings" "time" @@ -644,7 +645,7 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { AdditionalFilters: []drain.PodFilter{filterPodForDeletion(nthConfig.PodName, nthConfig.PodNamespace)}, DeleteEmptyDirData: nthConfig.DeleteLocalData, Timeout: time.Duration(nthConfig.NodeTerminationGracePeriod) * time.Second, - Out: log.Logger, + Out: zerolog.New(os.Stdout).With().Timestamp().Logger(), ErrOut: log.Logger, } From 71a0432a1539d53c2824a83908397f577008ce9c Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 9 Aug 2023 17:24:09 -0500 Subject: [PATCH 2/8] Updated README.md to include Replica Usage information --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index de3b82fc..4c81e3eb 100644 --- a/README.md +++ b/README.md @@ -447,6 +447,12 @@ helm upgrade --install aws-node-termination-handler \ For a full list of configuration options see our [Helm readme](https://github.com/aws/aws-node-termination-handler/blob/v1.20.0/config/helm/aws-node-termination-handler#readme). +#### Replica Usage +- Using a **single** replica can delay message processing as only one message is processed at a time. +- Using **multiple** replicas allows for the simultaneous processing of messages. + - Replicas may attempt to process the same message. This is avoided by the `visibilityTimeout`, however, draining a Node may take longer than the set visibilityTimeout, default of 20s. Increasing the visibilityTimeout to coincide with the maximum drainage time can mitigate this issue. + - `visibilityTimeout` can be increased up to 12 hours. An increase can lead to other replicas waiting longer to process the message upon failure. This can result in the Node not being cordoned or drained. + #### Kubectl Apply Queue Processor needs an **SQS queue URL** to function; therefore, manifest changes are **REQUIRED** before using kubectl to directly add all of the above resources into your cluster. From 8bc14f227d14adc113f4f6f3765809b23304a665 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 10 Aug 2023 10:07:41 -0500 Subject: [PATCH 3/8] Undo change carried over from evictLogs branch --- pkg/node/node.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index 04874fa7..ffd04bb5 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -17,7 +17,6 @@ import ( "context" "encoding/json" "fmt" - "os" "strconv" "strings" "time" @@ -645,7 +644,7 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) { AdditionalFilters: []drain.PodFilter{filterPodForDeletion(nthConfig.PodName, nthConfig.PodNamespace)}, DeleteEmptyDirData: nthConfig.DeleteLocalData, Timeout: time.Duration(nthConfig.NodeTerminationGracePeriod) * time.Second, - Out: zerolog.New(os.Stdout).With().Timestamp().Logger(), + Out: log.Logger, ErrOut: log.Logger, } From fd5dde85b65d30fc1025760997b51dbae10965cd Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 10 Aug 2023 15:56:05 -0500 Subject: [PATCH 4/8] Updated replica section. 
Changed title to Single vs Multiple Replicas, and provided example scenarios for use cases. --- README.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4c81e3eb..7e6cb5cb 100644 --- a/README.md +++ b/README.md @@ -447,11 +447,22 @@ helm upgrade --install aws-node-termination-handler \ For a full list of configuration options see our [Helm readme](https://github.com/aws/aws-node-termination-handler/blob/v1.20.0/config/helm/aws-node-termination-handler#readme). -#### Replica Usage -- Using a **single** replica can delay message processing as only one message is processed at a time. -- Using **multiple** replicas allows for the simultaneous processing of messages. - - Replicas may attempt to process the same message. This is avoided by the `visibilityTimeout`, however, draining a Node may take longer than the set visibilityTimeout, default of 20s. Increasing the visibilityTimeout to coincide with the maximum drainage time can mitigate this issue. - - `visibilityTimeout` can be increased up to 12 hours. An increase can lead to other replicas waiting longer to process the message upon failure. This can result in the Node not being cordoned or drained. +#### Single vs Multiple Replicas +**Single** replica usage, by default, can process up to 10 messages at a time. Provides less throughput than **multiple** replica usage, but uses less resources. + +Example use cases for using a single replica: +- Working with a relitively small (not large) cluster. +- Node drainage that doesn't take significant time. +- Limitted resources (memory or monetary) for allocation of NTH Pods on a cluster. + +**Multiple** replica usage allows for increased throughput and increased availability. + +Example use cases for using multiple replicas: +- Working with a large scale cluster that can make use of the larger throughput from more NTH instances. +- Node drainage that takes a significant time. Additional replicas increase throughput allowing for other messages to be processed. +- A Node running an NTH instance is terminated, leaving downtime until K8s deploys another Pod to take its place. Replicas provide mitigation with increased availabilty, allowing for NTH to still be operational. + +With **multiple** replicas, drainage times longer than the set visibility timeout of 20s may result in multiple NTH instances processing the same message. This will still result in a successful cordon/drainage, but will waste resources. #### Kubectl Apply From 92126bc42bb6e1ad372699c690d9294958603563 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Thu, 10 Aug 2023 16:31:51 -0500 Subject: [PATCH 5/8] Fixed spelling errors --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7e6cb5cb..f40dc536 100644 --- a/README.md +++ b/README.md @@ -451,16 +451,16 @@ For a full list of configuration options see our [Helm readme](https://github.co **Single** replica usage, by default, can process up to 10 messages at a time. Provides less throughput than **multiple** replica usage, but uses less resources. Example use cases for using a single replica: -- Working with a relitively small (not large) cluster. +- Working with a relatively small (not large) cluster. - Node drainage that doesn't take significant time. -- Limitted resources (memory or monetary) for allocation of NTH Pods on a cluster. +- Limited resources (memory or monetary) for allocation of NTH Pods on a cluster. 
**Multiple** replica usage allows for increased throughput and increased availability. Example use cases for using multiple replicas: - Working with a large scale cluster that can make use of the larger throughput from more NTH instances. -- Node drainage that takes a significant time. Additional replicas increase throughput allowing for other messages to be processed. -- A Node running an NTH instance is terminated, leaving downtime until K8s deploys another Pod to take its place. Replicas provide mitigation with increased availabilty, allowing for NTH to still be operational. +- Node drainage that takes a significant time. Additional replicas increase throughput, allowing for other messages to be processed. +- A Node running an NTH instance is terminated, leaving downtime until K8s deploys another Pod to take its place. Replicas provide mitigation with increased availability, allowing for NTH to still be operational. With **multiple** replicas, drainage times longer than the set visibility timeout of 20s may result in multiple NTH instances processing the same message. This will still result in a successful cordon/drainage, but will waste resources. From 84cefc673b6da69f50c33d5cab3252d6fc206d92 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 15 Aug 2023 15:55:54 -0500 Subject: [PATCH 6/8] Revised the Replica documentation --- README.md | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f40dc536..91491a22 100644 --- a/README.md +++ b/README.md @@ -447,22 +447,21 @@ helm upgrade --install aws-node-termination-handler \ For a full list of configuration options see our [Helm readme](https://github.com/aws/aws-node-termination-handler/blob/v1.20.0/config/helm/aws-node-termination-handler#readme). -#### Single vs Multiple Replicas -**Single** replica usage, by default, can process up to 10 messages at a time. Provides less throughput than **multiple** replica usage, but uses less resources. +#### Single Instance vs Multiple Replicas -Example use cases for using a single replica: -- Working with a relatively small (not large) cluster. -- Node drainage that doesn't take significant time. -- Limited resources (memory or monetary) for allocation of NTH Pods on a cluster. +The Helm chart, by default, will deploy a single instance of Amazon Node Termination Handler. With minimal usage of resources this still provides good responsiveness in processing SQS messages. -**Multiple** replica usage allows for increased throughput and increased availability. +**When should multiple instances of Amazon Node Termination Handler be used?** -Example use cases for using multiple replicas: -- Working with a large scale cluster that can make use of the larger throughput from more NTH instances. -- Node drainage that takes a significant time. Additional replicas increase throughput, allowing for other messages to be processed. -- A Node running an NTH instance is terminated, leaving downtime until K8s deploys another Pod to take its place. Replicas provide mitigation with increased availability, allowing for NTH to still be operational. +* Responsiveness: The deployment of multiple Amazon Node Termination Handler instances will increase the throughput of processing SQS messages. This can aide in Amazon Node Termination Handler not responding to events as quickly as needed -- potentially because of a large number of concurrent events or drained Pods taking a long time to terminate. 
-With **multiple** replicas, drainage times longer than the set visibility timeout of 20s may result in multiple NTH instances processing the same message. This will still result in a successful cordon/drainage, but will waste resources. +* Availability: The deployment of multiple Amazon Node Termination Handler instances will provide mitigation in the case that Amazon Node Termination Handler itself is drained. The replica Amazon Node Termination Handlers will process SQS messages, avoiding a delay until the Deployment can start another instance. + +**Notes** + +* Running multiple instances of Amazon Node Termination Handler will not load balance responding to events. Each instance will greedily consume and respond to events. +* Logs from multiple instances of Amazon Node Termination Handler are not aggregated. +* Multiple instances of Amazon Node Termination Handler will respond to the same event, if the event takes longer than 20s to process. This will not result in any errors, with the first responce having any affect. #### Kubectl Apply From 8a68db65b1264229d81045d85bdc89c7470b0ad5 Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Tue, 15 Aug 2023 16:26:07 -0500 Subject: [PATCH 7/8] Fixed spelling errors --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 91491a22..09020b01 100644 --- a/README.md +++ b/README.md @@ -449,11 +449,11 @@ For a full list of configuration options see our [Helm readme](https://github.co #### Single Instance vs Multiple Replicas -The Helm chart, by default, will deploy a single instance of Amazon Node Termination Handler. With minimal usage of resources this still provides good responsiveness in processing SQS messages. +The Helm chart, by default, will deploy a single instance of Amazon Node Termination Handler. With minimal usage of resources, this still provides good responsiveness in processing SQS messages. **When should multiple instances of Amazon Node Termination Handler be used?** -* Responsiveness: The deployment of multiple Amazon Node Termination Handler instances will increase the throughput of processing SQS messages. This can aide in Amazon Node Termination Handler not responding to events as quickly as needed -- potentially because of a large number of concurrent events or drained Pods taking a long time to terminate. +* Responsiveness: The deployment of multiple Amazon Node Termination Handler instances will increase the throughput of processing SQS messages. This can aid in Amazon Node Termination Handler not responding to events as quickly as needed -- potentially because of numerous concurrent events or drained Pods taking a long time to terminate. * Availability: The deployment of multiple Amazon Node Termination Handler instances will provide mitigation in the case that Amazon Node Termination Handler itself is drained. The replica Amazon Node Termination Handlers will process SQS messages, avoiding a delay until the Deployment can start another instance. @@ -461,7 +461,7 @@ The Helm chart, by default, will deploy a single instance of Amazon Node Termina * Running multiple instances of Amazon Node Termination Handler will not load balance responding to events. Each instance will greedily consume and respond to events. * Logs from multiple instances of Amazon Node Termination Handler are not aggregated. -* Multiple instances of Amazon Node Termination Handler will respond to the same event, if the event takes longer than 20s to process. 
This will not result in any errors, with the first responce having any affect. +* Multiple instances of Amazon Node Termination Handler will respond to the same event, if the event takes longer than 20s to process. This will not result in any errors, with the first response having any effect. #### Kubectl Apply From 3a01c8cc54d11acb61bcfa87137397bbd94968cf Mon Sep 17 00:00:00 2001 From: Gavin Burris Date: Wed, 16 Aug 2023 12:28:13 -0500 Subject: [PATCH 8/8] Made documentation more customer centric --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 09020b01..95edd001 100644 --- a/README.md +++ b/README.md @@ -449,19 +449,19 @@ For a full list of configuration options see our [Helm readme](https://github.co #### Single Instance vs Multiple Replicas -The Helm chart, by default, will deploy a single instance of Amazon Node Termination Handler. With minimal usage of resources, this still provides good responsiveness in processing SQS messages. +The Helm chart, by default, will deploy a single instance of Amazon Node Termination Handler. While minimizing resource usage, a single instance still provides good responsiveness in processing SQS messages. **When should multiple instances of Amazon Node Termination Handler be used?** -* Responsiveness: The deployment of multiple Amazon Node Termination Handler instances will increase the throughput of processing SQS messages. This can aid in Amazon Node Termination Handler not responding to events as quickly as needed -- potentially because of numerous concurrent events or drained Pods taking a long time to terminate. +* Responsiveness: Amazon Node Termination Handler may take longer than desired to process events, for example because of numerous concurrent events or Pods that take a long time to drain. The deployment of multiple Amazon Node Termination Handler instances may help. -* Availability: The deployment of multiple Amazon Node Termination Handler instances will provide mitigation in the case that Amazon Node Termination Handler itself is drained. The replica Amazon Node Termination Handlers will process SQS messages, avoiding a delay until the Deployment can start another instance. +* Availability: The deployment of multiple Amazon Node Termination Handler instances provides mitigation in the case that Amazon Node Termination Handler itself is drained. Replica Amazon Node Termination Handlers will process SQS messages, avoiding a delay until the Deployment can start another instance. **Notes** * Running multiple instances of Amazon Node Termination Handler will not load balance responding to events. Each instance will greedily consume and respond to events. * Logs from multiple instances of Amazon Node Termination Handler are not aggregated. -* Multiple instances of Amazon Node Termination Handler will respond to the same event, if the event takes longer than 20s to process. This will not result in any errors, with the first response having any effect. +* Multiple instances of Amazon Node Termination Handler may respond to the same event if it takes longer than 20s to process. This is not an error case; only the first response will have an effect. #### Kubectl Apply
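For operators deciding between a single instance and multiple replicas, the scaling knob lives in the Helm chart. Below is a minimal sketch of a multi-replica Queue Processor deployment; the value names (`replicas`, `enableSqsTerminationDraining`, `queueURL`, `awsRegion`) and the `eks` repo alias are assumptions based on the chart's documented options, so verify them against the Helm readme linked in the section above and substitute your own queue URL and region.

```sh
# Sketch: run two NTH replicas in Queue Processor mode.
# Value names are assumed; check the Helm readme for the authoritative parameter list.
helm repo add eks https://aws.github.io/eks-charts
helm upgrade --install aws-node-termination-handler \
  --namespace kube-system \
  --set enableSqsTerminationDraining=true \
  --set queueURL=<your-sqs-queue-url> \
  --set awsRegion=<your-region> \
  --set replicas=2 \
  eks/aws-node-termination-handler
```

If drains routinely run past the SQS visibility timeout (the 20s referenced above), raising the queue's visibility timeout to roughly the maximum expected drain time reduces the chance of a replica picking up a message that another instance is already handling.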