make host networking optional (#270)
* make host networking optional (attribution: Leo Palmer Sunmo @leosunmo)

* update helm readme and add hostnetworking=false test

* generate queue-processor assets

* updated test output
bwagner5 authored Oct 21, 2020
1 parent a20febc commit 2d609a4
Showing 7 changed files with 224 additions and 27 deletions.
56 changes: 35 additions & 21 deletions config/helm/aws-node-termination-handler/README.md
@@ -47,7 +47,9 @@ The command removes all the Kubernetes components associated with the chart and

The following tables list the configurable parameters of the chart and their default values.

### AWS Node Termination Handler Configuration
### AWS Node Termination Handler Common Configuration

The configuration in this table applies to both queue-processor mode and IMDS mode.

Parameter | Description | Default
--- | --- | ---
@@ -64,25 +66,33 @@ Parameter | Description | Default
`webhookTemplate` | Replaces the default webhook message template. | `{"text":"[NTH][Instance Interruption] EventID: {{ .EventID }} - Kind: {{ .Kind }} - Instance: {{ .InstanceID }} - Description: {{ .Description }} - Start Time: {{ .StartTime }}"}`
`webhookTemplateConfigMapName` | Pass the webhook template file as a configmap | None
`webhookTemplateConfigMapKey` | Name of the template file stored in the configmap | None
`enableScheduledEventDraining` | [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event | `false`
`enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true`
`enableSqsTerminationDraining` | If true, drain nodes when an SQS termination event is received | `false`
`queueURL` | Listens for messages on the specified SQS queue URL | None
`awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through AWS_REGION env var, IMDS, or the specified queue URL | ``
`metadataTries` | The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3. | `3`
`cordonOnly` | If true, nodes will be cordoned but not drained when an interruption event occurs. | `false`
`taintNode` | If true, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance`, `aws-node-termination-handler/spot-itn`, and `aws-node-termination-handler/asg-lifecycle-termination` | `false`
`jsonLogging` | If true, use JSON-formatted logs instead of human readable logs. | `false`
`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false`
`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092`
`podMonitor.create` | If `true`, create a PodMonitor | `false`
`podMonitor.interval` | Prometheus scrape interval | `30s`
`podMonitor.sampleLimit` | Number of scraped samples accepted | `5000`
`podMonitor.labels` | Additional PodMonitor metadata labels | `{}`

### Testing Configuration (NOT RECOMMENDED FOR PROD DEPLOYMENTS)

### AWS Node Termination Handler - Queue-Processor Mode Configuration

Parameter | Description | Default
--- | --- | ---
`procUptimeFile` | (Used for testing) Specify the uptime file | `/proc/uptime`
`awsEndpoint` | (Used for testing) If specified, use the AWS endpoint to make API calls | None
`awsSecretAccessKey` | (Used for testing) Pass-thru env var | None
`awsAccessKeyID` | (Used for testing) Pass-thru env var | None
`dryRun` | If true, only log if a node would be drained | `false`
`enableSqsTerminationDraining` | If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received | `false`
`queueURL` | Listens for messages on the specified SQS queue URL | None
`awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through AWS_REGION env var, IMDS, or the specified queue URL | ``
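
As a sketch, enabling queue-processor mode from the repository root (the queue URL and release name are placeholders, not chart defaults):

```sh
# Turn on queue-processor mode: NTH drains nodes on SQS termination events instead of watching IMDS.
helm upgrade --install aws-node-termination-handler \
  --namespace kube-system \
  --set enableSqsTerminationDraining="true" \
  --set queueURL="https://sqs.us-east-1.amazonaws.com/123456789012/nth-queue" \
  ./config/helm/aws-node-termination-handler/
```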

### AWS Node Termination Handler - IMDS Mode Configuration

Parameter | Description | Default
--- | --- | ---
`enableScheduledEventDraining` | [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event | `false`
`enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true`
`useHostNetwork` | If `true`, enables `hostNetwork` for the Linux DaemonSet. NOTE: setting this to `false` may cause issues accessing IMDSv2 if your account is not configured with an IP hop count of 2 | `true`
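
For example, a minimal sketch of IMDS mode without host networking (this assumes IMDSv1 is enabled or the IMDSv2 hop count is already 2; see the metrics endpoint section below):

```sh
# Run the Linux DaemonSet on the pod network rather than the host network.
helm upgrade --install aws-node-termination-handler \
  --namespace kube-system \
  --set useHostNetwork="false" \
  ./config/helm/aws-node-termination-handler/
```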

### Kubernetes Configuration

@@ -118,17 +128,21 @@ Parameter | Description | Default
`nodeSelectorTermsOs` | Operating System Node Selector Key | >=1.14: `kubernetes.io/os`, <1.14: `beta.kubernetes.io/os`
`nodeSelectorTermsArch` | CPU Architecture Node Selector Key | >=1.14: `kubernetes.io/arch`, <1.14: `beta.kubernetes.io/arch`
`targetNodeOs` | Space-separated list of node OSes to target, e.g. "linux", "windows", "linux windows". Note: Windows support is experimental. | `"linux"`
`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false`
`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092`
`podMonitor.create` | If `true`, create a PodMonitor | `false`
`podMonitor.interval` | Prometheus scrape interval | `30s`
`podMonitor.sampleLimit` | Number of scraped samples accepted | `5000`
`podMonitor.labels` | Additional PodMonitor metadata labels | `{}`
`updateStrategy` | Update strategy for all DaemonSets (Linux and Windows) | `type=RollingUpdate,rollingUpdate.maxUnavailable=1`
`linuxUpdateStrategy` | Update strategy for the Linux DaemonSet | `type=RollingUpdate,rollingUpdate.maxUnavailable=1`
`windowsUpdateStrategy` | Update strategy for the Windows DaemonSet | `type=RollingUpdate,rollingUpdate.maxUnavailable=1`

### Testing Configuration (NOT RECOMMENDED FOR PROD DEPLOYMENTS)

Parameter | Description | Default
--- | --- | ---
`procUptimeFile` | (Used for testing) Specify the uptime file | `/proc/uptime`
`awsEndpoint` | (Used for testing) If specified, use the AWS endpoint to make API calls | None
`awsSecretAccessKey` | (Used for testing) Pass-thru env var | None
`awsAccessKeyID` | (Used for testing) Pass-thru env var | None
`dryRun` | If true, only log if a node would be drained | `false`

## Metrics endpoint consideration
If prometheus server is enabled and since NTH is a daemonset with `host_networking=true`, nothing else will be able to bind to `:9092` (or the port configured) in the root network namespace
since it's listening on all interfaces.
Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint.
NTH in IMDS mode runs as a DaemonSet with `host_networking=true` by default. If the prometheus server is enabled, nothing else will be able to bind to the configured port (by default `:9092`) in the root network namespace, since NTH listens on all interfaces. You will therefore need a firewall or security group configured on the nodes to block access to the `/metrics` endpoint.

You can switch NTH in IMDS mode to run with `host_networking=false`, but you will need to make sure that IMDSv1 is enabled or that the IMDSv2 IP hop count is increased to 2. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html
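
As an example of raising the hop count on a single instance with the AWS CLI (the instance ID is a placeholder):

```sh
# Allow IMDSv2 PUT responses to traverse a second network hop (node -> pod network).
aws ec2 modify-instance-metadata-options \
    --instance-id i-0123456789abcdef0 \
    --http-put-response-hop-limit 2 \
    --http-endpoint enabled
```
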
@@ -74,7 +74,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }}
hostNetwork: true
hostNetwork: {{ .Values.useHostNetwork }}
dnsPolicy: {{ .Values.dnsPolicy | default "ClusterFirstWithHostNet" | quote }}
containers:
- name: {{ include "aws-node-termination-handler.name" . }}
@@ -10,7 +10,7 @@ metadata:
spec:
privileged: false
hostIPC: false
hostNetwork: true
hostNetwork: {{ .Values.useHostNetwork }}
hostPID: false
readOnlyRootFilesystem: false
allowPrivilegeEscalation: false
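
To sanity-check how the two templates above render, a quick sketch (run from the repository root):

```sh
# Both the DaemonSet and the PodSecurityPolicy should now emit hostNetwork: false.
helm template ./config/helm/aws-node-termination-handler/ \
    --set useHostNetwork="false" | grep 'hostNetwork:'
```
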
7 changes: 6 additions & 1 deletion config/helm/aws-node-termination-handler/values.yaml
@@ -33,7 +33,7 @@ resources:
memory: "128Mi"
cpu: "100m"

# enableSqsTerminationDraining If true, drain nodes when an SQS termination event is received
# enableSqsTerminationDraining If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received
enableSqsTerminationDraining: false

# queueURL Listens for messages on the specified SQS queue URL
@@ -174,3 +174,8 @@ updateStrategy:
maxUnavailable: 1
linuxUpdateStrategy: ""
windowsUpdateStrategy: ""

# Determines if NTH uses host networking for Linux when running the DaemonSet (only IMDS mode; queue-processor never runs with host networking)
# If you have disabled IMDSv1 and are relying on IMDSv2, you'll need to increase the IP hop count to 2 before switching this to false
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html
useHostNetwork: true
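
If you keep overrides in a values file rather than passing `--set`, an equivalent sketch (`my-values.yaml` is a hypothetical file name):

```sh
# Hypothetical values file: disable host networking for the Linux DaemonSet.
cat > my-values.yaml <<'EOF'
# Requires IMDSv1 enabled or an IMDSv2 hop limit of 2 on the worker nodes.
useHostNetwork: false
EOF
helm upgrade --install aws-node-termination-handler \
  --namespace kube-system \
  -f my-values.yaml \
  ./config/helm/aws-node-termination-handler/
```
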
38 changes: 36 additions & 2 deletions scripts/generate-k8s-yaml
@@ -10,11 +10,17 @@ NAMESPACE="kube-system"
MAKEFILEPATH=$SCRIPTPATH/../Makefile
VERSION=$(make -s -f $MAKEFILEPATH version)
BUILD_DIR=$SCRIPTPATH/../build/k8s-resources/$VERSION

INDV_RESOURCES_DIR=$BUILD_DIR/individual-resources
TAR_RESOURCES_FILE=$BUILD_DIR/individual-resources.tar
AGG_RESOURCES_YAML=$BUILD_DIR/all-resources.yaml
mkdir -p $INDV_RESOURCES_DIR

QP_INDV_RESOURCES_DIR=$BUILD_DIR/individual-resources-queue-processor
QP_TAR_RESOURCES_FILE=$BUILD_DIR/individual-resources-queue-processor.tar
QP_AGG_RESOURCES_YAML=$BUILD_DIR/all-resources-queue-processor.yaml
mkdir -p $QP_INDV_RESOURCES_DIR

USAGE=$(cat << 'EOM'
Usage: generate-k8s-yaml [-n <K8s_NAMESPACE>]
Generates the kubernetes yaml resource files from the helm chart
@@ -46,30 +52,58 @@ mv $BUILD_DIR/$PLATFORM-amd64/helm $BUILD_DIR/.
rm -rf $BUILD_DIR/$PLATFORM-amd64
chmod +x $BUILD_DIR/helm

## IMDS Mode
$BUILD_DIR/helm template aws-node-termination-handler \
--namespace $NAMESPACE \
--set targetNodeOs="linux windows" \
$SCRIPTPATH/../config/helm/aws-node-termination-handler/ > $AGG_RESOURCES_YAML

# remove helm annotations from template
## Queue Processor Mode
$BUILD_DIR/helm template aws-node-termination-handler \
--namespace $NAMESPACE \
--set enableSqsTerminationDraining="true" \
$SCRIPTPATH/../config/helm/aws-node-termination-handler/ > $QP_AGG_RESOURCES_YAML

# IMDS mode - remove helm annotations from template
cat $AGG_RESOURCES_YAML | grep -v 'helm.sh\|app.kubernetes.io/managed-by: Helm' > $BUILD_DIR/helm_annotations_removed.yaml
mv $BUILD_DIR/helm_annotations_removed.yaml $AGG_RESOURCES_YAML

# Queue Processor Mode - remove helm annotations from template
cat $QP_AGG_RESOURCES_YAML | grep -v 'helm.sh\|app.kubernetes.io/managed-by: Helm' > $BUILD_DIR/helm_annotations_removed.yaml
mv $BUILD_DIR/helm_annotations_removed.yaml $QP_AGG_RESOURCES_YAML

# IMDS Mode
$BUILD_DIR/helm template aws-node-termination-handler \
--namespace $NAMESPACE \
--set targetNodeOs="linux windows" \
--output-dir $INDV_RESOURCES_DIR/ \
$SCRIPTPATH/../config/helm/aws-node-termination-handler/

# remove helm annotations from template
# Queue Processor Mode
$BUILD_DIR/helm template aws-node-termination-handler \
--namespace $NAMESPACE \
--set enableSqsTerminationDraining="true" \
--output-dir $QP_INDV_RESOURCES_DIR/ \
$SCRIPTPATH/../config/helm/aws-node-termination-handler/

# IMDS Mode - remove helm annotations from template
for i in $INDV_RESOURCES_DIR/aws-node-termination-handler/templates/*; do
cat $i | grep -v 'helm.sh\|app.kubernetes.io/managed-by: Helm' > $BUILD_DIR/helm_annotations_removed.yaml
mv $BUILD_DIR/helm_annotations_removed.yaml $i
done

# Queue Processor Mode - remove helm annotations from template
for i in $QP_INDV_RESOURCES_DIR/aws-node-termination-handler/templates/*; do
cat $i | grep -v 'helm.sh\|app.kubernetes.io/managed-by: Helm' > $BUILD_DIR/helm_annotations_removed.yaml
mv $BUILD_DIR/helm_annotations_removed.yaml $i
done

cd $INDV_RESOURCES_DIR/aws-node-termination-handler/ && tar cvf $TAR_RESOURCES_FILE templates/*
cd $QP_INDV_RESOURCES_DIR/aws-node-termination-handler/ && tar cvf $QP_TAR_RESOURCES_FILE templates/*
cd $SCRIPTPATH

echo "Generated aws-node-termination-handler kubernetes yaml resources files in:"
echo " - $AGG_RESOURCES_YAML"
echo " - $TAR_RESOURCES_FILE"
echo " - $QP_AGG_RESOURCES_YAML"
echo " - $QP_TAR_RESOURCES_FILE"
4 changes: 3 additions & 1 deletion scripts/upload-resources-to-github
@@ -10,6 +10,8 @@ BUILD_DIR=$SCRIPTPATH/../build/k8s-resources/$VERSION
BINARY_DIR=$SCRIPTPATH/../build/bin
INDV_K8S_RESOURCES=$BUILD_DIR/individual-resources.tar
AGG_RESOURCES_YAML=$BUILD_DIR/all-resources.yaml
QP_TAR_RESOURCES_FILE=$BUILD_DIR/individual-resources-queue-processor.tar
QP_AGG_RESOURCES_YAML=$BUILD_DIR/all-resources-queue-processor.yaml
BINARIES_ONLY="false"

USAGE=$(cat << 'EOM'
@@ -66,7 +68,7 @@ gather_assets_to_upload() {
resources+=("$binary")
done
if [ $BINARIES_ONLY != "true" ]; then
resources+=("$INDV_K8S_RESOURCES" "$AGG_RESOURCES_YAML")
resources+=("$INDV_K8S_RESOURCES" "$AGG_RESOURCES_YAML" "$QP_TAR_RESOURCES_FILE" "$QP_AGG_RESOURCES_YAML")
fi
echo "${resources[@]}"
}
142 changes: 142 additions & 0 deletions test/e2e/spot-interruption-test-host-networking-off
@@ -0,0 +1,142 @@
#!/bin/bash
set -euo pipefail

# Available env vars:
# $TMP_DIR
# $CLUSTER_NAME
# $KUBECONFIG
# $NODE_TERMINATION_HANDLER_DOCKER_REPO
# $NODE_TERMINATION_HANDLER_DOCKER_TAG
# $WEBHOOK_DOCKER_REPO
# $WEBHOOK_DOCKER_TAG
# $AEMM_URL
# $AEMM_VERSION

function fail_and_exit {
echo "❌ Spot Interruption w/o Host Networking test failed $CLUSTER_NAME"
exit ${1:-1}
}

echo "Starting Spot Interruption w/o Host Networking Test for Node Termination Handler"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"

common_helm_args=()
[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows")
[[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL")

anth_helm_args=(
upgrade
--install
"$CLUSTER_NAME-anth"
"$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
--wait
--force
--namespace kube-system
--set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
--set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
--set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
--set enableScheduledEventDraining="false"
--set enableSpotInterruptionDraining="true"
--set taintNode="true"
--set useHostNetwork="false"
--set tolerations=""
)
[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] &&
anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
anth_helm_args+=("${common_helm_args[@]}")

set -x
helm "${anth_helm_args[@]}"
set +x

emtp_helm_args=(
upgrade
--install
"$CLUSTER_NAME-emtp"
"$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
--wait
--force
--namespace default
--set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
--set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
)
[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] &&
emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
emtp_helm_args+=("${common_helm_args[@]}")

set -x
helm "${emtp_helm_args[@]}"
set +x

aemm_helm_args=(
upgrade
--install
"$CLUSTER_NAME-aemm"
"$AEMM_DL_URL"
--wait
--namespace default
--set servicePort="$IMDS_PORT"
--set 'tolerations[0].effect=NoSchedule'
--set 'tolerations[0].operator=Exists'
--set arguments='{spot}'
)
[[ ${#common_helm_args[@]} -gt 0 ]] &&
aemm_helm_args+=("${common_helm_args[@]}")

set -x
retry 5 helm "${aemm_helm_args[@]}"
set +x

TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

deployed=0
for i in `seq 1 $TAINT_CHECK_CYCLES`; do
if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
echo "✅ Verified regular-pod-test pod was scheduled and started!"
deployed=1
break
fi
echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
sleep $TAINT_CHECK_SLEEP
done

if [[ $deployed -eq 0 ]]; then
echo "❌ regular-pod-test pod deployment failed"
fail_and_exit 2
fi

cordoned=0
tainted=0
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
for i in `seq 1 $TAINT_CHECK_CYCLES`; do
if [[ $cordoned -eq 0 ]] && kubectl get nodes $test_node | grep SchedulingDisabled >/dev/null; then
echo "✅ Verified the worker node was cordoned!"
cordoned=1
fi

if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes $test_node -o json | grep -q "aws-node-termination-handler/spot-itn"; then
echo "✅ Verified the worker node was tainted!"
tainted=1
fi

if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
echo "✅ Verified the regular-pod-test pod was evicted!"
echo "✅ Spot Interruption w/o Host Networking Test Passed $CLUSTER_NAME! ✅"
exit 0
fi
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
sleep $TAINT_CHECK_SLEEP
done

if [[ $cordoned -eq 0 ]]; then
echo "❌ Worker node was not cordoned"
elif [[ $tainted -eq 0 ]]; then
echo "❌ Worker node was not tainted"
else
echo "❌ regular-pod-test pod was not evicted"
fi
fail_and_exit 1
