diff --git a/changelog/fragments/1036-ansible-liveness.yaml b/changelog/fragments/1036-ansible-liveness.yaml new file mode 100644 index 00000000000..3e082202c54 --- /dev/null +++ b/changelog/fragments/1036-ansible-liveness.yaml @@ -0,0 +1,46 @@ +# entries is a list of entries to include in +# release notes and/or the migration guide +entries: + - description: > + The Ansible operator now includes a healthz endpoint and liveness probe. + All operators will now have a running healthz endpoint (not publicly + exposed) without changes. Existing operators must add a liveness probe to their manifest to take advantage of it. + + # kind is one of: + # - addition + # - change + # - deprecation + # - removal + # - bugfix + kind: "addition" + + # Is this a breaking change? + breaking: false + + # NOTE: ONLY USE `pull_request_override` WHEN ADDING THIS + # FILE FOR A PREVIOUSLY MERGED PULL_REQUEST! + # + # The generator auto-detects the PR number from the commit + # message in which this file was originally added. + # + # What is the pull request number (without the "#")? + # pull_request_override: 0 + + + # Migration can be defined to automatically add a section to + # the migration guide. This is required for breaking changes. + migration: + header: (Optional) Add livenessProbe check for Ansible-based operators + body: > + Existing operators will have a healthz endpoint without intervention, + but to take advantage of it, a liveness probe should be manually added + to the operator manifest. For example, the `deploy/operator.yaml` file would need to include: + + ```yaml + livenessProbe: + httpGet: + path: /healthz + port: 6789 + initialDelaySeconds: 5 + periodSeconds: 3 + ``` diff --git a/hack/tests/e2e-ansible.sh b/hack/tests/e2e-ansible.sh index c59a516b0c6..cb0b2ccdc84 100755 --- a/hack/tests/e2e-ansible.sh +++ b/hack/tests/e2e-ansible.sh @@ -98,6 +98,25 @@ test_operator() { exit 1 fi + header_text "Wait for Operator Pod" + if ! 
timeout 60s bash -c -- "until kubectl get pod -l name=memcached-operator -o name | grep -q .; do sleep 1; done" + then + error_text "FAIL: Operator pod does not exist." + operator_logs + exit 1 + fi + + header_text "Ensure no liveness probe fail events" + # We can't directly hit the endpoint, which is not publicly exposed. If k8s sees a failing endpoint, it will create a "Killing" event. + live_pod=$(kubectl get pod -l name=memcached-operator -o jsonpath="{..metadata.name}") + if kubectl get events --field-selector "involvedObject.name=$live_pod" | grep Killing + then + error_text "FAIL: Operator pod killed due to failed liveness probe." + kubectl get events --field-selector "involvedObject.name=$live_pod,reason=Killing" + operator_logs + exit 1 + fi + + header_text "Verify that a config map owned by the CR has been created." if ! timeout 1m bash -c -- "until kubectl get configmap test-blacklist-watches > /dev/null 2>&1; do sleep 1; done"; then diff --git a/internal/scaffold/ansible/deploy_operator.go b/internal/scaffold/ansible/deploy_operator.go index 0b254ed100f..3cde60554ff 100644 --- a/internal/scaffold/ansible/deploy_operator.go +++ b/internal/scaffold/ansible/deploy_operator.go @@ -75,6 +75,12 @@ spec: value: "[[.ProjectName]]" - name: ANSIBLE_GATHERING value: explicit + livenessProbe: + httpGet: + path: /healthz + port: 6789 + initialDelaySeconds: 5 + periodSeconds: 3 volumes: - name: runner emptyDir: {} diff --git a/internal/scaffold/ansible/molecule_templates_operator.go b/internal/scaffold/ansible/molecule_templates_operator.go index 22fee7abb03..56249dd3e8b 100644 --- a/internal/scaffold/ansible/molecule_templates_operator.go +++ b/internal/scaffold/ansible/molecule_templates_operator.go @@ -74,6 +74,13 @@ spec: value: "[[.ProjectName]]" - name: ANSIBLE_GATHERING value: explicit + livenessProbe: + httpGet: + path: /healthz + port: 6789 + initialDelaySeconds: 5 + periodSeconds: 3 + volumes: - name: runner emptyDir: {} diff --git a/pkg/ansible/run.go b/pkg/ansible/run.go index 
ae1f2f5bb0f..6a6a0a6b736 100644 --- a/pkg/ansible/run.go +++ b/pkg/ansible/run.go @@ -34,6 +34,7 @@ import ( "github.com/operator-framework/operator-sdk/pkg/leader" "github.com/operator-framework/operator-sdk/pkg/metrics" sdkVersion "github.com/operator-framework/operator-sdk/version" + "sigs.k8s.io/controller-runtime/pkg/healthz" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -53,6 +54,7 @@ var ( log = logf.Log.WithName("cmd") metricsPort int32 = 8383 operatorMetricsPort int32 = 8686 + healthProbePort int32 = 6789 ) func printVersion() { @@ -75,7 +77,8 @@ func Run(flags *aoflags.AnsibleOperatorFlags) error { // Set default manager options // TODO: probably should expose the host & port as an environment variables options := manager.Options{ - MetricsBindAddress: fmt.Sprintf("%s:%d", metricsHost, metricsPort), + HealthProbeBindAddress: fmt.Sprintf("%s:%d", metricsHost, healthProbePort), + MetricsBindAddress: fmt.Sprintf("%s:%d", metricsHost, metricsPort), NewClient: func(cache cache.Cache, config *rest.Config, options client.Options) (client.Client, error) { c, err := client.New(config, options) if err != nil { @@ -166,6 +169,10 @@ func Run(flags *aoflags.AnsibleOperatorFlags) error { } addMetrics(context.TODO(), cfg, gvks) + err = mgr.AddHealthzCheck("ping", healthz.Ping) + if err != nil { + log.Error(err, "Failed to add Healthz check.") + } done := make(chan error) diff --git a/test/ansible/deploy/operator.yaml b/test/ansible/deploy/operator.yaml index bb52bc0e04d..df0545a8271 100644 --- a/test/ansible/deploy/operator.yaml +++ b/test/ansible/deploy/operator.yaml @@ -37,6 +37,13 @@ spec: value: explicit - name: ANSIBLE_INVENTORY value: /opt/ansible/inventory + livenessProbe: + httpGet: + path: /healthz + port: 6789 + initialDelaySeconds: 5 + periodSeconds: 3 + volumes: - name: runner emptyDir: {} diff --git a/test/ansible/molecule/cluster/tasks/liveness_test.yml b/test/ansible/molecule/cluster/tasks/liveness_test.yml new file mode 100644 
index 00000000000..9f05a749815 --- /dev/null +++ b/test/ansible/molecule/cluster/tasks/liveness_test.yml @@ -0,0 +1,16 @@ +- name: get the operator pod + set_fact: + op_pod: "{{ lookup('k8s', kind='Pod', label_selector='name=ansible') }}" + +- name: Check for liveness probe failure events + # We can't directly hit the endpoint, which is not publicly exposed. If k8s sees a failing endpoint, it will create a "Killing" event. + k8s_info: + kind: Event + field_selectors: + - "involvedObject.name={{ op_pod.metadata.name }}" + - "reason=Killing" + register: liveness_failures + +- name: Assert that the Pod has not been liveness probe Killed + assert: + that: liveness_failures.resources|length == 0 diff --git a/test/ansible/molecule/templates/operator.yaml.j2 b/test/ansible/molecule/templates/operator.yaml.j2 index 82aa6fca11d..8c72325a112 100644 --- a/test/ansible/molecule/templates/operator.yaml.j2 +++ b/test/ansible/molecule/templates/operator.yaml.j2 @@ -42,6 +42,12 @@ spec: value: /opt/ansible/inventory - name: ANSIBLE_DEBUG_LOGS value: "TRUE" + livenessProbe: + httpGet: + path: /healthz + port: 6789 + initialDelaySeconds: 5 + periodSeconds: 3 volumes: - name: runner emptyDir: {}