Skip to content

Commit

Permalink
Add healthz liveness probe (#2936)
Browse files Browse the repository at this point in the history
* Add healthz liveness probe

Sets up a Ping healthz endpoint (ping)
Adds a liveness probe to ansible operator deployment scaffolding

* why doesn't molecule work

* fix scaffold

* rm dev artifacts

* molecule liveness FAILING

* Add e2e and e2e molecule liveness tests

* check err

* add changelog and comments

* add example to migration instructions

* reformat migration header
  • Loading branch information
asmacdo authored May 8, 2020
1 parent 8a2d7c1 commit 18a7bbb
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 1 deletion.
46 changes: 46 additions & 0 deletions changelog/fragments/1036-ansible-liveness.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# entries is a list of entries to include in
# release notes and/or the migration guide
entries:
- description: >
The Ansible operator now includes a healthz endpoint and liveness probe.
All operators will now have a running healthz endpoint (not publicly
exposed) without changes. Existing operators must manually add a liveness
probe to their operator manifest to take advantage of it.
# kind is one of:
# - addition
# - change
# - deprecation
# - removal
# - bugfix
kind: "addition"
# Is this a breaking change?
breaking: false
# NOTE: ONLY USE `pull_request_override` WHEN ADDING THIS
# FILE FOR A PREVIOUSLY MERGED PULL_REQUEST!
#
# The generator auto-detects the PR number from the commit
# message in which this file was originally added.
#
# What is the pull request number (without the "#")?
# pull_request_override: 0
# Migration can be defined to automatically add a section to
# the migration guide. This is required for breaking changes.
migration:
header: (Optional) Add livenessProbe check for Ansible-based operators
body: >
Existing operators will have a healthz endpoint without intervention,
but to take advantage of it, a liveness probe should be manually added
to the operator manifest. For example, the `deploy/operator.yaml` file would need to include:
```yaml
livenessProbe:
httpGet:
path: /healthz
port: 6789
initialDelaySeconds: 5
periodSeconds: 3
```
19 changes: 19 additions & 0 deletions hack/tests/e2e-ansible.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,25 @@ test_operator() {
exit 1
fi

header_text "Wait for Operator Pod"
if ! timeout 60s bash -c -- "until kubectl get pod -l name=memcached-operator; do sleep 1; done"
then
error_text "FAIL: Operator pod does not exist."
operator_logs
exit 1
fi

header_text "Ensure no liveness probe fail events"
# We can't directly hit the endpoint, which is not publicly exposed. If k8s sees a failing endpoint, it will create a "Killing" event.
live_pod=$(kubectl get pod -l name=memcached-operator -o jsonpath="{..metadata.name}")
if kubectl get events --field-selector involvedObject.name=$live_pod | grep Killing
then
error_text "FAIL: Operator pod killed due to failed liveness probe."
kubectl get events --field-selector involvedObject.name=$live_pod,reason=Killing
operator_logs
exit 1
fi

header_text "Verify that a config map owned by the CR has been created."
if ! timeout 1m bash -c -- "until kubectl get configmap test-blacklist-watches > /dev/null 2>&1; do sleep 1; done";
then
Expand Down
6 changes: 6 additions & 0 deletions internal/scaffold/ansible/deploy_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ spec:
value: "[[.ProjectName]]"
- name: ANSIBLE_GATHERING
value: explicit
livenessProbe:
httpGet:
path: /healthz
port: 6789
initialDelaySeconds: 5
periodSeconds: 3
volumes:
- name: runner
emptyDir: {}
Expand Down
7 changes: 7 additions & 0 deletions internal/scaffold/ansible/molecule_templates_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ spec:
value: "[[.ProjectName]]"
- name: ANSIBLE_GATHERING
value: explicit
livenessProbe:
httpGet:
path: /healthz
port: 6789
initialDelaySeconds: 5
periodSeconds: 3
volumes:
- name: runner
emptyDir: {}
Expand Down
9 changes: 8 additions & 1 deletion pkg/ansible/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/operator-framework/operator-sdk/pkg/leader"
"github.com/operator-framework/operator-sdk/pkg/metrics"
sdkVersion "github.com/operator-framework/operator-sdk/version"
"sigs.k8s.io/controller-runtime/pkg/healthz"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -53,6 +54,7 @@ var (
log = logf.Log.WithName("cmd")
metricsPort int32 = 8383
operatorMetricsPort int32 = 8686
healthProbePort int32 = 6789
)

func printVersion() {
Expand All @@ -75,7 +77,8 @@ func Run(flags *aoflags.AnsibleOperatorFlags) error {
// Set default manager options
// TODO: probably should expose the host & port as environment variables
options := manager.Options{
MetricsBindAddress: fmt.Sprintf("%s:%d", metricsHost, metricsPort),
HealthProbeBindAddress: fmt.Sprintf("%s:%d", metricsHost, healthProbePort),
MetricsBindAddress: fmt.Sprintf("%s:%d", metricsHost, metricsPort),
NewClient: func(cache cache.Cache, config *rest.Config, options client.Options) (client.Client, error) {
c, err := client.New(config, options)
if err != nil {
Expand Down Expand Up @@ -166,6 +169,10 @@ func Run(flags *aoflags.AnsibleOperatorFlags) error {
}

addMetrics(context.TODO(), cfg, gvks)
err = mgr.AddHealthzCheck("ping", healthz.Ping)
if err != nil {
log.Error(err, "Failed to add Healthz check.")
}

done := make(chan error)

Expand Down
7 changes: 7 additions & 0 deletions test/ansible/deploy/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ spec:
value: explicit
- name: ANSIBLE_INVENTORY
value: /opt/ansible/inventory
livenessProbe:
httpGet:
path: /healthz
port: 6789
initialDelaySeconds: 5
periodSeconds: 3

volumes:
- name: runner
emptyDir: {}
16 changes: 16 additions & 0 deletions test/ansible/molecule/cluster/tasks/liveness_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Liveness check for the operator pod: the final assertion fails if any
# "Killing" event has been recorded against the pod.

# Look up the Pod labelled `name=ansible` and store it as the `op_pod` fact.
# NOTE(review): later use of `op_pod.metadata.name` assumes the lookup yields a
# single Pod object rather than a list — confirm only one pod matches the label.
- name: get the operator pod
set_fact:
op_pod: "{{ lookup('k8s', kind='Pod', label_selector='name=ansible') }}"

# Query Event objects whose involvedObject is the operator pod and whose
# reason is "Killing"; the result is registered as `liveness_failures`.
- name: Check for liveness probe failure events
# We can't directly hit the endpoint, which is not publicly exposed. If k8s sees a failing endpoint, it will create a "Killing" event.
k8s_info:
kind: Event
field_selectors:
- "involvedObject.name={{ op_pod.metadata.name }}"
- "reason=Killing"
register: liveness_failures

# Pass only when the event query above returned no resources.
- name: Assert that the Pod has not been liveness probe Killed
assert:
that: liveness_failures.resources|length == 0
6 changes: 6 additions & 0 deletions test/ansible/molecule/templates/operator.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ spec:
value: /opt/ansible/inventory
- name: ANSIBLE_DEBUG_LOGS
value: "TRUE"
livenessProbe:
httpGet:
path: /healthz
port: 6789
initialDelaySeconds: 5
periodSeconds: 3
volumes:
- name: runner
emptyDir: {}

0 comments on commit 18a7bbb

Please sign in to comment.