From a81272bbf52d3e8a40ed9afc540a537a3fac5ddf Mon Sep 17 00:00:00 2001
From: Alexey Lebedeff <alebedev@mirantis.com>
Date: Wed, 10 Aug 2016 14:41:40 +0300
Subject: [PATCH] Use new rabbitmqctl features for monitoring

To stop wasting network bandwidth during health checks (e.g. list_queues
in 3-node cluster with 10k queues costs on average 12 megabytes of
traffic and 27k TCP packets).

Features are disabled by default to preserve compatibility, but they
SHOULD be enabled when following patches are present in currently used
rabbitmq version:
- https://github.com/rabbitmq/rabbitmq-server/pull/915
- https://github.com/rabbitmq/rabbitmq-server/pull/911
- https://github.com/rabbitmq/rabbitmq-server/pull/883
---
 scripts/rabbitmq-server-ha.ocf | 88 ++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 8 deletions(-)
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index cd07d0c1b0d3..2669737d2f60 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -47,6 +47,8 @@ OCF_RESKEY_use_fqdn_default=false
 OCF_RESKEY_fqdn_prefix_default=""
 OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
 OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
+OCF_RESKEY_rmq_feature_health_check_default=false
+OCF_RESKEY_rmq_feature_local_list_queues_default=false
 
 : ${HA_LOGTAG="lrmd"}
 : ${HA_LOGFACILITY="daemon"}
@@ -68,6 +70,8 @@ OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
 : ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}}
 : ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}}
 : ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}}
+: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}}
+: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}}
 
 #######################################################################
 
@@ -298,6 +302,26 @@ A path to the shell script to setup RabbitMQ policies
 <content type="string" default="${OCF_RESKEY_policy_file_default}" />
 </parameter>
 
+<parameter name="rmq_feature_health_check" unique="0" required="0">
+<longdesc lang="en">
+Since rabbit 3.6.4 list_queues/list_channels-based monitoring should
+be replaced with "node_health_check" command, as it creates no network
+load at all.
+</longdesc>
+<shortdesc lang="en">Use node_health_check for monitoring</shortdesc>
+<content type="string" default="${OCF_RESKEY_rmq_feature_health_check_default}" />
+</parameter>
+
+<parameter name="rmq_feature_local_list_queues" unique="0" required="0">
+<longdesc lang="en">
+For rabbit version that implements --local flag for list_queues, this
+can greatly reduce network overhead in cases when node is
+stopped/demoted.
+</longdesc>
+<shortdesc lang="en">Use --local option for list_queues</shortdesc>
+<content type="string" default="${OCF_RESKEY_rmq_feature_local_list_queues_default}" />
+</parameter>
+
 $EXTENDED_OCF_PARAMS
 
 </parameters>
@@ -1361,7 +1385,9 @@ check_timeouts() {
     local timeouts_attr_name=$2
     local op_name=$3
 
-    if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
+    # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about
+    # timeout.
+    if [ $op_rc -ne 124 -a $op_rc -ne 137 -a $op_rc -ne 75 ]; then
         ocf_run attrd_updater -p --name $timeouts_attr_name --update 0
         return 0
     fi
@@ -1392,12 +1418,20 @@ check_timeouts() {
 }
 
 wait_sync() {
-  wait_time=$1
+    local wait_time=${1:?}
+    local queues
+    local opt_arg=""
 
-  queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues name state"
-  su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \
-      do sleep 2; done\""
-  return $?
+    if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then
+        opt_arg="--local"
+    fi
+
+    queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues $opt_arg name state"
+
+    su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \
+          do sleep 2; done\""
+
+    return $?
 }
 
 get_monitor() {
@@ -1537,7 +1571,45 @@ get_monitor() {
         return $rc
     fi
 
-    # Check if the rabbitmqctl control plane is alive.
+    node_health_check
+    rc=$?
+
+    return $rc
+}
+
+# Check if the rabbitmqctl control plane is alive.
+node_health_check() {
+    if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then
+        node_health_check_local
+    else
+        node_health_check_legacy
+    fi
+}
+
+node_health_check_local() {
+    local LH="${LH} node_health_check_local():"
+    local rc
+    local rc_timeouts
+
+    local timeout=$((TIMEOUT_ARG - 2)) # give node_health_check some time to handle timeout by itself
+    su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
+    rc=$?
+
+    check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check"
+    rc_timeouts=$?
+
+    if [ "$rc_timeouts" -eq 2 ]; then
+        master_score 0
+        return $OCF_ERR_GENERIC
+    elif [ $rc -ne 0 ]; then
+        ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
+        return $OCF_ERR_GENERIC
+    else
+        return $OCF_SUCCESS
+    fi
+}
+
+node_health_check_legacy() {
     local rc_alive
     local timeout_alive
     su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
@@ -1704,7 +1776,7 @@ action_start() {
         return $OCF_SUCCESS
     fi
 
-    local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts"
+    local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts"
     local attr_name_to_reset
     for attr_name_to_reset in $attrs_to_zero; do
         ocf_update_private_attr $attr_name_to_reset 0