From 6d0ca9cca257e61ffe522933a1bf6460c8047d5c Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 11:37:48 +0100 Subject: [PATCH 1/8] Safely handle (and log) anonymous info messages, most likely from the gm process' neighbours --- src/gm.erl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/gm.erl b/src/gm.erl index 176e14537f2d..822d50252b62 100644 --- a/src/gm.erl +++ b/src/gm.erl @@ -757,7 +757,14 @@ handle_info({'DOWN', MRef, process, _Pid, Reason}, catch lost_membership -> {stop, normal, State} - end. + end; +handle_info(Msg, State = #state { self = Self }) -> + %% TODO: For #gm_group{} related info messages, it could be worthwhile to + %% change_view/2, as this might reflect an alteration in the gm group, meaning + %% we now need to update our state. see rabbitmq-server#914. + rabbit_log:info("GM member ~p received unexpected message ~p~n" + "When Server state == ~p", [Self, Msg, State]), + noreply(State). terminate(Reason, #state { module = Module, callback_args = Args }) -> Module:handle_terminate(Args, Reason). From 53f10c98bb16202c09c2567a2a0590dd8e1f8dc9 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 11:43:23 +0100 Subject: [PATCH 2/8] Adds check_membership/2 clause for handling non-existant gm group --- src/gm.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gm.erl b/src/gm.erl index 822d50252b62..1507961df647 100644 --- a/src/gm.erl +++ b/src/gm.erl @@ -1600,7 +1600,9 @@ check_membership(Self, #gm_group{members = M} = Group) -> Group; false -> throw(lost_membership) - end. + end; +check_membership(_Self, {error, not_found}) -> + throw(lost_membership). check_membership(GroupName) -> case dirty_read_group(GroupName) of From a41606031791651ecea255070395b4cbfb075be4 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 12:30:38 +0100 Subject: [PATCH 3/8] Handle unexpected gm group alterations prior to removal of dead pids from queue --- src/rabbit_mirror_queue_coordinator.erl | 9 +++++++++ src/rabbit_mirror_queue_misc.erl | 16 +++++++++++++--- src/rabbit_mirror_queue_slave.erl | 12 +++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl index 221f11f18a01..017d1d1fa202 100644 --- a/src/rabbit_mirror_queue_coordinator.erl +++ b/src/rabbit_mirror_queue_coordinator.erl @@ -355,6 +355,15 @@ handle_cast({gm_deaths, DeadGMPids}, DeadPids), rabbit_mirror_queue_misc:add_mirrors(QueueName, ExtraNodes, async), noreply(State); + {ok, _MPid0, DeadPids, _ExtraNodes} -> + %% see rabbitmq-server#914; + %% Different slave is now master, stop current coordinator normally. + %% Initiating queue is now slave and the least we could do is report + %% deaths which we 'think' we saw. + %% NOTE: Reported deaths here, could be inconsistant. + rabbit_mirror_queue_misc:report_deaths(MPid, false, QueueName, + DeadPids), + {stop, normal, State}; {error, not_found} -> {stop, normal, State} end; diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl index 83350920e6c9..4205fabb83d6 100644 --- a/src/rabbit_mirror_queue_misc.erl +++ b/src/rabbit_mirror_queue_misc.erl @@ -76,7 +76,7 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> rabbit_misc:execute_mnesia_transaction( fun () -> %% Someone else could have deleted the queue before we - %% get here. + %% get here. Or, gm group could've altered. 
see rabbitmq-server#914 case mnesia:read({rabbit_queue, QueueName}) of [] -> {error, not_found}; [Q = #amqqueue { pid = QPid, @@ -90,7 +90,16 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> AlivePids = [Pid || {_GM, Pid} <- AliveGM], Alive = [Pid || Pid <- [QPid | SPids], lists:member(Pid, AlivePids)], - {QPid1, SPids1} = promote_slave(Alive), + {QPid1, SPids1} = case Alive of + [] -> + %% GM altered, & if all pids are + %% perceived as dead, rather do + %% do nothing here, & trust the + %% promoted slave to have updated + %% mnesia during the alteration. + {QPid, SPids}; + _ -> promote_slave(Alive) + end, Extra = case {{QPid, SPids}, {QPid1, SPids1}} of {Same, Same} -> @@ -98,7 +107,8 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> _ when QPid =:= QPid1 orelse QPid1 =:= Self -> %% Either master hasn't changed, so %% we're ok to update mnesia; or we have - %% become the master. + %% become the master. If gm altered, + %% we have no choice but to proceed. Q1 = Q#amqqueue{pid = QPid1, slave_pids = SPids1, gm_pids = AliveGM}, diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl index 6f46cdc69881..2cb445518071 100644 --- a/src/rabbit_mirror_queue_slave.erl +++ b/src/rabbit_mirror_queue_slave.erl @@ -225,9 +225,15 @@ handle_call({gm_deaths, DeadGMPids}, From, _ -> %% master has changed to not us gen_server2:reply(From, ok), - %% assertion, we don't need to add_mirrors/2 in this - %% branch, see last clause in remove_from_queue/2 - [] = ExtraNodes, + %% see rabbitmq-server#914; + %% It's not always guaranteed that we won't have ExtraNodes. + %% If gm alters, master can change to not us with extra nodes, + %% in which case we attempt to add mirrors on those nodes. + case ExtraNodes of + [] -> void; + _ -> rabbit_mirror_queue_misc:add_mirrors( + QName, ExtraNodes, async) + end, %% Since GM is by nature lazy we need to make sure %% there is some traffic when a master dies, to %% make sure all slaves get informed of the From f7d5d6e48c186b1e0e973d7ee63224644d94ae0a Mon Sep 17 00:00:00 2001 From: Bogdan Dobrelya Date: Thu, 18 Aug 2016 09:59:32 +0200 Subject: [PATCH 4/8] Fix bashisms in rabbitmq OCF RA Change "printf %b" to be passing the checkbashisms. 
Signed-off-by: Bogdan Dobrelya --- scripts/rabbitmq-server-ha.ocf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 9e07dc8490e0..8f9cb16aa53e 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -1616,13 +1616,13 @@ get_monitor() { elif [ -n "${queues}" ]; then local q_c - q_c=`printf "%b\n" "${queues}" | wc -l` + q_c=`printf %b "${queues}\n" | wc -l` local mem - mem=`printf "%b\n" "${queues}" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'` + mem=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'` local mes - mes=`printf "%b\n" "${queues}" | awk -v sum=0 '{sum+=$2} END {print sum}'` + mes=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$2} END {print sum}'` local c_u - c_u=`printf "%b\n" "${queues}" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'` + c_u=`printf %b "${queues}\n" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'` local status status=`echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status")` ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}" From 39a5faf4cee7fc480607e51ad75806fbecede12f Mon Sep 17 00:00:00 2001 From: Dmitry Mescheryakov Date: Thu, 18 Aug 2016 15:35:38 +0300 Subject: [PATCH 5/8] [OCF HA] Add ocf_get_private_attr function to RabbitMQ OCF script The function is extracted from check_timeouts to be re-used later in other parts of the script. Also, swtich check_timeouts to use existing ocf_update_private_attr function. --- scripts/rabbitmq-server-ha.ocf | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 8f9cb16aa53e..84baaba8259f 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -1362,25 +1362,18 @@ check_timeouts() { local op_name=$3 if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then - ocf_run attrd_updater -p --name $timeouts_attr_name --update 0 + ocf_update_private_attr $timeouts_attr_name 0 return 0 fi local count - count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null` - if [ $? -ne 0 ]; then - # the attrd_updater exited with error. In that case most probably it printed garbage - # instead of the number we need. So defensively assume that it is zero. - - count=0 - fi - count=`echo "${count}" | awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` + count=$(ocf_get_private_attr $timeouts_attr_name 0) count=$((count+1)) # There is a slight chance that this piece of code will be executed twice simultaneously. # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need # precise calculation here. - ocf_run attrd_updater -p --name $timeouts_attr_name --update $count + ocf_update_private_attr $timeouts_attr_name $count if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." @@ -1634,6 +1627,18 @@ get_monitor() { return $rc } +ocf_get_private_attr() { + local attr_name="${1:?}" + local attr_default_value="${2:?}" + local count + count=$(attrd_updater -p --name "$attr_name" --query) + if [ $? 
-ne 0 ]; then + echo $attr_default_value + else + echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "(null)") print vals[2]; else print def_val }' + fi +} + ocf_update_private_attr() { local attr_name="${1:?}" local attr_value="${2:?}" From 6fc561f14213446f6bfc7ba7e6d946640d5f0d56 Mon Sep 17 00:00:00 2001 From: Diana Corbacho Date: Fri, 19 Aug 2016 10:45:55 +0100 Subject: [PATCH 6/8] Test GM crash when group is deleted while processing a DOWN message --- test/gm_SUITE.erl | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/test/gm_SUITE.erl b/test/gm_SUITE.erl index e4c68a257a91..8b07c9efad32 100644 --- a/test/gm_SUITE.erl +++ b/test/gm_SUITE.erl @@ -39,7 +39,8 @@ all() -> confirmed_broadcast, member_death, receive_in_order, - unexpected_msg + unexpected_msg, + down_in_members_change ]. init_per_suite(Config) -> @@ -123,6 +124,41 @@ unexpected_msg(_Config) -> passed end). +down_in_members_change(_Config) -> + %% Setup + ok = gm:create_tables(), + {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(), + fun rabbit_misc:execute_mnesia_transaction/1), + passed = receive_joined(Pid, [Pid], timeout_joining_gm_group_1), + {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self(), + fun rabbit_misc:execute_mnesia_transaction/1), + passed = receive_joined(Pid2, [Pid, Pid2], timeout_joining_gm_group_2), + passed = receive_birth(Pid, Pid2, timeout_waiting_for_birth_2), + + %% Test. Simulate that the gm group is deleted (forget_group) while + %% processing the 'DOWN' message from the neighbour + process_flag(trap_exit, true), + ok = meck:new(mnesia, [passthrough]), + ok = meck:expect(mnesia, read, fun({gm_group, ?MODULE}) -> + []; + (Key) -> + meck:passthrough([Key]) + end), + gm:leave(Pid2), + Passed = receive + {'EXIT', Pid, normal} -> + passed; + {'EXIT', Pid, _} -> + crashed + after 15000 -> + timeout + end, + %% Cleanup + meck:unload(mnesia), + process_flag(trap_exit, false), + passed = Passed. + + do_broadcast(Fun) -> with_two_members(broadcast_fun(Fun)). From 5a6f61c423b19b33e36f0be0f995a1636b310873 Mon Sep 17 00:00:00 2001 From: Dmitry Mescheryakov Date: Mon, 22 Aug 2016 14:19:21 +0300 Subject: [PATCH 7/8] [OCF HA] Rank master score based on start time Right now we assign 1000 to the oldest nodes and 1 to others. That creates a problem when Master restarts and no node is promoted until that node starts back. In that case the returned node will have score of 1, like all other slaves and Pacemaker will select to promote it again. The node is clean empty and afterwards other slaves join to it, wiping their data as well. As a result, we loose all the messages. The new algorithm actually ranks nodes, not just selects the oldest one. It also maintains the invariant that if node A started later than node B, then node A score must be smaller than that of node B. As a result, freshly started node has no chance of being selected in preference to older node. If several nodes start simultaneously, among them an older node might temporarily receive lower score than a younger one, but that is neglectable. Also remove any action on demote or demote notification - all of these duplicate actions done in stop or stop notification. With these removed, changing master on a running cluster does not affect RabbitMQ cluster in any way - we just declare another node master and that is it. 
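A minimal standalone sketch of the ranking rule described above, for illustration only: the real logic lives in get_monitor in the diff below, which also introduces MIN_MASTER_SCORE=100 and BEST_MASTER_SCORE=1000. rank_master_score here is a hypothetical helper; peers are passed as (start time, current score) pairs, which the actual RA reads from pacemaker node attributes.

    MIN_MASTER_SCORE=100
    BEST_MASTER_SCORE=1000

    rank_master_score() {
        our_start_time=$1; shift
        # A node with no recorded start time only gets the minimum score, so a
        # freshly (re)started node can never outrank an established one.
        if [ "$our_start_time" -eq 0 ]; then
            echo $MIN_MASTER_SCORE
            return
        fi
        new_score=$BEST_MASTER_SCORE
        while [ $# -ge 2 ]; do
            node_start_time=$1
            node_score=$2
            shift 2
            # Every alive peer that started earlier pushes our score at least
            # 10 points below its own, preserving "older node => higher score".
            if [ "$node_start_time" -ne 0 ] && [ "$node_score" -ne 0 ] && \
               [ "$node_start_time" -lt "$our_start_time" ]; then
                candidate=$((node_score - 10))
                [ "$candidate" -lt "$new_score" ] && new_score=$candidate
            fi
        done
        echo $new_score
    }

    # Example: we started at t=300; peers started at t=100 (score 1000) and
    # t=200 (score 990). We end up with 980, strictly below both peers.
    rank_master_score 300  100 1000  200 990
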
It is important for the current change because master score might change after initial cluster start up causing master migration from one node to another. This fix is a prerequsite for fix to Fuel bugs https://bugs.launchpad.net/fuel/+bug/1559136 https://bugs.launchpad.net/mos/+bug/1561894 --- scripts/rabbitmq-server-ha.ocf | 214 ++++++++------------------------- 1 file changed, 48 insertions(+), 166 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 84baaba8259f..6a9e4488531d 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -319,6 +319,11 @@ $EXTENDED_OCF_PARAMS END } + +MIN_MASTER_SCORE=100 +BEST_MASTER_SCORE=1000 + + ####################################################################### # Functions invoked by resource manager actions @@ -571,17 +576,21 @@ my_host() { return $rc } -srv_uptime() { - local stime - stime=$( crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --query 2>/dev/null | awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d' ) - - if [ -z "${stime}" -o "${stime}" = "(null)" ] ; then - echo 0 - else - echo $(( $(now) - ${stime} )) +get_integer_node_attr() { + local value + value=$(crm_attribute -N $1 -l reboot --name "$2" --query 2>/dev/null | awk '{ split($3, vals, "="); if (vals[2] != "(null)") print vals[2] }') + if [ $? -ne 0 -o -z "$value" ] ; then + value=0 fi + echo $value +} - return $OCF_SUCCESS +get_node_start_time() { + get_integer_node_attr $1 'rabbit-start-time' +} + +get_node_master_score() { + get_integer_node_attr $1 'master-p_rabbitmq-server' } # Return either rabbit node name as FQDN or shortname, depends on the OCF_RESKEY_use_fqdn. @@ -1245,7 +1254,7 @@ start_rmq_server_app() { rc=$? if [ $rc -eq $OCF_SUCCESS ] ; then # rabbitmq-server started successfuly as master of cluster - master_score 1 # minimal positive master-score for this node. + master_score $MIN_MASTER_SCORE stop_rmq_server_app rc=$? if [ $rc -ne 0 ] ; then @@ -1269,7 +1278,7 @@ start_rmq_server_app() { if [ $rc -eq $OCF_SUCCESS ]; then ocf_log info "${LH} RMQ-server app Mnesia cleaned successfully." rc=$OCF_SUCCESS - master_score 1 + master_score $MIN_MASTER_SCORE break else ocf_log err "${LH} RMQ-server app can't be stopped during Mnesia cleaning. Beam will be killed." @@ -1400,10 +1409,6 @@ get_monitor() { local rabbit_running local name local node - local nodelist - local max - local our_uptime - local node_uptime local node_start_time ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}" @@ -1484,45 +1489,37 @@ get_monitor() { ocf_log info "${LH} ensuring this slave does not get promoted." 
master_score 0 return $OCF_ERR_GENERIC - elif [ $rc -ne $OCF_RUNNING_MASTER ] ; then - ocf_log info "${LH} preparing to update master score for node" - our_uptime=$(srv_uptime) - nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) - max=1 - for node in $nodelist + fi + + # Recounting our master score + ocf_log info "${LH} preparing to update master score for node" + local our_start_time + local new_score + local node_start_time + local node_score + + our_start_time=$(get_node_start_time $THIS_PCMK_NODE) + + if [ $our_start_time -eq 0 ]; then + new_score=$MIN_MASTER_SCORE + else + new_score=$BEST_MASTER_SCORE + for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) do - node_start_time=`crm_attribute -N $node -l reboot --name 'rabbit-start-time' --query 2>/dev/null | awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` - if [ -z "${node_start_time}" -o "${node_start_time}" = "(null)" ] ; then - node_uptime=0 - else - node_uptime=$(( $(now) - ${node_start_time} )) - fi - ocf_log info "${LH} comparing our uptime (${our_uptime}) with $node (${node_uptime})" - if [ ${our_uptime} -lt ${node_uptime} ] - then - max=1 - break - else - # When uptime is equal, accept the existing master - if any - as the oldest node - is_master $node - status_master=$? - if [ $status_master -eq 0 ] ; then - max=1 - ocf_log info "${LH} Found the oldest master node $node with uptime (${node_uptime})" - break - else - max=0 - fi + node_start_time=$(get_node_start_time $node) + node_score=$(get_node_master_score $node) + + ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)" + if [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -lt $our_start_time ]; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) fi done + fi - - if [ $max -eq 0 ] - then - ocf_log info "${LH} we are the oldest node" - master_score 1000 - fi + if [ "$new_score" -ne "$(get_node_master_score $THIS_PCMK_NODE)" ]; then + master_score $new_score fi + ocf_log info "${LH} our start time is $our_start_time and score is $new_score" # Skip all other checks if rabbit app is not running if [ $rabbit_running -ne $OCF_SUCCESS ]; then @@ -1929,28 +1926,6 @@ action_notify() { echo "$d [notify] ${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation} promote='${OCF_RESKEY_CRM_meta_notify_promote_uname}' demote='${OCF_RESKEY_CRM_meta_notify_demote_uname}' master='${OCF_RESKEY_CRM_meta_notify_master_uname}' slave='${OCF_RESKEY_CRM_meta_notify_slave_uname}' start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log fi - if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'pre' ] ; then - # PRE- anything notify section - case "$OCF_RESKEY_CRM_meta_notify_operation" in - promote) - ocf_log info "${LH} pre-promote begin." - my_host "$OCF_RESKEY_CRM_meta_notify_promote_uname" - rc=$? - if [ $rc -eq $OCF_SUCCESS ] ; then - nodelist=$(get_all_pacemaker_nodes) - for i in $nodelist - do - ocf_log info "${LH} Deleting master attribute for node ${i}" - ocf_run crm_attribute -N $i -l reboot --name 'rabbit-master' --delete - done - ocf_log info "${LH} pre-promote end." 
- fi - ;; - *) - ;; - esac - fi - if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'post' ] ; then # POST- anything notify section case "$OCF_RESKEY_CRM_meta_notify_operation" in @@ -2069,42 +2044,6 @@ action_notify() { # always returns OCF_SUCCESS ocf_log info "${LH} post-stop end." ;; - demote) - # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation) - ocf_log info "${LH} post-demote begin." - # Report not running, if the list of nodes being demoted reported empty - if [ -z "${OCF_RESKEY_CRM_meta_notify_demote_uname}" ] ; then - ocf_log warn "${LH} there are no nodes being demoted reported on post-demote. The resource will be restarted." - ocf_log info "${LH} post-demote end." - return $OCF_ERR_GENERIC - fi - my_host "${OCF_RESKEY_CRM_meta_notify_demote_uname}" - rc=$? - if [ $rc -ne $OCF_SUCCESS ] ; then - # On ohter nodes processing the post-demote, make sure the demoted node will be forgotten - unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_demote_uname}" - else - # Wait for synced state first - ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" - wait_sync $((OCF_RESKEY_stop_time/2)) - # On the nodes being demoted, reset the master score - ocf_log info "${LH} resetting the master score." - master_score 0 - ocf_log info "${LH} Deleting start time attribute" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete - ocf_log info "${LH} Deleting master attribute" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete - ocf_log info "${LH} master was demoted. stopping RabbitMQ app." - stop_rmq_server_app - rc2=$? - if [ $rc2 -ne $OCF_SUCCESS ] ; then - ocf_log err "${LH} RMQ-server app can't be stopped on post-demote. Master resource is failed" - ocf_log info "${LH} post-demote end." - exit $OCF_FAILED_MASTER - fi - fi - ocf_log info "${LH} post-demote end." - ;; *) ;; esac fi @@ -2211,68 +2150,11 @@ action_promote() { action_demote() { - local rc=$OCF_ERR_GENERIC local LH="${LL} demote:" - - if [ "${OCF_RESKEY_debug}" = 'true' ] ; then - d=`date '+%Y%m%d %H:%M:%S'` - echo $d >> /tmp/rmq-demote.log - env >> /tmp/rmq-demote.log - echo "$d [demote] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log - - fi - ocf_log info "${LH} action begin." - - get_monitor - rc=$? - case "$rc" in - "$OCF_RUNNING_MASTER") - # Running as master. Normal, expected behavior. - ocf_log warn "${LH} Resource is currently running as Master" - ocf_log info "${LH} Deleting master attribute" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete - ocf_log info "${LH} Deleting start timestamp" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete - - # Wait for synced state first - ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" - wait_sync $((OCF_RESKEY_stop_time/2)) - - stop_rmq_server_app - rc=$? - ;; - "$OCF_SUCCESS") - # Alread running as slave. Nothing to do. - ocf_log warn "${LH} Resource is currently running as Slave" - rc=$OCF_SUCCESS - ;; - "$OCF_FAILED_MASTER") - # Master failed and being demoted. - ocf_log err "${LH} Demoting of a failed Master." - ocf_log info "${LH} action end." - exit $OCF_FAILED_MASTER - ;; - "$OCF_NOT_RUNNING") - ocf_log warn "${LH} Try to demote currently not running resource. Nothing to do." 
- rc=$OCF_SUCCESS - ;; - "$OCF_ERR_GENERIC") - ocf_log err "${LH} Error while demote. Stopping resource." - action_stop - rc=$? - ;; - *) - # Failed resource. Let the cluster manager recover. - ocf_log err "${LH} Unexpected error, cannot demote" - ocf_log info "${LH} action end." - exit $rc - ;; - esac - - # transform master RMQ-server to slave + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete ocf_log info "${LH} action end." - return $rc + return $OCF_SUCCESS } ####################################################################### From be86bf4e207cb9cb00f786d9476ef3dc58ced72c Mon Sep 17 00:00:00 2001 From: Dmitry Mescheryakov Date: Mon, 22 Aug 2016 14:22:21 +0300 Subject: [PATCH 8/8] [OCF HA] Enhance split-brain detection logic Previous split brain logic worked as follows: each slave checked that it is connected to master. If check fails, slave restarts. The ultimate flaw in that logic is that there is little guarantee that master is alive at the moment. Moreover, if master dies, it is very probable that during the next monitor check slaves will detect its death and restart, causing complete RabbitMQ cluster downtime. With the new approach master node checks that slaves are connected to it and orders them to restart if they are not. The check is performed after master node health check, meaning that at least that node survives. Also, orders expire in one minute and freshly started node ignores orders to restart for three minutes to give cluster time to stabilize. Also corrected the problem, when node starts and is already clustered. In that case OCF script forgot to start the RabbitMQ app, causing subsequent restart. Now we ensure that RabbitMQ app is running. The two introduced attributes rabbit-start-phase-1-time and rabbit-ordered-to-restart are made private. In order to allow master to set node's order to restart, both ocf_update_private_attr and ocf_get_private_attr signatures are expanded to allow passing node name. Finally, a bug is fixed in ocf_get_private_attr. Unlike crm_attribute, attrd_updater returns empty string instead of "(null)", when an attribute is not defined on needed node, but is defined on some other node. Correspondingly changed code to expect empty string, not a "(null)". This fix is a fix for Fuel bugs https://bugs.launchpad.net/fuel/+bug/1559136 https://bugs.launchpad.net/mos/+bug/1561894 --- scripts/rabbitmq-server-ha.ocf | 120 ++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 56 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 6a9e4488531d..0dd27c72c4f8 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -882,12 +882,21 @@ action_validate() { return $OCF_SUCCESS } +update_rabbit_start_time_if_rc() { + local nowtime + local rc=$1 + if [ $rc -eq 0 ]; then + nowtime="$(now)" + ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" + fi +} + join_to_cluster() { local node="$1" local rmq_node local rc=$OCF_ERR_GENERIC local LH="${LL} join_to_cluster():" - local nowtime ocf_log info "${LH} start." @@ -921,9 +930,7 @@ join_to_cluster() { action_stop return $OCF_ERR_GENERIC else - nowtime="$(now)" - ocf_log info "${LH} Rabbit app started successfully. 
Updating start time attribute with ${nowtime}" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" + update_rabbit_start_time_if_rc 0 ocf_log info "${LH} Joined to cluster succesfully." fi @@ -1410,6 +1417,7 @@ get_monitor() { local name local node local node_start_time + local nowtime ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}" get_status @@ -1440,46 +1448,27 @@ get_monitor() { rabbit_running=$? ocf_log info "${LH} checking if rabbit app is running" - if [ $rabbit_running -eq $OCF_SUCCESS ] - then - ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" - - if [ $rc -eq $OCF_RUNNING_MASTER ] ; then - # The master is always running inside of its cluster + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + if [ $rabbit_running -eq $OCF_SUCCESS ]; then ocf_log info "${LH} rabbit app is running and is master of cluster" - else - local master_name=$(get_master_name_but $THIS_PCMK_NODE) - - if [ -z "$master_name" ]; then - ocf_log info "${LH} no master is elected currently. Skipping cluster health check." - - elif is_clustered_with $master_name; then - ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - - else - # Rabbit is running but is not connected to master - # Failing to avoid split brain - ocf_log err "${LH} rabbit node is running out of the cluster" - stop_server_process - rc=$OCF_ERR_GENERIC - fi + ocf_log err "${LH} we are the master and rabbit app is not running. This is a failure" + exit $OCF_FAILED_MASTER fi else - if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then - ocf_log info "${LH} rabbit app is not running. checking if there is a master" - # Do not refetch the master status as we know it already - if [ $rc -eq $OCF_RUNNING_MASTER ]; then - ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure" - exit $OCF_FAILED_MASTER - fi - - local master_name=$(get_master_name_but $THIS_PCMK_NODE) - - if [ -n "$master_name" ]; then - ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker" + start_time=$((180 + $(ocf_get_private_attr 'rabbit-start-phase-1-time' 0))) + restart_order_time=$((60 + $(ocf_get_private_attr 'rabbit-ordered-to-restart' 0))) + nowtime=$(now) + + # If we started more than 3 minutes ago, and + # we got order to restart less than 1 minute ago + if [ $nowtime -lt $restart_order_time ]; then + if [ $nowtime -gt $start_time ]; then + ocf_log err "${LH} failing because we have received an order to restart from the master" stop_server_process rc=$OCF_ERR_GENERIC + else + ocf_log warn "${LH} received an order to restart from the master, ignoring it because we have just started" fi fi fi @@ -1620,6 +1609,19 @@ get_monitor() { fi fi + # If we are the master and healthy, check that we see other cluster members + # Order a member to restart if we don't see it + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + for node in $(get_all_pacemaker_nodes); do + if ! is_clustered_with $node; then + nowtime=$(now) + + ocf_log warn "${LH} node $node is not connected with us, ordering it to restart." 
+ ocf_update_private_attr 'rabbit-ordered-to-restart' "$nowtime" "$node" + fi + done + fi + ocf_log info "${LH} get_monitor function ready to return ${rc}" return $rc } @@ -1627,19 +1629,21 @@ get_monitor() { ocf_get_private_attr() { local attr_name="${1:?}" local attr_default_value="${2:?}" + local nodename="${3:-$THIS_PCMK_NODE}" local count - count=$(attrd_updater -p --name "$attr_name" --query) + count=$(attrd_updater -p --name "$attr_name" --node "$nodename" --query) if [ $? -ne 0 ]; then echo $attr_default_value else - echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "(null)") print vals[2]; else print def_val }' + echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "") print vals[2]; else print def_val }' fi } ocf_update_private_attr() { local attr_name="${1:?}" local attr_value="${2:?}" - ocf_run attrd_updater -p --name "$attr_name" --update "$attr_value" + local nodename="${3:-$THIS_PCMK_NODE}" + ocf_run attrd_updater -p --name "$attr_name" --node "$nodename" --update "$attr_value" } rabbitmqctl_with_timeout_check() { @@ -1689,6 +1693,7 @@ action_monitor() { action_start() { local rc=$OCF_ERR_GENERIC local LH="${LL} start:" + local nowtime if [ "${OCF_RESKEY_debug}" = 'true' ] ; then d=`date '+%Y%m%d %H:%M:%S'` @@ -1712,6 +1717,9 @@ action_start() { ocf_update_private_attr $attr_name_to_reset 0 done + nowtime=$(now) + ocf_log info "${LH} Setting phase 1 one start time to $nowtime" + ocf_update_private_attr 'rabbit-start-phase-1-time' "$nowtime" ocf_log info "${LH} Deleting start time attribute" ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete ocf_log info "${LH} Deleting master attribute" @@ -1917,7 +1925,6 @@ action_notify() { local rc2=$OCF_ERR_GENERIC local LH="${LL} notify:" local nodelist - local nowtime if [ "${OCF_RESKEY_debug}" = 'true' ] ; then d=`date '+%Y%m%d %H:%M:%S'` @@ -1943,7 +1950,15 @@ action_notify() { ocf_log info "${LH} ignoring post-promote of self" elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then - ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do." + if get_status rabbit; then + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do." + else + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. We only need to start the app." + + try_to_start_rmq_app + rc2=$? + update_rabbit_start_time_if_rc $rc2 + fi else # Note, this should fail when the mnesia is inconsistent. @@ -1992,14 +2007,10 @@ action_notify() { rc2=$? else ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}" - if try_to_start_rmq_app; then - rc2=$OCF_SUCCESS - nowtime="$(now)" - ocf_log info "${LH} Updating start time attribute with ${nowtime}" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" - else - rc2=$OCF_ERR_GENERIC - fi + + try_to_start_rmq_app + rc2=$? + update_rabbit_start_time_if_rc $rc2 fi ocf_log info "${LH} post-start end." 
if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then @@ -2055,7 +2066,6 @@ action_notify() { action_promote() { local rc=$OCF_ERR_GENERIC local LH="${LL} promote:" - local nowtime if [ "${OCF_RESKEY_debug}" = 'true' ] ; then d=$(date '+%Y%m%d %H:%M:%S') @@ -2093,10 +2103,8 @@ action_promote() { [ -f "${OCF_RESKEY_policy_file}" ] && . "${OCF_RESKEY_policy_file}" - # create timestamp file - nowtime="$(now)" - ocf_log info "${LH} Updating start timestamp with ${nowtime}" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" + update_rabbit_start_time_if_rc $rc + ocf_log info "${LH} Checking master status" get_monitor rc=$?