From 6d0ca9cca257e61ffe522933a1bf6460c8047d5c Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 11:37:48 +0100 Subject: [PATCH 1/5] Safely handle (and log) anonymous info messages, most likely from the gm process' neighbours --- src/gm.erl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/gm.erl b/src/gm.erl index 176e14537f2d..822d50252b62 100644 --- a/src/gm.erl +++ b/src/gm.erl @@ -757,7 +757,14 @@ handle_info({'DOWN', MRef, process, _Pid, Reason}, catch lost_membership -> {stop, normal, State} - end. + end; +handle_info(Msg, State = #state { self = Self }) -> + %% TODO: For #gm_group{} related info messages, it could be worthwhile to + %% change_view/2, as this might reflect an alteration in the gm group, meaning + %% we now need to update our state. see rabbitmq-server#914. + rabbit_log:info("GM member ~p received unexpected message ~p~n" + "When Server state == ~p", [Self, Msg, State]), + noreply(State). terminate(Reason, #state { module = Module, callback_args = Args }) -> Module:handle_terminate(Args, Reason). From 53f10c98bb16202c09c2567a2a0590dd8e1f8dc9 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 11:43:23 +0100 Subject: [PATCH 2/5] Adds check_membership/2 clause for handling non-existent gm group --- src/gm.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gm.erl b/src/gm.erl index 822d50252b62..1507961df647 100644 --- a/src/gm.erl +++ b/src/gm.erl @@ -1600,7 +1600,9 @@ check_membership(Self, #gm_group{members = M} = Group) -> Group; false -> throw(lost_membership) - end. + end; +check_membership(_Self, {error, not_found}) -> + throw(lost_membership). 
check_membership(GroupName) -> case dirty_read_group(GroupName) of From a41606031791651ecea255070395b4cbfb075be4 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 15 Aug 2016 12:30:38 +0100 Subject: [PATCH 3/5] Handle unexpected gm group alterations prior to removal of dead pids from queue --- src/rabbit_mirror_queue_coordinator.erl | 9 +++++++++ src/rabbit_mirror_queue_misc.erl | 16 +++++++++++++--- src/rabbit_mirror_queue_slave.erl | 12 +++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl index 221f11f18a01..017d1d1fa202 100644 --- a/src/rabbit_mirror_queue_coordinator.erl +++ b/src/rabbit_mirror_queue_coordinator.erl @@ -355,6 +355,15 @@ handle_cast({gm_deaths, DeadGMPids}, DeadPids), rabbit_mirror_queue_misc:add_mirrors(QueueName, ExtraNodes, async), noreply(State); + {ok, _MPid0, DeadPids, _ExtraNodes} -> + %% see rabbitmq-server#914; + %% Different slave is now master, stop current coordinator normally. + %% Initiating queue is now slave and the least we could do is report + %% deaths which we 'think' we saw. + %% NOTE: Reported deaths here, could be inconsistent. + rabbit_mirror_queue_misc:report_deaths(MPid, false, QueueName, + DeadPids), + {stop, normal, State}; {error, not_found} -> {stop, normal, State} end; diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl index 83350920e6c9..4205fabb83d6 100644 --- a/src/rabbit_mirror_queue_misc.erl +++ b/src/rabbit_mirror_queue_misc.erl @@ -76,7 +76,7 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> rabbit_misc:execute_mnesia_transaction( fun () -> %% Someone else could have deleted the queue before we - %% get here. + %% get here. Or, gm group could've altered. 
see rabbitmq-server#914 case mnesia:read({rabbit_queue, QueueName}) of [] -> {error, not_found}; [Q = #amqqueue { pid = QPid, @@ -90,7 +90,16 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> AlivePids = [Pid || {_GM, Pid} <- AliveGM], Alive = [Pid || Pid <- [QPid | SPids], lists:member(Pid, AlivePids)], - {QPid1, SPids1} = promote_slave(Alive), + {QPid1, SPids1} = case Alive of + [] -> + %% GM altered, & if all pids are + %% perceived as dead, rather + %% do nothing here, & trust the + %% promoted slave to have updated + %% mnesia during the alteration. + {QPid, SPids}; + _ -> promote_slave(Alive) + end, Extra = case {{QPid, SPids}, {QPid1, SPids1}} of {Same, Same} -> @@ -98,7 +107,8 @@ remove_from_queue(QueueName, Self, DeadGMPids) -> _ when QPid =:= QPid1 orelse QPid1 =:= Self -> %% Either master hasn't changed, so %% we're ok to update mnesia; or we have - %% become the master. + %% become the master. If gm altered, + %% we have no choice but to proceed. Q1 = Q#amqqueue{pid = QPid1, slave_pids = SPids1, gm_pids = AliveGM}, diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl index 6f46cdc69881..2cb445518071 100644 --- a/src/rabbit_mirror_queue_slave.erl +++ b/src/rabbit_mirror_queue_slave.erl @@ -225,9 +225,15 @@ handle_call({gm_deaths, DeadGMPids}, From, _ -> %% master has changed to not us gen_server2:reply(From, ok), - %% assertion, we don't need to add_mirrors/2 in this - %% branch, see last clause in remove_from_queue/2 - [] = ExtraNodes, + %% see rabbitmq-server#914; + %% It's not always guaranteed that we won't have ExtraNodes. + %% If gm alters, master can change to not us with extra nodes, + %% in which case we attempt to add mirrors on those nodes. 
+ case ExtraNodes of + [] -> void; + _ -> rabbit_mirror_queue_misc:add_mirrors( + QName, ExtraNodes, async) + end, %% Since GM is by nature lazy we need to make sure %% there is some traffic when a master dies, to %% make sure all slaves get informed of the From 39a5faf4cee7fc480607e51ad75806fbecede12f Mon Sep 17 00:00:00 2001 From: Dmitry Mescheryakov Date: Thu, 18 Aug 2016 15:35:38 +0300 Subject: [PATCH 4/5] [OCF HA] Add ocf_get_private_attr function to RabbitMQ OCF script The function is extracted from check_timeouts to be re-used later in other parts of the script. Also, switch check_timeouts to use existing ocf_update_private_attr function. --- scripts/rabbitmq-server-ha.ocf | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 8f9cb16aa53e..84baaba8259f 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -1362,25 +1362,18 @@ check_timeouts() { local op_name=$3 if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then - ocf_run attrd_updater -p --name $timeouts_attr_name --update 0 + ocf_update_private_attr $timeouts_attr_name 0 return 0 fi local count - count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null` - if [ $? -ne 0 ]; then - # the attrd_updater exited with error. In that case most probably it printed garbage - # instead of the number we need. So defensively assume that it is zero. - - count=0 - fi - count=`echo "${count}" | awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` + count=$(ocf_get_private_attr $timeouts_attr_name 0) count=$((count+1)) # There is a slight chance that this piece of code will be executed twice simultaneously. # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need # precise calculation here. 
- ocf_run attrd_updater -p --name $timeouts_attr_name --update $count + ocf_update_private_attr $timeouts_attr_name $count if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." @@ -1634,6 +1627,18 @@ get_monitor() { return $rc } +ocf_get_private_attr() { + local attr_name="${1:?}" + local attr_default_value="${2:?}" + local count + count=$(attrd_updater -p --name "$attr_name" --query) + if [ $? -ne 0 ]; then + echo $attr_default_value + else + echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "(null)") print vals[2]; else print def_val }' + fi +} + ocf_update_private_attr() { local attr_name="${1:?}" local attr_value="${2:?}" From 6fc561f14213446f6bfc7ba7e6d946640d5f0d56 Mon Sep 17 00:00:00 2001 From: Diana Corbacho Date: Fri, 19 Aug 2016 10:45:55 +0100 Subject: [PATCH 5/5] Test GM crash when group is deleted while processing a DOWN message --- test/gm_SUITE.erl | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/test/gm_SUITE.erl b/test/gm_SUITE.erl index e4c68a257a91..8b07c9efad32 100644 --- a/test/gm_SUITE.erl +++ b/test/gm_SUITE.erl @@ -39,7 +39,8 @@ all() -> confirmed_broadcast, member_death, receive_in_order, - unexpected_msg + unexpected_msg, + down_in_members_change ]. init_per_suite(Config) -> @@ -123,6 +124,41 @@ unexpected_msg(_Config) -> passed end). 
+down_in_members_change(_Config) -> + %% Setup + ok = gm:create_tables(), + {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(), + fun rabbit_misc:execute_mnesia_transaction/1), + passed = receive_joined(Pid, [Pid], timeout_joining_gm_group_1), + {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self(), + fun rabbit_misc:execute_mnesia_transaction/1), + passed = receive_joined(Pid2, [Pid, Pid2], timeout_joining_gm_group_2), + passed = receive_birth(Pid, Pid2, timeout_waiting_for_birth_2), + + %% Test. Simulate that the gm group is deleted (forget_group) while + %% processing the 'DOWN' message from the neighbour + process_flag(trap_exit, true), + ok = meck:new(mnesia, [passthrough]), + ok = meck:expect(mnesia, read, fun({gm_group, ?MODULE}) -> + []; + (Key) -> + meck:passthrough([Key]) + end), + gm:leave(Pid2), + Passed = receive + {'EXIT', Pid, normal} -> + passed; + {'EXIT', Pid, _} -> + crashed + after 15000 -> + timeout + end, + %% Cleanup + meck:unload(mnesia), + process_flag(trap_exit, false), + passed = Passed. + + do_broadcast(Fun) -> with_two_members(broadcast_fun(Fun)).