Merge branch 'master' into rabbitmq-server-500-squashed
michaelklishin committed Aug 24, 2016
2 parents 922a43c + f057a8c commit 2994c42
Showing 4 changed files with 248 additions and 73 deletions.
132 changes: 110 additions & 22 deletions scripts/rabbitmq-server-ha.ocf
@@ -47,6 +47,8 @@ OCF_RESKEY_use_fqdn_default=false
OCF_RESKEY_fqdn_prefix_default=""
OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
OCF_RESKEY_rmq_feature_health_check_default=true
OCF_RESKEY_rmq_feature_local_list_queues_default=true

: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}
@@ -68,6 +70,8 @@ OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}}
: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}}
: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}}
: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}}
: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}}

#######################################################################

@@ -298,6 +302,26 @@ A path to the shell script to setup RabbitMQ policies
<content type="string" default="${OCF_RESKEY_policy_file_default}" />
</parameter>
<parameter name="rmq_feature_health_check" unique="0" required="0">
<longdesc lang="en">
Since RabbitMQ 3.6.4, list_queues/list_channels-based monitoring should
be replaced with the "node_health_check" command, as it creates no network
load at all.
</longdesc>
<shortdesc lang="en">Use node_health_check for monitoring</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_health_check_default}" />
</parameter>
<parameter name="rmq_feature_local_list_queues" unique="0" required="0">
<longdesc lang="en">
For RabbitMQ versions that implement the --local flag for list_queues, this
can greatly reduce network overhead in cases when the node is
stopped/demoted.
</longdesc>
<shortdesc lang="en">Use --local option for list_queues</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_local_list_queues_default}" />
</parameter>
$EXTENDED_OCF_PARAMS
</parameters>
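
The two rmq_feature_* switches above map directly onto rabbitmqctl behaviour: with rmq_feature_health_check enabled the monitor path runs "rabbitmqctl node_health_check" instead of the heavier list_channels/list_queues probes, and with rmq_feature_local_list_queues enabled the sync check passes --local to list_queues. As a minimal sketch (not part of this commit; the resource name p_rabbitmq-server and the use of pcs are assumptions), both switches could be set on an existing Pacemaker resource like this:

    # Enable the new monitoring behaviour on an existing RabbitMQ OCF resource.
    # Adjust the resource name and tooling (pcs vs. crmsh) to your cluster.
    pcs resource update p_rabbitmq-server \
        rmq_feature_health_check=true \
        rmq_feature_local_list_queues=true

Both parameters default to true in this version of the agent, so an explicit update is only needed where the defaults have been overridden.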
@@ -1377,7 +1401,9 @@ check_timeouts() {
local timeouts_attr_name=$2
local op_name=$3

if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
# 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal a
# timeout.
if [ $op_rc -ne 124 -a $op_rc -ne 137 -a $op_rc -ne 75 ]; then
ocf_update_private_attr $timeouts_attr_name 0
return 0
fi
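# For reference: where these exit codes come from, assuming COMMAND_TIMEOUT wraps
# rabbitmqctl with the coreutils "timeout" utility (an assumption, not stated in
# this hunk):
#   124 - "timeout" expired and terminated rabbitmqctl
#   137 - 128 + 9, i.e. the command was killed with SIGKILL
#    75 - EX_TEMPFAIL, returned by rabbitmqctl itself on an internal timeout
# Any other exit status is treated as "not a timeout" and resets the
# per-operation timeout counter above to zero.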
@@ -1401,12 +1427,20 @@
}

wait_sync() {
wait_time=$1
local wait_time=$1
local queues
local opt_arg=""

queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues name state"
su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \
do sleep 2; done\""
return $?
if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then
opt_arg="--local"
fi

queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues $opt_arg name state"

su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \
do sleep 2; done\""

return $?
}
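
# Illustration only (the exact command line depends on COMMAND_TIMEOUT and is an
# assumption): with rmq_feature_local_list_queues=true the loop above repeatedly
# runs something like
#   timeout <command_timeout> rabbitmqctl list_queues --local name state
# and returns as soon as no listed queue reports a "syncing," state, i.e. the
# queues visible from this node have finished synchronising.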

get_monitor() {
@@ -1516,7 +1550,75 @@ get_monitor() {
return $rc
fi

# Check if the rabbitmqctl control plane is alive.
# rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there
# is some error uncovered by node_health_check
if ! node_health_check; then
rc=$OCF_ERR_GENERIC
fi

# If we are the master and healthy, check that we see other cluster members
# Order a member to restart if we don't see it
if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
for node in $(get_all_pacemaker_nodes); do
if ! is_clustered_with $node; then
nowtime=$(now)

ocf_log warn "${LH} node $node is not connected with us, ordering it to restart."
ocf_update_private_attr 'rabbit-ordered-to-restart' "$nowtime" "$node"
fi
done
fi

ocf_log info "${LH} get_monitor function ready to return ${rc}"
return $rc
}

# Check if the rabbitmqctl control plane is alive.
node_health_check() {
local rc
if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then
node_health_check_local
rc=$?
else
node_health_check_legacy
rc=$?
fi
return $rc
}

node_health_check_local() {
local LH="${LH} node_health_check_local():"
local rc
local rc_timeouts

# Give node_health_check some time to handle the timeout by itself.
# By using rabbitmqctl's internal timeout, we allow it to print
# more useful diagnostics.
local timeout=$((TIMEOUT_ARG - 2))
su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
rc=$?

check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check"
rc_timeouts=$?

if [ "$rc_timeouts" -eq 2 ]; then
master_score 0
ocf_log info "${LH} node_health_check timed out, retry limit reached"
return $OCF_ERR_GENERIC
elif [ "$rc_timeouts" -eq 1 ]; then
ocf_log info "${LH} node_health_check timed out, going to retry"
return $OCF_ERR_GENERIC
fi

if [ "$rc" -ne 0 ]; then
ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
return $OCF_ERR_GENERIC
else
return $OCF_SUCCESS
fi
}
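
# Illustration only (the 30 second figure is an assumed monitor timeout, not a
# value taken from this commit): with TIMEOUT_ARG=30, node_health_check_local
# above effectively runs
#   rabbitmqctl node_health_check -t 28
# via su_rabbit_cmd, leaving rabbitmqctl two seconds to time out on its own and
# print its diagnostics before the outer timeout fires.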

node_health_check_legacy() {
local rc_alive
local timeout_alive
su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
@@ -1609,20 +1711,6 @@ get_monitor() {
fi
fi

# If we are the master and healthy, check that we see other cluster members
# Order a member to restart if we don't see it
if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
for node in $(get_all_pacemaker_nodes); do
if ! is_clustered_with $node; then
nowtime=$(now)

ocf_log warn "${LH} node $node is not connected with us, ordering it to restart."
ocf_update_private_attr 'rabbit-ordered-to-restart' "$nowtime" "$node"
fi
done
fi

ocf_log info "${LH} get_monitor function ready to return ${rc}"
return $rc
}

@@ -1711,7 +1799,7 @@ action_start() {
return $OCF_SUCCESS
fi

local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts"
local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts"
local attr_name_to_reset
for attr_name_to_reset in $attrs_to_zero; do
ocf_update_private_attr $attr_name_to_reset 0
113 changes: 78 additions & 35 deletions src/rabbit_autoheal.erl
@@ -180,6 +180,18 @@ node_down(_Node, not_healing) ->
node_down(Node, {winner_waiting, _, Notify}) ->
abort([Node], Notify);

node_down(Node, {leader_waiting, Node, _Notify}) ->
%% The winner went down; we don't know what to do, so we simply abort.
rabbit_log:info("Autoheal: aborting - winner ~p went down~n", [Node]),
not_healing;

node_down(Node, {leader_waiting, _, _} = St) ->
%% If it is a partial partition, the winner might continue with the
%% healing process. If it is a full partition, the winner will also
%% see it and abort. Let's wait for it.
rabbit_log:info("Autoheal: ~p went down, waiting for winner decision ~n", [Node]),
St;

node_down(Node, _State) ->
rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
not_healing.
@@ -218,14 +230,24 @@ handle_msg({become_winner, Losers},
not_healing, _Partitions) ->
rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
[Losers]),
%% The leader said everything was ready - do we agree? If not then
%% give up.
Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers),
case Down of
[] -> [send(L, {winner_is, node()}) || L <- Losers],
{winner_waiting, Losers, Losers};
_ -> abort(Down, Losers)
end;
stop_partition(Losers);

handle_msg({become_winner, Losers},
{winner_waiting, _, Losers}, _Partitions) ->
%% The leader has aborted the healing; it might have seen us down even though
%% we didn't see the same. Let's try again, as it is the same partition.
rabbit_log:info("Autoheal: I am the winner and received a duplicated "
"request, waiting again for ~p to stop~n", [Losers]),
stop_partition(Losers);

handle_msg({become_winner, _},
{winner_waiting, _, Losers}, _Partitions) ->
%% Something has happened to the leader; it might have seen us down, but we
%% are still alive. The partitions have changed, so we cannot continue.
rabbit_log:info("Autoheal: I am the winner and received another healing "
"request, partitions have changed. Aborting ~n", [Losers]),
winner_finish(Losers),
not_healing;

handle_msg({winner_is, Winner}, State = not_healing,
_Partitions) ->
@@ -269,6 +291,14 @@ handle_msg({autoheal_finished, Winner}, not_healing, _Partitions)
%% We are the leader and the winner. The state already transitioned
%% to "not_healing" at the end of the autoheal process.
rabbit_log:info("Autoheal finished according to winner ~p~n", [node()]),
not_healing;

handle_msg({autoheal_finished, Winner}, not_healing, _Partitions) ->
%% We might have seen the winner down during a partial partition and
%% transitioned to not_healing. However, the winner was still able
%% to finish. Let it pass.
rabbit_log:info("Autoheal finished according to winner ~p."
" Unexpected, I might have previously seen the winner down~n", [Winner]),
not_healing.

%%----------------------------------------------------------------------------
Expand All @@ -279,7 +309,9 @@ abort(Down, Notify) ->
rabbit_log:info("Autoheal: aborting - ~p down~n", [Down]),
%% Make sure any nodes waiting for us start - it won't necessarily
%% heal the partition but at least they won't get stuck.
winner_finish(Notify).
%% If we are executing this, we are not stopping. Thus, don't wait
%% for ourselves!
winner_finish(Notify -- [node()]).

winner_finish(Notify) ->
%% There is a race in Mnesia causing a starting loser to hang
@@ -297,32 +329,33 @@ winner_finish(Notify) ->
send(leader(), {autoheal_finished, node()}),
not_healing.

%% XXX This can enter an infinite loop, if mnesia was somehow restarted
%% outside of our control - i.e. somebody started the app back by hand or
%% completely restarted the node. One possible solution would be something
%% like this (but it needs some more pondering and is left for some
%% other patch):
%% - monitor top-level mnesia supervisors of all losers
%% - notify losers about the fact that they are indeed losers
%% - wait for all monitors to go 'DOWN' (+ maybe some timeout on the whole process)
%% - do one round of parallel rpc calls to check whether mnesia is still stopped on all
%% losers
%% - If everything is still stopped, continue the autoheal process. Or cancel it otherwise.
wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
case rpc:call(Node, mnesia, system_info, [is_running]) of
no ->
wait_for_mnesia_shutdown(Rest);
Running when
Running =:= yes orelse
Running =:= starting orelse
Running =:= stopping ->
timer:sleep(?MNESIA_STOPPED_PING_INTERNAL),
wait_for_mnesia_shutdown(AllNodes);
_ ->
wait_for_mnesia_shutdown(Rest)
end;
wait_for_mnesia_shutdown([]) ->
ok.
%% This improves on the previous implementation, but could still potentially enter an infinite
%% loop. It is also possible that by the time it finishes some of the nodes have been
%% manually restarted, but we can't do much more (apart from stopping them again). So let it
%% continue and notify all the losers to restart.
wait_for_mnesia_shutdown(AllNodes) ->
Monitors = lists:foldl(fun(Node, Monitors0) ->
pmon:monitor({mnesia_sup, Node}, Monitors0)
end, pmon:new(), AllNodes),
wait_for_supervisors(Monitors).

wait_for_supervisors(Monitors) ->
case pmon:is_empty(Monitors) of
true ->
ok;
false ->
receive
{'DOWN', _MRef, process, {mnesia_sup, _} = I, _Reason} ->
wait_for_supervisors(pmon:erase(I, Monitors))
after
60000 ->
AliveLosers = [Node || {_, Node} <- pmon:monitored(Monitors)],
rabbit_log:info("Autoheal: mnesia in nodes ~p is still up, sending "
"winner notification again to these ~n", [AliveLosers]),
[send(L, {winner_is, node()}) || L <- AliveLosers],
wait_for_mnesia_shutdown(AliveLosers)
end
end.

restart_loser(State, Winner) ->
rabbit_log:warning(
@@ -402,3 +435,13 @@ fmt_error({remote_down, RemoteDown}) ->
rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]);
fmt_error({nodes_down, NodesDown}) ->
rabbit_misc:format("Local nodes down: ~p", [NodesDown]).

stop_partition(Losers) ->
%% The leader said everything was ready - do we agree? If not then
%% give up.
Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers),
case Down of
[] -> [send(L, {winner_is, node()}) || L <- Losers],
{winner_waiting, Losers, Losers};
_ -> abort(Down, Losers)
end.