Skip to content

Commit

Permalink
Merge branch 'stable'
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelklishin committed Aug 16, 2016
2 parents 6225ce6 + 4d4144e commit ccb5972
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/rabbit_autoheal.erl
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,17 @@ winner_finish(Notify) ->
send(leader(), {autoheal_finished, node()}),
not_healing.

%% XXX This can enter an infinite loop if mnesia was somehow restarted
%% outside of our control - i.e. somebody started the app back by hand or
%% completely restarted the node. One possible solution would be something
%% like this (but it needs some more pondering and is left for some
%% other patch):
%% - monitor top-level mnesia supervisors of all losers
%% - notify losers about the fact that they are indeed losers
%% - wait for all monitors to go 'DOWN' (+ maybe some timeout on the whole process)
%% - do one round of parallel rpc calls to check whether mnesia is still stopped on all
%% losers
%% - If everything is still stopped, continue the autoheal process. Or cancel it otherwise.
wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
case rpc:call(Node, mnesia, system_info, [is_running]) of
no ->
Expand Down
19 changes: 19 additions & 0 deletions test/health_check_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
,ignores_remote_alarms/1
,detects_local_alarm/1
,honors_timeout_argument/1
,detects_stuck_local_node_monitor/1
,ignores_stuck_remote_node_monitor/1
]).

all() ->
Expand All @@ -47,6 +49,8 @@ groups() ->
,ignores_remote_alarms
,detects_local_alarm
,honors_timeout_argument
,detects_stuck_local_node_monitor
,ignores_stuck_remote_node_monitor
]}].

init_per_suite(Config) ->
Expand Down Expand Up @@ -123,6 +127,21 @@ detects_local_alarm(Config) ->
{match, _} = re:run(Str, "resource alarm.*in effect"),
ok.

%% Suspends rabbit_node_monitor on the first node and checks that
%% `rabbitmqctl -t 5 node_health_check` against that same node fails with
%% {error, 75, Output}, where Output reports that the operation timed out.
%%
%% The monitor is resumed in an `after` clause: if one of the assertions
%% raises badmatch, the process must not stay suspended, otherwise every
%% test case that runs afterwards on this node would be affected.
detects_stuck_local_node_monitor(Config) ->
    [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
    rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
    try
        {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
        {match, _} = re:run(Str, "operation node_health_check.*timed out"),
        ok
    after
        %% Always undo the suspend, whether the assertions passed or not.
        resume_sys_process(Config, A, rabbit_node_monitor)
    end.

%% Suspends rabbit_node_monitor on node A and checks that
%% `rabbitmqctl -t 5 node_health_check` against node B still succeeds,
%% i.e. a stuck monitor on a remote node does not fail the local check.
%%
%% The monitor on A is resumed in an `after` clause so that a failed
%% assertion does not leave it suspended for subsequent test cases.
ignores_stuck_remote_node_monitor(Config) ->
    [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
    rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
    try
        {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, B, ["-t", "5", "node_health_check"]),
        ok
    after
        %% Always undo the suspend, whether the assertion passed or not.
        resume_sys_process(Config, A, rabbit_node_monitor)
    end.

honors_timeout_argument(Config) ->
[A|_] = open_channel_and_declare_queue_everywhere(Config),
QPid = suspend_single_queue(Config, A),
Expand Down

0 comments on commit ccb5972

Please sign in to comment.