Check rabbit_node_monitor during health-check #915

Merged
11 changes: 11 additions & 0 deletions src/rabbit_autoheal.erl
@@ -297,6 +297,17 @@ winner_finish(Notify) ->
send(leader(), {autoheal_finished, node()}),
not_healing.

%% XXX This can enter an infinite loop if mnesia was somehow restarted
%% outside of our control - i.e. somebody started the app back by hand
%% or completely restarted the node. One possible solution would be
%% something like this (but it needs some more pondering and is left
%% for some other patch):
%% - monitor the top-level mnesia supervisors of all losers
%% - notify losers about the fact that they are indeed losers
%% - wait for all monitors to go 'DOWN' (+ maybe some timeout on the whole process)
%% - do one round of parallel rpc calls to check whether mnesia is still stopped on all
%%   losers
%% - if everything is still stopped, continue the autoheal process; otherwise cancel it.
wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
case rpc:call(Node, mnesia, system_info, [is_running]) of
no ->
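For illustration, a minimal sketch of the monitor-based approach outlined in the comment above. The helper name wait_for_supervisor_shutdown/1, the use of mnesia_sup as the registered top-level supervisor, and the per-monitor 60-second timeout (standing in for the whole-process timeout the comment mentions) are assumptions for the sketch, not part of this patch:

%% Sketch only: monitor the registered top-level mnesia supervisor
%% (mnesia_sup) on every loser, wait for all monitors to go 'DOWN',
%% then confirm via one round of parallel rpc calls that mnesia is
%% still stopped everywhere.
wait_for_supervisor_shutdown(Losers) ->
    Refs = [erlang:monitor(process, {mnesia_sup, Node}) || Node <- Losers],
    Wait = fun(Ref) ->
                   receive
                       {'DOWN', Ref, process, _, _} -> ok
                   after 60000 -> timeout %% hypothetical timeout value
                   end
           end,
    case lists:all(fun(Ref) -> Wait(Ref) =:= ok end, Refs) of
        false -> cancel;
        true ->
            {Replies, BadNodes} =
                rpc:multicall(Losers, mnesia, system_info, [is_running]),
            case BadNodes =:= [] andalso
                 lists:all(fun(R) -> R =:= no end, Replies) of
                true  -> continue;
                false -> cancel
            end
    end.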
19 changes: 19 additions & 0 deletions test/health_check_SUITE.erl
@@ -33,6 +33,8 @@
,ignores_remote_alarms/1
,detects_local_alarm/1
,honors_timeout_argument/1
,detects_stuck_local_node_monitor/1
,ignores_stuck_remote_node_monitor/1
]).

all() ->
@@ -47,6 +49,8 @@ groups() ->
,ignores_remote_alarms
,detects_local_alarm
,honors_timeout_argument
,detects_stuck_local_node_monitor
,ignores_stuck_remote_node_monitor
]}].

init_per_suite(Config) ->
@@ -123,6 +127,21 @@ detects_local_alarm(Config) ->
{match, _} = re:run(Str, "resource alarm.*in effect"),
ok.

detects_stuck_local_node_monitor(Config) ->
[A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
{error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
{match, _} = re:run(Str, "operation node_health_check.*timed out"),
resume_sys_process(Config, A, rabbit_node_monitor),
ok.

ignores_stuck_remote_node_monitor(Config) ->
[A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
{ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, B, ["-t", "5", "node_health_check"]),
resume_sys_process(Config, A, rabbit_node_monitor),
ok.

honors_timeout_argument(Config) ->
[A|_] = open_channel_and_declare_queue_everywhere(Config),
QPid = suspend_single_queue(Config, A),
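The two new tests drive the check by freezing the registered rabbit_node_monitor process with OTP's sys:suspend/1, so that node_health_check times out on that node; the expected exit code 75 matches EX_TEMPFAIL in BSD sysexits terms. The resume_sys_process/3 helper is defined elsewhere in this suite; assuming it is a thin rpc wrapper around sys:resume/1, it might look like this (hypothetical definition, the real suite may differ):

%% Hypothetical helper: resume a suspended registered process on the
%% given node over rpc, undoing the sys:suspend/1 call in the test.
resume_sys_process(Config, Node, Process) ->
    ok = rabbit_ct_broker_helpers:rpc(Config, Node, sys, resume, [Process]).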