Skip to content

Commit

Permalink
Redesigned k8s peer discovery
Browse files Browse the repository at this point in the history
Rather than querying the Kubernetes API, just check the local node name
and try to connect to the pod with `-0` suffix (or configured
`ordinal_start` value). Only the pod with the lowest ordinal can form
a new cluster - all other pods will wait forever.

This should prevent any race conditions and incorrectly formed clusters.
  • Loading branch information
mkuratczyk committed Jan 28, 2025
1 parent a3dae1f commit dd1d62c
Show file tree
Hide file tree
Showing 15 changed files with 343 additions and 630 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/oci-make.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
make package-generic-unix PROJECT_VERSION=4.1.0-alpha.1
- name: Upload package-generic-unix
if: steps.authorized.outputs.authorized == 'true'
uses: actions/upload-artifact@v4.3.1
uses: actions/upload-artifact@v4
with:
name: package-generic-unix
path: PACKAGES/rabbitmq-server-*.tar.xz
Expand Down
66 changes: 40 additions & 26 deletions deps/rabbit/src/rabbit_peer_discovery.erl
Original file line number Diff line number Diff line change
Expand Up @@ -108,29 +108,22 @@ maybe_init() ->
%% node, even if the configuration changed in between.
persistent_term:put(?PT_PEER_DISC_BACKEND, Backend),

_ = code:ensure_loaded(Backend),
case erlang:function_exported(Backend, init, 0) of
true ->
?LOG_DEBUG(
"Peer discovery: backend supports initialisation",
case catch Backend:init() of
ok ->
?LOG_INFO(
"Peer discovery: backend initialisation succeeded",
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
case Backend:init() of
ok ->
?LOG_DEBUG(
"Peer discovery: backend initialisation succeeded",
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
ok;
{error, _Reason} = Error ->
?LOG_WARNING(
"Peer discovery: backend initialisation failed: ~tp.",
[Error],
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
ok
end;
false ->
ok;
{error, _Reason} = Error ->
?LOG_ERROR(
"Peer discovery: backend initialisation failed: ~tp",
[Error],
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
ok;
{'EXIT', {undef, _}} ->
?LOG_DEBUG(
"Peer discovery: backend does not support initialisation",
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
ok
end.

Expand All @@ -153,13 +146,13 @@ sync_desired_cluster() ->

%% We handle retries at the top level: steps are followed sequentially and
%% if one of them fails, we retry the whole process.
{Retries, RetryDelay} = discovery_retries(),
{Retries, RetryDelay} = discovery_retries(Backend),

sync_desired_cluster(Backend, Retries, RetryDelay).

-spec sync_desired_cluster(Backend, RetriesLeft, RetryDelay) -> ok when
Backend :: backend(),
RetriesLeft :: non_neg_integer(),
RetriesLeft :: non_neg_integer() | unlimited,
RetryDelay :: non_neg_integer().
%% @private

Expand Down Expand Up @@ -240,10 +233,18 @@ sync_desired_cluster(Backend, RetriesLeft, RetryDelay) ->

-spec retry_sync_desired_cluster(Backend, RetriesLeft, RetryDelay) -> ok when
Backend :: backend(),
RetriesLeft :: non_neg_integer(),
RetriesLeft :: non_neg_integer() | unlimited,
RetryDelay :: non_neg_integer().
%% @private

retry_sync_desired_cluster(Backend, unlimited, RetryDelay) ->
?LOG_DEBUG(
"Peer discovery: retrying to create/sync cluster in ~b ms "
"(will retry forever)",
[RetryDelay],
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
timer:sleep(RetryDelay),
sync_desired_cluster(Backend, unlimited, RetryDelay);
retry_sync_desired_cluster(Backend, RetriesLeft, RetryDelay)
when RetriesLeft > 0 ->
RetriesLeft1 = RetriesLeft - 1,
Expand Down Expand Up @@ -1007,11 +1008,24 @@ maybe_unregister() ->
ok
end.

-spec discovery_retries() -> {Retries, RetryDelay} when
Retries :: non_neg_integer(),
-spec discovery_retries(Backend) -> {Retries, RetryDelay} when
Backend :: backend(),
Retries :: non_neg_integer() | unlimited,
RetryDelay :: non_neg_integer().

discovery_retries() ->
discovery_retries(Backend) ->
{_Retries, RetryDelay} = RetryConfig = discovery_retries_from_config(),
case catch Backend:retry_strategy() of
unlimited ->
{unlimited, RetryDelay};
_ ->
RetryConfig
end.

-spec discovery_retries_from_config() -> {Retries, RetryDelay} when
Retries :: non_neg_integer(),
RetryDelay :: non_neg_integer().
discovery_retries_from_config() ->
case application:get_env(rabbit, cluster_formation) of
{ok, Proplist} ->
Retries = proplists:get_value(discovery_retry_limit, Proplist, ?DEFAULT_DISCOVERY_RETRY_COUNT),
Expand Down
4 changes: 3 additions & 1 deletion deps/rabbit_common/src/rabbit_peer_discovery_backend.erl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@

-callback unlock(Data :: term()) -> ok.

-optional_callbacks([init/0]).
-callback retry_strategy() -> limited | unlimited.

-optional_callbacks([init/0, retry_strategy/0]).

-export([api_version/0]).

Expand Down
2 changes: 1 addition & 1 deletion deps/rabbitmq_peer_discovery_k8s/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ PROJECT_DESCRIPTION = Kubernetes-based RabbitMQ peer discovery backend
PROJECT_MOD = rabbitmq_peer_discovery_k8s_app

DEPS = rabbit_common rabbitmq_peer_discovery_common rabbit
TEST_DEPS = rabbitmq_ct_helpers rabbitmq_ct_client_helpers ct_helper meck
TEST_DEPS = rabbitmq_ct_helpers rabbitmq_ct_client_helpers meck
dep_ct_helper = git https://github.com/extend/ct_helper.git master

DEP_EARLY_PLUGINS = rabbit_common/mk/rabbitmq-early-plugin.mk
Expand Down
79 changes: 51 additions & 28 deletions deps/rabbitmq_peer_discovery_k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,83 @@

## Overview

This is an implementation of RabbitMQ [peer discovery interface](https://www.rabbitmq.com/blog/2018/02/12/peer-discovery-subsystem-in-rabbitmq-3-7/)
for Kubernetes.
This is an implementation of RabbitMQ peer discovery interface for Kubernetes.

This plugin only performs peer discovery using Kubernetes API as the source of data on running cluster pods.
Please get familiar with [RabbitMQ clustering fundamentals](https://rabbitmq.com/clustering.html) before attempting
to use it.
On Kubernetes, RabbitMQ should be deployed as a StatefulSet. Each Pod in a StatefulSet has
a name made up of the StatefulSet name and an ordinal index. The ordinal index values almost
always start with 0, although [this is configurable](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#ordinal-index).

Cluster provisioning and most of Day 2 operations such as [proper monitoring](https://rabbitmq.com/monitoring.html)
are not in scope for this plugin.
This plugin only allows the node with the lowest ordinal index (generally the pod with the `-0` suffix) to form a new cluster.
This node is referred to as the seed node.

For a more comprehensive open source RabbitMQ on Kubernetes deployment solution,
see the [RabbitMQ Cluster Operator for Kubernetes](https://www.rabbitmq.com/kubernetes/operator/operator-overview.html).
The Operator is developed [on GitHub](https://github.com/rabbitmq/cluster-operator/) and contains its
own [set of examples](https://github.com/rabbitmq/cluster-operator/tree/main/docs/examples).
All other nodes will join the seed node, or will forever keep try to join it, if they can't.

In the most common scenario, this means that:
* the pod with `-0` suffix will start immediately, effectively forming a new single-node cluster
* any other pod will join the pod with `-0` suffix and synchronize the cluster metadata with it

## Supported RabbitMQ Versions

This plugin ships with RabbitMQ 3.7.0 or later.
## Configuration

**In most cases, no configuration should be necessary beyond enabling this plugin.**

## Installation
If you use the [RabbitMQ Cluster Operator](https://www.rabbitmq.com/kubernetes/operator/operator-overview)
or the [Bitnami Helm Chart](https://github.com/bitnami/charts/tree/main/bitnami/rabbitmq), this plugin is enabled by default,
so you don't have to do anything.

This plugin ships with [supported RabbitMQ versions](https://www.rabbitmq.com/versions.html).
There is no need to install it separately.
### Advanced Configuration

As with any [plugin](https://rabbitmq.com/plugins.html), it must be enabled before it
can be used. For peer discovery plugins it means they must be [enabled](https://rabbitmq.com//plugins.html#basics) or [preconfigured](https://rabbitmq.com//plugins.html#enabled-plugins-file)
before first node boot:
If you use a different ordinal start value in your StatefulSet, you have to configure this plugin to use it:
```
cluster_formation.k8s.ordinal_start = N
```
where `N` matches the `.spec.ordinals.start` value of the StatefulSet.

If the plugin doesn't work for any reason (a very unusual Kubernetes configuration or issues with hostname resolution)
and you have to force RabbitMQ to use a different seed node than it would automatically, you can do this:
```
rabbitmq-plugins --offline enable rabbitmq_peer_discovery_k8s
cluster_formation.k8s.seed_node = rabbit@seed-node-hostname
```

If `cluster_formation.k8s.seed_node` is configured, this plugin will just use this value as the seed node.
If you do this, please open a GitHub issue and explain why the plugin didn't work for you, so we can improve it.

### Historical Notes

This implementation (version 2) of the plugin was introduced in RabbitMQ 4.1 and has little to do with the original design,
which was included in RabbitMQ from version 3.7.0 until 4.0. Nevertheless, backwards compatibility is maintained,
by simply ignoring the configuration options of the original implementation.

The original implementation of this plugin performed peer discovery by querying the Kubernetes API for the list of endpoints
serving as the backends of a Kubernetes Service. However, this approach had a few issues:
1. The query was not necessary, given that the Pod names are predictable
2. The query could fail or require configuration to work (eg. TLS settings had to be adjusted in some environments)
3. To perform the query, access to the Kubernetes API was required (unnecessary privileges)
4. In some environments, the plugin was prone to race conditions and could form more than 1 cluster

The new design solves all those problems:
1. It doesn't query the Kubernetes API at all
2. Only one node is allowed to form a cluster

## Supported RabbitMQ Versions

Version 2 was first included in RabbitMQ 4.1.

Version 1 of this plugin (which queried the Kubernetes API) was included from RabbitMQ 3.7.0 until 4.0.

## Documentation

See [RabbitMQ Cluster Formation guide](https://www.rabbitmq.com/cluster-formation.html) for an overview
of the peer discovery subsystem, general and Kubernetes-specific configurable values and troubleshooting tips.

Example deployments that use this plugin can be found in an [RabbitMQ on Kubernetes examples repository](https://github.com/rabbitmq/diy-kubernetes-examples).
Note that they are just that, examples, and won't be optimal for every use case or cover a lot of important production
system concerns such as monitoring, persistent volume settings, access control, sizing, and so on.


## Contributing

See [CONTRIBUTING.md](./CONTRIBUTING.md) and our [development process overview](https://www.rabbitmq.com/github.html).


## License

[Licensed under the MPL](LICENSE-MPL-RabbitMQ), same as RabbitMQ server.


## Copyright

(c) 2007-2024 Broadcom. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
(c) 2007-2025 Broadcom. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
9 changes: 0 additions & 9 deletions deps/rabbitmq_peer_discovery_k8s/app.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ def all_beam_files(name = "all_beam_files"):
srcs = [
"src/rabbit_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s_app.erl",
"src/rabbitmq_peer_discovery_k8s_node_monitor.erl",
"src/rabbitmq_peer_discovery_k8s_sup.erl",
],
hdrs = [":public_and_private_hdrs"],
app_name = "rabbitmq_peer_discovery_k8s",
Expand All @@ -37,9 +34,6 @@ def all_test_beam_files(name = "all_test_beam_files"):
srcs = [
"src/rabbit_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s_app.erl",
"src/rabbitmq_peer_discovery_k8s_node_monitor.erl",
"src/rabbitmq_peer_discovery_k8s_sup.erl",
],
hdrs = [":public_and_private_hdrs"],
app_name = "rabbitmq_peer_discovery_k8s",
Expand Down Expand Up @@ -73,9 +67,6 @@ def all_srcs(name = "all_srcs"):
srcs = [
"src/rabbit_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s.erl",
"src/rabbitmq_peer_discovery_k8s_app.erl",
"src/rabbitmq_peer_discovery_k8s_node_monitor.erl",
"src/rabbitmq_peer_discovery_k8s_sup.erl",
],
)
filegroup(
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,38 @@
%% Copyright (c) 2007-2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
%%

{mapping, "cluster_formation.k8s.ordinal_start", "rabbit.cluster_formation.peer_discovery_k8s.ordinal_start", [
{datatype, integer}
]}.

{translation, "rabbit.cluster_formation.peer_discovery_k8s.ordinal_start",
fun(Conf) ->
case cuttlefish:conf_get("cluster_formation.k8s.ordinal_start", Conf, undefined) of
undefined -> cuttlefish:unset();
Value -> Value
end
end}.

{mapping, "cluster_formation.k8s.seed_node", "rabbit.cluster_formation.peer_discovery_k8s.seed_node", [
{datatype, string}
]}.

{translation, "rabbit.cluster_formation.peer_discovery_k8s.seed_node",
fun(Conf) ->
case cuttlefish:conf_get("cluster_formation.k8s.seed_node", Conf, undefined) of
undefined -> cuttlefish:unset();
Value -> Value
end
end}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% All configuration options below are deprecated as of version 2 of this plugin (first shipped in 4.1).
%% Test cases are kept to ensure we still accept the old config, but it is completely ignored.
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Kubernetes host

{mapping, "cluster_formation.k8s.host", "rabbit.cluster_formation.peer_discovery_k8s.k8s_host", [
Expand Down
Loading

0 comments on commit dd1d62c

Please sign in to comment.