rabbit_db: Restart Ra systems after reset during join

dumbbell · michaelklishin · commit 76c9c37ff5e0 · 2024-01-31T20:12:13.000-05:00
[Why]
When the local node joins a remote node, it resets its own data first.
This includes the files of the Ra systems (`quorum` and `coordination`).

When the CLI is used, that's fine because the `rabbit` app is stopped,
and thus so are the Ra systems.

However, when this is done as part of peer discovery, the node is
booting: the Ra systems were started earlier because they are required
to run Khepri. Therefore, the reset deletes files that are still in use,
which breaks the Ra systems.

[How]
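The Ra systems are stopped just before the reset (if the join is
performed as part of peer discovery) and restarted afterwards.

The Mnesia-specific reset also moves from `join_using_mnesia/2` into
`join/2`, where it is performed only if Khepri is not enabled on the
remote node, so that it runs before the Ra systems are restarted.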
diff --git a/deps/rabbit/src/rabbit_db_cluster.erl b/deps/rabbit/src/rabbit_db_cluster.erl
@@ -101,11 +101,22 @@ join(RemoteNode, NodeType)
             %% database because we might change it during the join.
             RestartMnesia = rabbit_mnesia:is_running(),
             RestartFFCtl = rabbit_ff_controller:is_running(),
+            RestartRaSystems = rabbit_ra_systems:are_running(),
             RestartRabbit = rabbit:is_running(),
             case RestartRabbit of
                 true ->
                     rabbit:stop();
                 false ->
+                    %% The Ra systems were started before we initialize the
+                    %% database (because Khepri depends on one of them).
+                    %% Therefore, there are files in the data directory. They
+                    %% will go away with the reset and we will need to restart
+                    %% Ra systems afterwards.
+                    case RestartRaSystems of
+                        true  -> ok = rabbit_ra_systems:ensure_stopped();
+                        false -> ok
+                    end,
+
                     case RestartFFCtl of
                         true ->
                             ok = rabbit_ff_controller:wait_for_task_and_stop();
@@ -136,6 +147,30 @@ join(RemoteNode, NodeType)
                 rabbit_ff_registry_factory:release_state_change_lock()
             end,
 
+            %% After the regular reset, we also reset Mnesia specifically if
+            %% it is meant to be used. That's because we may switch back from
+            %% Khepri to Mnesia. To be safe, remove possibly stale files from
+            %% a previous instance where Mnesia was used.
+            case rabbit_khepri:is_enabled(RemoteNode) of
+                true  -> ok;
+                false -> ok = rabbit_mnesia:reset_gracefully()
+            end,
+
+            %% Now that the files are all gone after the reset above, restart
+            %% the Ra systems. They will recreate their folder in the process.
+            case RestartRabbit of
+                true ->
+                    ok;
+                false ->
+                    case RestartRaSystems of
+                        true ->
+                            ok = rabbit_ra_systems:ensure_started(),
+                            ok = rabbit_khepri:setup();
+                        false ->
+                            ok
+                    end
+            end,
+
             ?LOG_INFO(
                "DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
                #{domain => ?RMQLOG_DOMAIN_DB}),
@@ -182,7 +217,6 @@ join(RemoteNode, NodeType)
     end.
 
 join_using_mnesia(ClusterNodes, NodeType) when is_list(ClusterNodes) ->
-    ok = rabbit_mnesia:reset_gracefully(),
     rabbit_mnesia:join_cluster(ClusterNodes, NodeType).
 
 join_using_khepri(ClusterNodes, disc) ->