Skip to content

Commit 76c9c37

Browse files
dumbbell authored and
michaelklishin committed
rabbit_db: Restart Ra systems after reset during join
[Why] When the local node joins a remote node, it resets its own data first. This includes the files of the Ra systems (`quorum` and `coordination`). When the CLI is used, that's fine because the `rabbit` app is stopped, and therefore so are the Ra systems. However, when the join is performed as part of peer discovery, the node is still booting: the Ra systems were started earlier because they are required to run Khepri. Therefore, the reset deletes files that are still in use, which breaks the Ra systems. [How] The Ra systems are stopped just before the reset (if the join is performed as part of peer discovery) and are restarted afterwards.
1 parent a308edc commit 76c9c37

File tree

1 file changed

+35
-1
lines changed

1 file changed

+35
-1
lines changed

deps/rabbit/src/rabbit_db_cluster.erl

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,22 @@ join(RemoteNode, NodeType)
101101
%% database because we might change it during the join.
102102
RestartMnesia = rabbit_mnesia:is_running(),
103103
RestartFFCtl = rabbit_ff_controller:is_running(),
104+
RestartRaSystems = rabbit_ra_systems:are_running(),
104105
RestartRabbit = rabbit:is_running(),
105106
case RestartRabbit of
106107
true ->
107108
rabbit:stop();
108109
false ->
110+
%% The Ra systems were started before we initialize the
111+
%% database (because Khepri depends on one of them).
112+
%% Therefore, there are files in the data directory. They
113+
%% will go away with the reset and we will need to restart
114+
%% Ra systems afterwards.
115+
case RestartRaSystems of
116+
true -> ok = rabbit_ra_systems:ensure_stopped();
117+
false -> ok
118+
end,
119+
109120
case RestartFFCtl of
110121
true ->
111122
ok = rabbit_ff_controller:wait_for_task_and_stop();
@@ -136,6 +147,30 @@ join(RemoteNode, NodeType)
136147
rabbit_ff_registry_factory:release_state_change_lock()
137148
end,
138149

150+
%% After the regular reset, we also reset Mnesia specifically if
151+
%% it is meant to be used. That's because we may switch back from
152+
%% Khepri to Mnesia. To be safe, remove possibly stale files from
153+
%% a previous instance where Mnesia was used.
154+
case rabbit_khepri:is_enabled(RemoteNode) of
155+
true -> ok;
156+
false -> ok = rabbit_mnesia:reset_gracefully()
157+
end,
158+
159+
%% Now that the files are all gone after the reset above, restart
160+
%% the Ra systems. They will recreate their folder in the process.
161+
case RestartRabbit of
162+
true ->
163+
ok;
164+
false ->
165+
case RestartRaSystems of
166+
true ->
167+
ok = rabbit_ra_systems:ensure_started(),
168+
ok = rabbit_khepri:setup();
169+
false ->
170+
ok
171+
end
172+
end,
173+
139174
?LOG_INFO(
140175
"DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
141176
#{domain => ?RMQLOG_DOMAIN_DB}),
@@ -182,7 +217,6 @@ join(RemoteNode, NodeType)
182217
end.
183218

184219
join_using_mnesia(ClusterNodes, NodeType) when is_list(ClusterNodes) ->
185-
ok = rabbit_mnesia:reset_gracefully(),
186220
rabbit_mnesia:join_cluster(ClusterNodes, NodeType).
187221

188222
join_using_khepri(ClusterNodes, disc) ->

0 commit comments

Comments
 (0)