Skip to content

Commit 77d0c90

Browse files
committed
Remove feature flag code handling potential code server deadlocks
These test cases and RPC calls were concerned with deadlocks possible from modifying the code server process from multiple callers. With the switch to persistent_term in the parent commits these kinds of deadlocks are no longer possible. We should keep the RPC calls to `rabbit_ff_registry_wrapper:inventory/0` though for mixed-version compatibility with nodes that use module generation instead of `persistent_term` for their registry.
1 parent 9cd33c8 commit 77d0c90

File tree

2 files changed

+13
-219
lines changed

2 files changed

+13
-219
lines changed

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,8 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
11201120
Rets = inventory_rpcs(Nodes, Timeout),
11211121
maps:fold(
11221122
fun
1123+
(_Node, init_required, {ok, Inventory}) ->
1124+
{ok, Inventory};
11231125
(Node,
11241126
#{feature_flags := FeatureFlags1,
11251127
applications := ScannedApps,
@@ -1144,12 +1146,20 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
11441146
end, {ok, Inventory0}, Rets).
11451147

11461148
inventory_rpcs(Nodes, Timeout) ->
1147-
%% We must use `rabbit_ff_registry_wrapper' if it is available to avoid
1148-
%% any deadlock with the Code server. If it is unavailable, we fall back
1149-
%% to `rabbit_ff_registry'.
1149+
%% In the past, the feature flag registry in `rabbit_ff_registry' was
1150+
%% implemented with a module which was dynamically regenerated and
1151+
%% reloaded. To avoid deadlocks with the Code server we need to first call
1152+
%% into `rabbit_ff_registry_wrapper' if it is available. If it is
1153+
%% unavailable, we fall back to `rabbit_ff_registry'.
11501154
%%
11511155
%% See commit aacfa1978e24bcacd8de7d06a7c3c5d9d8bd098e and pull request
11521156
%% #8155.
1157+
%%
1158+
%% In the long run, when compatibility with nodes that use module creation
1159+
%% for `rabbit_ff_registry' is no longer required, this block can be
1160+
%% replaced with a call of:
1161+
%%
1162+
%% rpc_calls(Nodes, rabbit_ff_registry, inventory, []).
11531163
Rets0 = rpc_calls(
11541164
Nodes,
11551165
rabbit_ff_registry_wrapper, inventory, [], Timeout),

deps/rabbit/test/feature_flags_SUITE.erl

Lines changed: 0 additions & 216 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525

2626
registry_general_usage/1,
2727
registry_concurrent_reloads/1,
28-
try_to_deadlock_in_registry_reload_1/1,
29-
try_to_deadlock_in_registry_reload_2/1,
3028
registry_reset/1,
3129
enable_feature_flag_in_a_healthy_situation/1,
3230
enable_unsupported_feature_flag_in_a_healthy_situation/1,
@@ -104,8 +102,6 @@ groups() ->
104102
[
105103
registry_general_usage,
106104
registry_concurrent_reloads,
107-
try_to_deadlock_in_registry_reload_1,
108-
try_to_deadlock_in_registry_reload_2,
109105
registry_reset
110106
]},
111107
{feature_flags_v2, [], Groups}
@@ -575,218 +571,6 @@ registry_spammer1(FeatureNames) ->
575571
?assertEqual(FeatureNames, ?list_ff(all)),
576572
registry_spammer1(FeatureNames).
577573

578-
try_to_deadlock_in_registry_reload_1(_Config) ->
579-
rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
580-
_ = code:delete(rabbit_ff_registry),
581-
?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
582-
583-
FeatureName = ?FUNCTION_NAME,
584-
FeatureProps = #{provided_by => rabbit,
585-
stability => stable},
586-
587-
Parent = self(),
588-
589-
%% Deadlock steps:
590-
%% * Process A acquires the lock first.
591-
%% * Process B loads the registry stub and waits for the lock.
592-
%% * Process A deletes the registry stub and loads the initialized
593-
%% registry.
594-
%% * Process B wants to purge the deleted registry stub by sending a
595-
%% request to the Code server.
596-
%%
597-
%% => Process B waits forever the return from the Code server because the
598-
%% Code server waits for process B to be runnable again to handle the
599-
%% signal.
600-
601-
ProcessA = spawn_link(
602-
fun() ->
603-
%% Process A acquires the lock manually first to
604-
%% ensure the ordering of events. It can be acquired
605-
%% recursively, so the feature flag injection can
606-
%% "acquire" it again and proceed.
607-
ct:pal("Process A: Acquire registry loading lock"),
608-
Lock =
609-
rabbit_ff_registry_factory:registry_loading_lock(),
610-
global:set_lock(Lock, [node()]),
611-
receive proceed -> ok end,
612-
613-
ct:pal(
614-
"Process A: "
615-
"Inject arbitrary feature flag to reload "
616-
"registry"),
617-
rabbit_feature_flags:inject_test_feature_flags(
618-
#{FeatureName => FeatureProps}),
619-
620-
ct:pal("Process A: Release registry loading lock"),
621-
global:del_lock(Lock, [node()]),
622-
623-
ct:pal("Process A: Exiting..."),
624-
erlang:unlink(Parent)
625-
end),
626-
timer:sleep(500),
627-
628-
ProcessB = spawn_link(
629-
fun() ->
630-
%% Process B is the one loading the registry stub and
631-
%% wants to initialize the real registry.
632-
ct:pal(
633-
"Process B: "
634-
"Trigger automatic initial registry load"),
635-
FF = rabbit_ff_registry_wrapper:get(FeatureName),
636-
637-
ct:pal("Process B: Exiting..."),
638-
erlang:unlink(Parent),
639-
Parent ! {self(), FF}
640-
end),
641-
timer:sleep(500),
642-
643-
begin
644-
{_, StacktraceA} = erlang:process_info(ProcessA, current_stacktrace),
645-
{_, StacktraceB} = erlang:process_info(ProcessB, current_stacktrace),
646-
ct:pal(
647-
"Process stacktraces:~n"
648-
" Process A: ~p~n"
649-
" Process B: ~p",
650-
[StacktraceA, StacktraceB])
651-
end,
652-
653-
%% Process A is resumed. Without a proper check, process B would try to
654-
%% purge the copy of the registry it is currently using itself, causing a
655-
%% deadlock because the Code server wants process A to handle a signal, but
656-
%% process A is not runnable.
657-
ProcessA ! proceed,
658-
659-
ct:pal("Waiting for process B to exit"),
660-
receive
661-
{ProcessB, FF} ->
662-
?assertEqual(FeatureProps#{name => FeatureName}, FF),
663-
ok
664-
after 10000 ->
665-
{_, StacktraceB} = erlang:process_info(
666-
ProcessB, current_stacktrace),
667-
ct:pal("Process B stuck; stacktrace: ~p", [StacktraceB]),
668-
error(registry_reload_deadlock)
669-
end.
670-
671-
try_to_deadlock_in_registry_reload_2(_Config) ->
672-
rabbit_ff_registry_factory:purge_old_registry(rabbit_ff_registry),
673-
_ = code:delete(rabbit_ff_registry),
674-
?assertEqual(false, code:is_loaded(rabbit_ff_registry)),
675-
676-
FeatureName = ?FUNCTION_NAME,
677-
FeatureProps = #{provided_by => rabbit,
678-
stability => stable},
679-
680-
ct:pal("Inject arbitrary feature flag to reload registry"),
681-
rabbit_feature_flags:inject_test_feature_flags(
682-
#{FeatureName => FeatureProps},
683-
false),
684-
685-
_ = erlang:process_flag(trap_exit, true),
686-
687-
ct:pal("Parent ~p: Acquire registry loading lock", [self()]),
688-
Lock = rabbit_ff_registry_factory:registry_loading_lock(),
689-
global:set_lock(Lock, [node()]),
690-
691-
Parent = self(),
692-
693-
%% Deadlock steps:
694-
%% * Processes A, B1 and B2 wait for the lock. The registry stub is loaded.
695-
%% * Process B1 acquires the lock.
696-
%% * Process B1 deletes the registry stub and loads the initialized
697-
%% registry.
698-
%% * Process A acquires the lock.
699-
%% * Process A wants to purge the deleted registry stub by sending a
700-
%% request to the Code server.
701-
%%
702-
%% => Process A waits forever the return from the Code server because the
703-
%% Code server waits for process B2 to stop lingering on the deleted
704-
%% registry stub, but process B2 waits for the lock.
705-
706-
ProcessA = spawn_link(
707-
fun() ->
708-
%% Process A acquires the lock automatically as part
709-
%% of requesting an explicit initialization of the
710-
%% registry. Process A doesn't linger on the registry
711-
%% stub.
712-
ct:pal(
713-
"Process A ~p: "
714-
"Trigger manual initial registry load",
715-
[self()]),
716-
rabbit_ff_registry_factory:initialize_registry(),
717-
718-
ct:pal("Process A ~p: Exiting...", [self()]),
719-
erlang:unlink(Parent),
720-
Parent ! {self(), done}
721-
end),
722-
723-
FunB = fun() ->
724-
%% Processes B1 and B2 acquire the lock automatically as
725-
%% part of trying to load the registry as part of they
726-
%% querying a feature flag.
727-
ct:pal(
728-
"Process B ~p: "
729-
"Trigger automatic initial registry load",
730-
[self()]),
731-
_ = rabbit_ff_registry_wrapper:get(FeatureName),
732-
733-
ct:pal(
734-
"Process B ~p: Exiting...",
735-
[self()]),
736-
erlang:unlink(Parent),
737-
Parent ! {self(), done}
738-
end,
739-
ProcessB1 = spawn_link(FunB),
740-
ProcessB2 = spawn_link(FunB),
741-
timer:sleep(500),
742-
743-
%% We use `erlang:suspend_process/1' and `erlang:resume_process/1' to
744-
%% ensure the order in which processes acquire the lock.
745-
erlang:suspend_process(ProcessA),
746-
erlang:suspend_process(ProcessB1),
747-
erlang:suspend_process(ProcessB2),
748-
timer:sleep(500),
749-
750-
ct:pal("Parent ~p: Release registry loading lock", [self()]),
751-
global:del_lock(Lock, [node()]),
752-
753-
erlang:resume_process(ProcessB1),
754-
timer:sleep(500),
755-
erlang:resume_process(ProcessA),
756-
timer:sleep(500),
757-
erlang:resume_process(ProcessB2),
758-
759-
ct:pal("Waiting for processes to exit"),
760-
Procs = [ProcessA, ProcessB1, ProcessB2],
761-
lists:foreach(
762-
fun(Pid) ->
763-
receive
764-
{Pid, done} ->
765-
ok;
766-
{'EXIT', Pid, Reason} ->
767-
ct:pal("Process ~p exited; reason: ~p", [Pid, Reason]),
768-
error(test_process_killed)
769-
after 10000 ->
770-
lists:foreach(
771-
fun(Pid1) ->
772-
PI = erlang:process_info(
773-
Pid1, current_stacktrace),
774-
case PI of
775-
undefined ->
776-
ok;
777-
{_, Stacktrace} ->
778-
ct:pal(
779-
"Process ~p stuck; "
780-
"stacktrace: ~p",
781-
[Pid1, Stacktrace])
782-
end
783-
end, Procs),
784-
error(registry_reload_deadlock)
785-
end
786-
end, Procs),
787-
788-
ok.
789-
790574
registry_reset(_Config) ->
791575
%% At first, the registry must be uninitialized.
792576
?assertNot(rabbit_ff_registry:is_registry_initialized()),

0 commit comments

Comments
 (0)