
Commit 1953cfd

Verify that the process is alive in lookup/2
Fixes a data race between an ets read in lookup/2 and the ets write from the 'DOWN' handler of a registered process, which causes intermittent failures in supervision strategies.

In practice, the problem can occur quite often in tests that assert the init behaviour of gen_servers. In production, the `already_started` errors can cause a single failure to cascade and restart the parent supervisor.

A similar approach is taken in [Elixir's Registry](https://github.com/elixir-lang/elixir/blob/e35ffc5a903bff3b595e323eb1ac12c4ecd515ad/lib/elixir/lib/registry.ex#L243), which is also backed by ets.

Fixes #48
1 parent cf2b318 commit 1953cfd
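
To make the supervision scenario described above concrete, here is a minimal sketch, not part of this commit: a worker that registers itself through syn's {via, syn, {Scope, Name, Meta}} tuple (mirroring the form used in the test below) and exposes an OTP child spec. When such a worker dies, its supervisor restarts it immediately; before this fix, that restart could race with syn's 'DOWN' handling and intermittently come back as {error, {already_started, DeadPid}}. The module name my_worker and the name/metadata values are hypothetical.

%% Hedged sketch: a syn via-registered worker with an OTP child spec.
%% Module, name and metadata below are illustrative, not part of syn.
-module(my_worker).
-behaviour(gen_server).

-export([start_link/0, child_spec/0]).
-export([init/1, handle_call/3, handle_cast/2]).

start_link() ->
    %% Every (re)start goes through syn's via callbacks, which read the
    %% same by-name ets table that lookup/2 reads.
    gen_server:start_link({via, syn, {scope, <<"my worker">>, undefined}}, ?MODULE, [], []).

child_spec() ->
    #{id => my_worker,
      start => {?MODULE, start_link, []},
      restart => permanent,
      type => worker}.

init([]) ->
    {ok, #{}}.

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.

Supervised with supervisor:start_child(Sup, my_worker:child_spec()) under a one_for_one strategy, a crash-and-restart cycle of this worker drives the lookup path patched below.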

2 files changed: +33 -3 lines changed


src/syn_registry.erl

Lines changed: 14 additions & 1 deletion
@@ -78,7 +78,20 @@ lookup(Scope, Name) ->
         TableByName ->
             case find_registry_entry_by_name(Name, TableByName) of
                 undefined -> undefined;
-                {Name, Pid, Meta, _, _, _} -> {Pid, Meta}
+                {Name, Pid, Meta, _, _, _} ->
+                    % This read can be initiated prior to registration, by a
+                    % supervisor or supervisor-like process trying to restart a
+                    % stopped process in response to a 'DOWN', while the 'DOWN'
+                    % handler in this module is yet to update TableByName.
+                    %
+                    % Verifying aliveness avoids confusing already_started
+                    % errors while restarting registered processes.
+                    % The aliveness check is only necessary when the Pid is
+                    % local.
+                    case erlang:node(Pid) =/= erlang:node() orelse erlang:is_process_alive(Pid) of
+                        true -> {Pid, Meta};
+                        false -> undefined
+                    end
             end
     end.
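
Restated outside the diff, the added guard amounts to the helper below (module and function names are hypothetical, for illustration only). The node comparison is what makes the check safe: erlang:is_process_alive/1 only accepts pids local to the calling node and raises badarg for remote ones, so remote entries are returned as-is and left to the owning node's 'DOWN' propagation, as the comment in the diff notes.

%% Hedged restatement of the aliveness guard added above; not part of syn's API.
-module(registry_liveness).
-export([alive_or_remote/1]).

-spec alive_or_remote(pid()) -> boolean().
alive_or_remote(Pid) when is_pid(Pid) ->
    %% Short-circuit on remote pids: is_process_alive/1 is only defined for
    %% local pids, so remote entries are trusted as written.
    erlang:node(Pid) =/= erlang:node() orelse erlang:is_process_alive(Pid).

With this helper the new clause reads as: return {Pid, Meta} only when alive_or_remote(Pid) holds, otherwise undefined, mirroring the liveness check the commit message cites from Elixir's Registry.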

test/syn_registry_SUITE.erl

Lines changed: 19 additions & 2 deletions
@@ -35,7 +35,8 @@
 -export([
     one_node_via_register_unregister/1,
     one_node_via_register_unregister_with_metadata/1,
-    one_node_strict_mode/1
+    one_node_strict_mode/1,
+    one_node_repeated_restart/1
 ]).
 -export([
     three_nodes_discover/1,
@@ -90,7 +91,8 @@ groups() ->
     {one_node_registry, [shuffle], [
         one_node_via_register_unregister,
         one_node_via_register_unregister_with_metadata,
-        one_node_strict_mode
+        one_node_strict_mode,
+        one_node_repeated_restart
     ]},
     {three_nodes_registry, [shuffle], [
         three_nodes_discover,
@@ -283,6 +285,21 @@ one_node_strict_mode(_Config) ->
     ok = syn:register(scope, "strict-true", Self),
     {Self, undefined} = syn:lookup(scope, "strict-true").
 
+one_node_repeated_restart(_Config) ->
+    %% start syn
+    ok = syn:start(),
+    syn:add_node_to_scopes([scope]),
+    ViaTuple = {via, syn, {scope, <<"my proc">>, my_metadata}},
+    % Data races between the 'DOWN' handler and pre-registration reads
+    % cause intermittent failures. Repeat count 100 is heuristically chosen as
+    % it consistently surfaced the problem at the time of writing.
+    RepeatCount = 100,
+    StartStop = fun(_) ->
+        {ok, Pid} = syn_test_gen_server:start_link(ViaTuple),
+        gen_server:stop(Pid)
+    end,
+    lists:foreach(StartStop, lists:duplicate(RepeatCount, 0)).
+
 three_nodes_discover(Config) ->
     %% get slaves
     SlaveNode1 = proplists:get_value(syn_slave_1, Config),
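
The loop above relies on syn_test_gen_server, a helper defined elsewhere in the test support code and not shown in this diff. As a hedged, self-contained stand-in (the real module may differ), it only needs to forward the via tuple, metadata included, to gen_server:start_link/4:

%% Hedged stand-in for the test helper used above; the real syn_test_gen_server
%% lives elsewhere in the repository and may differ.
-module(syn_test_gen_server_sketch).
-behaviour(gen_server).

-export([start_link/1]).
-export([init/1, handle_call/3, handle_cast/2]).

start_link(ViaTuple) ->
    %% ViaTuple is e.g. {via, syn, {scope, <<"my proc">>, my_metadata}};
    %% syn registers the new pid under that scope and name, with the metadata.
    gen_server:start_link(ViaTuple, ?MODULE, [], []).

init([]) ->
    {ok, #{}}.

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.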
