
Commit 1953cfd

Verify that the process is alive in lookup/2
Fixes a data race between an ets read in lookup/2 and the ets write from the 'DOWN' handler of a registered process, which causes intermittent failures in supervision strategies.

In practice, the problem can occur quite often in tests that assert the init behaviour of gen_servers. In production, the `already_started` errors can cause a single failure to cascade and restart the parent supervisor.

A similar approach is taken in [Elixir's Registry](https://github.com/elixir-lang/elixir/blob/e35ffc5a903bff3b595e323eb1ac12c4ecd515ad/lib/elixir/lib/registry.ex#L243), which is also backed by ets.

Fixes #48
1 parent cf2b318 commit 1953cfd
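
To make the supervision scenario described above concrete, here is a minimal sketch, not part of this commit: a worker that registers itself through syn's {via, syn, {Scope, Name, Meta}} tuple (mirroring the form used in the test below) and exposes an OTP child spec. When such a worker dies, its supervisor restarts it immediately; before this fix, that restart could race with syn's 'DOWN' handling and intermittently come back as {error, {already_started, DeadPid}}. The module name my_worker and the name/metadata values are hypothetical.

%% Hedged sketch: a syn via-registered worker with an OTP child spec.
%% Module, name and metadata below are illustrative, not part of syn.
-module(my_worker).
-behaviour(gen_server).

-export([start_link/0, child_spec/0]).
-export([init/1, handle_call/3, handle_cast/2]).

start_link() ->
    %% Every (re)start goes through syn's via callbacks, which read the
    %% same by-name ets table that lookup/2 reads.
    gen_server:start_link({via, syn, {scope, <<"my worker">>, undefined}}, ?MODULE, [], []).

child_spec() ->
    #{id => my_worker,
      start => {?MODULE, start_link, []},
      restart => permanent,
      type => worker}.

init([]) ->
    {ok, #{}}.

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.

Supervised with supervisor:start_child(Sup, my_worker:child_spec()) under a one_for_one strategy, a crash-and-restart cycle of this worker drives the lookup path patched below.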

2 files changed: +33 -3 lines changed


src/syn_registry.erl

Lines changed: 14 additions & 1 deletion
@@ -78,7 +78,20 @@ lookup(Scope, Name) ->
         TableByName ->
             case find_registry_entry_by_name(Name, TableByName) of
                 undefined -> undefined;
-                {Name, Pid, Meta, _, _, _} -> {Pid, Meta}
+                {Name, Pid, Meta, _, _, _} ->
+                    % This read can be initiated prior to registration, by a
+                    % supervisor or supervisor-like process trying to restart a
+                    % stopped process in response to a 'DOWN', while the 'DOWN'
+                    % handler in this module is yet to update TableByName.
+                    %
+                    % Verifying aliveness avoids confusing already_started
+                    % errors while restarting registered processes.
+                    % The aliveness check is only necessary when the Pid is
+                    % local.
+                    case erlang:node(Pid) =/= erlang:node() orelse erlang:is_process_alive(Pid) of
+                        true -> {Pid, Meta};
+                        false -> undefined
+                    end
             end
     end.
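
Restated outside the diff, the added guard amounts to the helper below (module and function names are hypothetical, for illustration only). The node comparison is what makes the check safe: erlang:is_process_alive/1 only accepts pids local to the calling node and raises badarg for remote ones, so remote entries are returned as-is and left to the owning node's 'DOWN' propagation, as the comment in the diff notes.

%% Hedged restatement of the aliveness guard added above; not part of syn's API.
-module(registry_liveness).
-export([alive_or_remote/1]).

-spec alive_or_remote(pid()) -> boolean().
alive_or_remote(Pid) when is_pid(Pid) ->
    %% Short-circuit on remote pids: is_process_alive/1 is only defined for
    %% local pids, so remote entries are trusted as written.
    erlang:node(Pid) =/= erlang:node() orelse erlang:is_process_alive(Pid).

With this helper the new clause reads as: return {Pid, Meta} only when alive_or_remote(Pid) holds, otherwise undefined, mirroring the liveness check the commit message cites from Elixir's Registry.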

test/syn_registry_SUITE.erl

Lines changed: 19 additions & 2 deletions
@@ -35,7 +35,8 @@
 -export([
     one_node_via_register_unregister/1,
     one_node_via_register_unregister_with_metadata/1,
-    one_node_strict_mode/1
+    one_node_strict_mode/1,
+    one_node_repeated_restart/1
 ]).
 -export([
     three_nodes_discover/1,
@@ -90,7 +91,8 @@ groups() ->
     {one_node_registry, [shuffle], [
         one_node_via_register_unregister,
         one_node_via_register_unregister_with_metadata,
-        one_node_strict_mode
+        one_node_strict_mode,
+        one_node_repeated_restart
     ]},
     {three_nodes_registry, [shuffle], [
         three_nodes_discover,
@@ -283,6 +285,21 @@ one_node_strict_mode(_Config) ->
     ok = syn:register(scope, "strict-true", Self),
     {Self, undefined} = syn:lookup(scope, "strict-true").
 
+one_node_repeated_restart(_Config) ->
+    %% start syn
+    ok = syn:start(),
+    syn:add_node_to_scopes([scope]),
+    ViaTuple = {via, syn, {scope, <<"my proc">>, my_metadata}},
+    % Data races between the 'DOWN' handler and pre-registration reads
+    % cause intermittent failures. Repeat count 100 is heuristically chosen as
+    % it consistently surfaced the problem at the time of writing.
+    RepeatCount = 100,
+    StartStop = fun(_) ->
+        {ok, Pid} = syn_test_gen_server:start_link(ViaTuple),
+        gen_server:stop(Pid)
+    end,
+    lists:foreach(StartStop, lists:duplicate(RepeatCount, 0)).
+
 three_nodes_discover(Config) ->
     %% get slaves
     SlaveNode1 = proplists:get_value(syn_slave_1, Config),
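
The loop above relies on syn_test_gen_server, a helper defined elsewhere in the test support code and not shown in this diff. As a hedged, self-contained stand-in (the real module may differ), it only needs to forward the via tuple, metadata included, to gen_server:start_link/4:

%% Hedged stand-in for the test helper used above; the real syn_test_gen_server
%% lives elsewhere in the repository and may differ.
-module(syn_test_gen_server_sketch).
-behaviour(gen_server).

-export([start_link/1]).
-export([init/1, handle_call/3, handle_cast/2]).

start_link(ViaTuple) ->
    %% ViaTuple is e.g. {via, syn, {scope, <<"my proc">>, my_metadata}};
    %% syn registers the new pid under that scope and name, with the metadata.
    gen_server:start_link(ViaTuple, ?MODULE, [], []).

init([]) ->
    {ok, #{}}.

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.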
