From 01fb787902a61d1bfbbe335c67745b6a6e6d21dc Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Wed, 27 May 2026 15:38:41 -0700
Subject: [PATCH 1/2] Add script to reproduce SegFault

---
 .../repro_lifted_edges_segfault.py            | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 development/graph/lifted_multicut/repro_lifted_edges_segfault.py

diff --git a/development/graph/lifted_multicut/repro_lifted_edges_segfault.py b/development/graph/lifted_multicut/repro_lifted_edges_segfault.py
new file mode 100644
index 0000000..b1f6465
--- /dev/null
+++ b/development/graph/lifted_multicut/repro_lifted_edges_segfault.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+"""Self-contained reproduction of the bug in
+``bioimage_cpp.graph.lifted_multicut.lifted_edges_from_node_labels``.
+
+The function misbehaves once the graph grows past a few hundred nodes. The bug is
+*nondeterministic* and shows up in two ways on the very same (deterministic) input:
+
+  1. It intermittently SIGSEGVs.
+  2. When it does return, the number of lifted edges varies from run to run and disagrees
+     with the reference ``nifty.distributed.liftedNeighborhoodFromNodeLabels`` (which is
+     stable). E.g. for a 2000-node chain at depth 3 we have seen 3288, 3837, ... while nifty
+     consistently returns 3993.
+
+A varying result for fixed input plus occasional crashes is the classic signature of a memory
+error (out-of-bounds read/write) in the C++ implementation. It reproduces with a trivial chain
+graph -- no RAG or production-scale data required -- and is independent of node-label values,
+``graph_depth`` and ``mode``. The RegionAdjacencyGraph path tends to crash the most reliably.
+
+Minimal trigger (run on its own a few times: some runs crash, others print a different count):
+
+    import numpy as np
+    import bioimage_cpp as bic
+
+    n = 2000
+    uv = np.array([(i, i + 1) for i in range(n - 1)], dtype="uint64")  # a simple chain
+    g = bic.graph.UndirectedGraph.from_edges(n, uv)
+    out = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+        g, np.zeros(n, "uint64"), graph_depth=3, mode="all")
+    print(len(out))   # -> Segmentation fault, or a different number each run
+
+Each configuration below is run several times, each in its own child process, so a crash does
+not abort the sweep and the run-to-run variation is visible.
+
+Run it with:
+
+    python repro_lifted_edges_segfault.py
+"""
+import multiprocessing as mp
+import queue as _queue
+import signal
+
+import numpy as np
+import bioimage_cpp as bic
+
+try:
+    import nifty.distributed as ndist
+except ImportError:
+    ndist = None
+
+GRAPH_DEPTH = 3
+MODE = "all"
+NODE_LADDER = (100, 500, 1000, 2000)
+REPS = 5
+
+
+def _chain_edges(n_nodes):
+    """A simple connected chain 0-1-2-...-(n-1); enough to trigger the bug."""
+    return np.array([(i, i + 1) for i in range(n_nodes - 1)], dtype="uint64")
+
+
+def _chain_segmentation(n_nodes):
+    """A labeled volume whose region adjacency graph is the same chain of ``n_nodes`` nodes."""
+    return np.repeat(np.arange(n_nodes, dtype="uint32"), 16).reshape(n_nodes, 4, 4)
+
+
+# --- workers (each invocation runs in its own process) -------------------------------------
+def _bic_undirected(n_nodes, q):
+    g = bic.graph.UndirectedGraph.from_edges(n_nodes, _chain_edges(n_nodes))
+    node_labels = np.ones(n_nodes, dtype="uint64")  # label values are irrelevant to the bug
+    out = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+        g, node_labels, graph_depth=GRAPH_DEPTH, mode=MODE)
+    q.put(len(out))
+
+
+def _bic_rag(n_nodes, q):
+    rag = bic.graph.region_adjacency_graph(_chain_segmentation(n_nodes))
+    node_labels = np.ones(rag.numberOfNodes, dtype="uint64")
+    out = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+        rag, node_labels, graph_depth=GRAPH_DEPTH, mode=MODE)
+    q.put(len(out))
+
+
+def _nifty_undirected(n_nodes, q):
+    g = ndist.Graph(_chain_edges(n_nodes))
+    node_labels = np.ones(n_nodes, dtype="uint64")  # non-zero so ignoreLabel=0 keeps every pair
+    out = ndist.liftedNeighborhoodFromNodeLabels(
+        g, node_labels, GRAPH_DEPTH, mode=MODE, numberOfThreads=1, ignoreLabel=0)
+    q.put(len(out))
+
+
+def _run_once(worker, n_nodes, timeout=120):
+    """Run ``worker(n_nodes, queue)`` in a child process; return its lifted count or a status."""
+    q = mp.Queue()
+    p = mp.Process(target=worker, args=(n_nodes, q))
+    p.start()
+    p.join(timeout)
+    if p.is_alive():
+        p.terminate()
+        p.join()
+        return "timeout"
+    if p.exitcode == -signal.SIGSEGV:
+        return "segfault"
+    if p.exitcode == -signal.SIGABRT:
+        return "abort"  # glibc "double free or corruption"
+    if p.exitcode != 0:
+        return f"exit {p.exitcode}"
+    try:
+        return q.get(timeout=5)
+    except _queue.Empty:
+        return "no-result"
+
+
+def _summary(worker, n_nodes):
+    """Run ``worker`` REPS times and summarise crashes and the distinct lifted-edge counts."""
+    results = [_run_once(worker, n_nodes) for _ in range(REPS)]
+    crash_labels = ("segfault", "abort")
+    crashes = sum(1 for r in results if r in crash_labels)
+    counts = sorted({r for r in results if isinstance(r, int)})
+    others = [r for r in results if not isinstance(r, int) and r not in crash_labels]
+
+    parts = []
+    if crashes:
+        kinds = "/".join(sorted({r.upper() for r in results if r in crash_labels}))
+        parts.append(f"{crashes}/{REPS} {kinds}")
+    if counts:
+        flag = "  <- NONDETERMINISTIC" if len(counts) > 1 else ""
+        parts.append("lifted=" + ",".join(map(str, counts)) + flag)
+    parts.extend(sorted(set(others)))
+    return "; ".join(parts) if parts else "no output"
+
+
+def main():
+    print("Reproducing the lifted_edges_from_node_labels bug "
+          f"(graph_depth={GRAPH_DEPTH}, mode={MODE!r}, {REPS} runs per case).\n")
+
+    workers = [("bic  UndirectedGraph     ", _bic_undirected),
+               ("bic  RegionAdjacencyGraph", _bic_rag)]
+    if ndist is not None:
+        workers.append(("nifty (reference)        ", _nifty_undirected))
+
+    for n in NODE_LADDER:
+        print(f"=== {n} nodes ===")
+        for name, worker in workers:
+            print(f"  {name} : {_summary(worker, n)}")
+        print()
+
+    print("Reference (nifty) returns a single stable count; bic varies and/or crashes -> "
+          "memory corruption in lifted_edges_from_node_labels.")
+
+
+if __name__ == "__main__":
+    # "spawn" gives each case a fresh interpreter, so a crash is cleanly attributed.
+    mp.set_start_method("spawn", force=True)
+    main()

From 0a129408b6cf0135a52db57852fad0b4ca9123b9 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Wed, 27 May 2026 21:36:07 -0700
Subject: [PATCH 2/2] Thread safety in graph access

---
 AGENTS.md                                     | 26 +++++++++++
 MIGRATION_GUIDE.md                            | 15 +++++--
 .../graph/lifted_multicut/fusion_move.hxx     |  8 ++++
 .../lifted_from_node_labels.hxx               |  6 +++
 .../graph/multicut/fusion_move.hxx            |  7 +++
 .../bioimage_cpp/graph/undirected_graph.hxx   | 26 ++++++++---
 src/bindings/graph.cxx                        | 10 ++++-
 src/bioimage_cpp/graph/__init__.py            | 19 ++++++++
 .../graph/lifted_multicut/test_fusion_move.py | 44 +++++++++++++++++++
 .../test_lifted_edges_from_node_labels.py     | 38 ++++++++++++++++
 tests/graph/multicut/test_fusion_move.py      | 39 ++++++++++++++++
 11 files changed, 227 insertions(+), 11 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index cd4b502..9f0e0f8 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -60,6 +60,32 @@ new fusion-move / proposal-based / contraction-based solver:
 - `detail/threading.hxx::parallel_for_chunks` — the only threading primitive
   we use. New parallel solvers should not introduce alternatives.
 
+### Freeze graphs before parallel fan-out (thread-safety contract)
+
+`UndirectedGraph` (and its subclasses) build the CSR adjacency *lazily*: the
+first `node_adjacency` read on a graph built incrementally rebuilds it through
+a `mutable` write inside a `const` method, and that rebuild is **not
+thread-safe**. Graphs are "dirty" (not yet built) after `insert_edge`-based
+construction — this includes the `from_edges` binding, `region_adjacency_graph`,
+and `GridGraph` (which defers the build on purpose). `from_sorted_unique_edges`
+returns an already-frozen graph.
+
+Rule: **any algorithm that reads `node_adjacency` from `parallel_for_chunks`
+(or other threads) MUST call `graph.freeze()` on the calling thread before the
+fan-out.** "Reads `node_adjacency`" includes the indirect readers
+`breadth_first_search`, `extract_subgraph_from_nodes`, and the multicut
+sub-solver `greedy_additive` (via `DynamicGraph::reset`, which sizes per-node
+adjacency from `graph.node_adjacency(node)`). Edge-only iteration
+(`uv`, `uv_ids`, `number_of_edges`) and `find_edge` (edge-lookup hashmap) do
+*not* trigger the rebuild and need no freeze. Symptoms when this is missed:
+nondeterministic results for fixed input and intermittent segfaults.
+
+Canonical fixed call sites that already follow this (copy the pattern):
+`lifted_multicut/lifted_from_node_labels.hxx` and both
+`{multicut,lifted_multicut}/fusion_move.hxx::FusionMoveSolver::optimize`
+(the lifted driver freezes the *base* graph — its proposal generators read base
+adjacency, while its warm-start only touches the lifted graph).
+
 When porting fusion moves to a new objective (e.g. lifted multicut):
 
 1. The driver loop in `multicut/fusion_move.hxx::FusionMoveSolver::optimize`
diff --git a/MIGRATION_GUIDE.md b/MIGRATION_GUIDE.md
index 6c12a90..99f5dad 100644
--- a/MIGRATION_GUIDE.md
+++ b/MIGRATION_GUIDE.md
@@ -430,10 +430,17 @@ Important differences:
 - `graph.clone()` returns an independent deep copy. The C++ class is
   move-only (it owns a CSR adjacency buffer), so prefer this over
   reassignment-by-value.
-- `graph.freeze()` eagerly builds the internal adjacency. Call it after a
-  batch of `insert_edge` calls if you intend to hand the graph to multiple
-  reader threads, or if you want to ensure subsequent `node_adjacency`
-  reads carry no first-call rebuild cost.
+- The internal adjacency is built *lazily* on the first `node_adjacency`
+  read, and that lazy build is **not thread-safe**. The built-in
+  multi-threaded algorithms freeze the graph internally before fanning out, so
+  passing a graph straight into them is safe. But if you build a graph and then
+  share it across **your own** threads (concurrent `node_adjacency` reads, a
+  BFS, etc.), call `graph.freeze()` once on the construction thread first —
+  racing the first read across threads corrupts the adjacency (nondeterministic
+  results, possible crashes). `freeze()` eagerly builds the adjacency and is a
+  no-op once built; it also removes the first-call rebuild cost from later
+  `node_adjacency` reads. This applies to all graph types (`GridGraph2D`,
+  `GridGraph3D`, `RegionAdjacencyGraph`).
 
 Common method/property mapping:
 
diff --git a/include/bioimage_cpp/graph/lifted_multicut/fusion_move.hxx b/include/bioimage_cpp/graph/lifted_multicut/fusion_move.hxx
index 5e791d2..5f91316 100644
--- a/include/bioimage_cpp/graph/lifted_multicut/fusion_move.hxx
+++ b/include/bioimage_cpp/graph/lifted_multicut/fusion_move.hxx
@@ -91,6 +91,14 @@ public:
             return objective.labels();
         }
 
+        // Proposal generators read base_graph.node_adjacency() concurrently in the
+        // stage-1 parallel region (the greedy-additive generator does, via
+        // DynamicGraph::reset). The lazy CSR rebuild is not thread-safe, and the
+        // warm-start below freezes the *lifted* graph, not the base graph, so freeze
+        // the base graph on this thread before fan-out. See UndirectedGraph
+        // thread-safety. (The lifted graph is only read by edge iteration here.)
+        base_graph.freeze();
+
         const auto effective_threads = ::bioimage_cpp::detail::normalize_thread_count(
             number_of_threads_, number_of_parallel_proposals_
         );
diff --git a/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx b/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx
index 438a57c..5663eb3 100644
--- a/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx
+++ b/include/bioimage_cpp/graph/lifted_multicut/lifted_from_node_labels.hxx
@@ -60,6 +60,12 @@ std::vector<bioimage_cpp::detail::Edge> lifted_edges_from_node_labels(
         return {};
     }
 
+    // The CSR adjacency is rebuilt lazily on the first node_adjacency() read and
+    // that rebuild is not thread-safe. Freeze it on this thread before the
+    // parallel BFS fan-out below so worker threads only ever do const reads of
+    // an already-built adjacency (see the UndirectedGraph thread-safety notes).
+    graph.freeze();
+
     const auto n_threads = bioimage_cpp::detail::normalize_thread_count(
         number_of_threads, n_nodes
     );
diff --git a/include/bioimage_cpp/graph/multicut/fusion_move.hxx b/include/bioimage_cpp/graph/multicut/fusion_move.hxx
index 1cdd3e4..4377e08 100644
--- a/include/bioimage_cpp/graph/multicut/fusion_move.hxx
+++ b/include/bioimage_cpp/graph/multicut/fusion_move.hxx
@@ -77,6 +77,13 @@ public:
             return objective.labels();
         }
 
+        // Proposal generators may read graph.node_adjacency() concurrently in the
+        // stage-1 parallel region (the greedy-additive generator does, via
+        // DynamicGraph::reset). The lazy CSR rebuild is not thread-safe, and the
+        // warm-start below only freezes the graph for a singleton initial labeling,
+        // so freeze on this thread before fan-out. See UndirectedGraph thread-safety.
+        graph.freeze();
+
         // One workspace per worker thread; reused across the warm-start, every
         // pairwise fuse, and the stage-2 joint fuse.
         const auto effective_threads = ::bioimage_cpp::detail::normalize_thread_count(
diff --git a/include/bioimage_cpp/graph/undirected_graph.hxx b/include/bioimage_cpp/graph/undirected_graph.hxx
index 8eb8f25..1fac051 100644
--- a/include/bioimage_cpp/graph/undirected_graph.hxx
+++ b/include/bioimage_cpp/graph/undirected_graph.hxx
@@ -37,12 +37,22 @@ struct Adjacency {
 // `rebuild_adjacency_from_edges()` explicitly, which keeps reads cheap and
 // thread-safe.
 //
-// Thread safety: as long as a graph is "frozen" before being shared with
-// reader threads (no concurrent inserts, no first-read-from-dirty-state
-// race), reads of `node_adjacency` are safe to share across threads. The
-// lazy rebuild is not internally synchronized — call
-// `rebuild_adjacency_from_edges()` once on the construction thread before
-// fan-out if you built the graph via `insert_edge*`.
+// Thread safety: the lazy rebuild is not internally synchronized. If two
+// threads each take the first `node_adjacency` read on a still-dirty graph
+// they race on the rebuild — concurrently reallocating `adjacency_data_` and
+// overwriting `adjacency_offsets_` — which corrupts the CSR (garbage neighbor
+// ids, out-of-bounds reads) and intermittently segfaults. The rule:
+//
+//   Any algorithm that reads `node_adjacency` (directly, or via
+//   `breadth_first_search`, `extract_subgraph_from_nodes`, or a sub-solver
+//   such as `multicut::greedy_additive`'s `DynamicGraph::reset`) from
+//   `parallel_for_chunks` or other threads MUST `freeze()` the graph on the
+//   calling thread *before* the fan-out.
+//
+// Once frozen (or built via `from_sorted_unique_edges`, which rebuilds the CSR
+// eagerly), the graph has no mutable read path and is safe to share by
+// `const&` across reader threads. Graphs built incrementally via `insert_edge*`
+// (including the `from_edges` binding and `region_adjacency_graph`) start dirty.
 class UndirectedGraph {
 public:
     using NodeId = std::uint64_t;
@@ -139,6 +149,10 @@ public:
         return edges_;
     }
 
+    // Adjacency slice of `node`. The first call on a dirty graph triggers a
+    // non-thread-safe lazy CSR rebuild (mutable write through this `const`
+    // method); call `freeze()` on the construction thread before sharing the
+    // graph with concurrent readers. See the class-level thread-safety note.
     [[nodiscard]] AdjacencyList node_adjacency(const NodeId node) const {
         validate_node(node);
         ensure_adjacency_built();
diff --git a/src/bindings/graph.cxx b/src/bindings/graph.cxx
index 39b62ad..5cd0f99 100644
--- a/src/bindings/graph.cxx
+++ b/src/bindings/graph.cxx
@@ -1599,7 +1599,15 @@ void bind_graph(nb::module_ &m) {
             nb::arg("nodes")
         )
         .def("edges_from_node_list", &graph_edges_from_node_list, nb::arg("nodes"))
-        .def("freeze", &Graph::freeze)
+        .def(
+            "freeze",
+            &Graph::freeze,
+            "Build the internal adjacency representation now (it is otherwise "
+            "built lazily on first use). Call this on the construction thread "
+            "before sharing the graph with concurrent reader threads: the lazy "
+            "build is not thread-safe. No-op if already built; safe to call "
+            "repeatedly."
+        )
         .def("clone", &Graph::clone)
         .def_static(
             "from_edges",
diff --git a/src/bioimage_cpp/graph/__init__.py b/src/bioimage_cpp/graph/__init__.py
index b21e9f0..65e4f33 100644
--- a/src/bioimage_cpp/graph/__init__.py
+++ b/src/bioimage_cpp/graph/__init__.py
@@ -19,6 +19,20 @@
   (with and without semantic constraints).
 - :mod:`bioimage_cpp.graph.features` — edge-feature accumulation on RAGs and
   grid graphs.
+
+Thread safety
+-------------
+All graph types (:class:`UndirectedGraph`, :class:`GridGraph2D`,
+:class:`GridGraph3D`, :class:`RegionAdjacencyGraph`) build their internal
+adjacency representation *lazily*, on the first call that reads it. The
+built-in multi-threaded algorithms freeze the graph internally before fanning
+out, so passing a graph straight into them is safe and needs no extra step.
+
+If you build a graph yourself and then share it across **your own** threads
+(reading adjacency, running a BFS, etc. concurrently), call ``graph.freeze()``
+once on the construction thread first: the lazy build is not thread-safe, and
+racing the first read across threads corrupts the adjacency. ``freeze()`` is a
+no-op on an already-built graph.
 """
 
 from __future__ import annotations
@@ -46,6 +60,11 @@ class UndirectedGraph(_core.UndirectedGraph):
     ``0 .. number_of_nodes - 1``. Edges are inserted lazily and receive
     consecutive ids in insertion order. Re-inserting an existing undirected edge
     returns the existing edge id.
+
+    The adjacency representation is built lazily on first use. Before sharing a
+    freshly built graph across threads of your own, call :meth:`freeze` once on
+    the construction thread — see the module-level "Thread safety" note. The
+    built-in multi-threaded algorithms already freeze internally.
     """
 
     def insert_edges(self, uvs):
diff --git a/tests/graph/lifted_multicut/test_fusion_move.py b/tests/graph/lifted_multicut/test_fusion_move.py
index 07d9974..3bda8eb 100644
--- a/tests/graph/lifted_multicut/test_fusion_move.py
+++ b/tests/graph/lifted_multicut/test_fusion_move.py
@@ -309,3 +309,47 @@ def test_fusion_move_default_parallel_proposals_tracks_threads():
     )
     assert one_thread.number_of_parallel_proposals == 2
     assert four_threads.number_of_parallel_proposals == 4
+
+
+def test_greedy_proposals_parallel_is_deterministic_on_dirty_base_graph():
+    # Regression guard for the lazy-CSR-adjacency data race on the *base* graph.
+    # The greedy-additive proposal generator reads base_graph.node_adjacency()
+    # (via DynamicGraph::reset); with T>1 the parallel proposal slots used to
+    # race on the first rebuild of a not-yet-frozen base graph. Unlike the
+    # multicut driver, here the singleton warm-start only freezes the *lifted*
+    # graph, so the race is reachable from the default start. The solver now
+    # freezes the base graph before fan-out; the multi-threaded result must equal
+    # the single-threaded reference on every run.
+    #
+    # Note: a regression here can surface as a process crash (it is a data race),
+    # not just a value mismatch.
+    n = 2000
+    base_edges = np.array([[i, i + 1] for i in range(n - 1)], dtype=np.uint64)
+    base_costs = np.array(
+        [1.0 if i % 3 else -2.0 for i in range(n - 1)], dtype=np.float64
+    )
+    # A handful of lifted edges keeps the lifted graph small (fast warm-start)
+    # while the large base graph drives the parallel proposal generation.
+    lifted_uvs = np.array(
+        [[i, i + 5] for i in range(0, n - 5, 250)], dtype=np.uint64
+    )
+    lifted_costs = np.array([-3.0] * len(lifted_uvs), dtype=np.float64)
+    parallel_proposals = 4
+
+    def run(threads):
+        # Fresh base graph per run so each multi-threaded run starts dirty.
+        base = bic.graph.UndirectedGraph.from_edges(n, base_edges)
+        objective = bic.graph.lifted_multicut.LiftedMulticutObjective(
+            base, base_costs, lifted_uvs=lifted_uvs, lifted_costs=lifted_costs
+        )
+        solver = bic.graph.lifted_multicut.FusionMoveLiftedMulticut(
+            proposal_generator=bic.graph.lifted_multicut.GreedyAdditiveProposalGenerator(seed=0),
+            number_of_threads=threads,
+            number_of_parallel_proposals=parallel_proposals,
+            number_of_iterations=3,
+        )
+        return solver.optimize(objective)
+
+    reference = run(1)
+    for _ in range(15):
+        np.testing.assert_array_equal(run(4), reference)
diff --git a/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py b/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py
index 957da34..99ce5ec 100644
--- a/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py
+++ b/tests/graph/lifted_multicut/test_lifted_edges_from_node_labels.py
@@ -238,3 +238,41 @@ def test_empty_graph():
         graph, labels, graph_depth=2, mode="all"
     )
     assert out.shape == (0, 2)
+
+
+def test_default_threading_is_deterministic_on_large_chain():
+    # Regression guard for a data race in the lazy CSR-adjacency rebuild: with
+    # default (multi-threaded) execution, every worker used to trigger the
+    # not-thread-safe rebuild concurrently on the first node_adjacency() read,
+    # corrupting the adjacency. That produced run-to-run varying counts and
+    # intermittent segfaults. A graph this size reliably exposes the race
+    # (a 10-node chain does not). The result must equal the single-threaded
+    # reference on every run.
+    n = 2000
+    graph = _make_chain(n)  # built via from_edges -> arrives "dirty"
+    labels = np.ones(n, dtype=np.uint64)
+    reference = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+        graph, labels, graph_depth=3, mode="all", number_of_threads=1
+    )
+    for _ in range(25):
+        out = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+            graph, labels, graph_depth=3, mode="all"  # default: multi-threaded
+        )
+        assert out.tolist() == reference.tolist()
+
+
+def test_default_threading_is_deterministic_on_rag():
+    # Same race, reached through the region_adjacency_graph construction path,
+    # which also returns a graph with a dirty (not-yet-built) adjacency.
+    n = 2000
+    segmentation = np.repeat(np.arange(n, dtype=np.uint32), 16).reshape(n, 4, 4)
+    rag = bic.graph.region_adjacency_graph(segmentation)
+    labels = np.ones(rag.numberOfNodes, dtype=np.uint64)
+    reference = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+        rag, labels, graph_depth=3, mode="all", number_of_threads=1
+    )
+    for _ in range(25):
+        out = bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+            rag, labels, graph_depth=3, mode="all"  # default: multi-threaded
+        )
+        assert out.tolist() == reference.tolist()
diff --git a/tests/graph/multicut/test_fusion_move.py b/tests/graph/multicut/test_fusion_move.py
index cc45250..4c00416 100644
--- a/tests/graph/multicut/test_fusion_move.py
+++ b/tests/graph/multicut/test_fusion_move.py
@@ -265,3 +265,42 @@ def test_runs_on_graph_without_negative_edges(chain_problem):
     labels = solver.optimize(objective)
     # All-positive costs → no cut.
     assert np.all(edge_cut_labels(graph, labels) == False)  # noqa: E712
+
+
+def test_greedy_proposals_parallel_is_deterministic_on_dirty_graph():
+    # Smoke-test the parallel greedy-additive-proposal path on a dirty graph
+    # with a non-singleton initial labeling (which skips the calling-thread
+    # warm-start that would otherwise freeze the graph). The lazy CSR rebuild
+    # is not thread-safe; the solver now freezes the graph before fan-out, so
+    # the multi-threaded result must equal the single-threaded reference on
+    # every run.
+    #
+    # Note: the multicut race is hard to trigger deterministically from Python
+    # (the calling thread typically wins the rebuild before OS-spawned worker
+    # threads attach for graphs of test-friendly size). The structurally
+    # identical race in the lifted-multicut twin
+    # (test_greedy_proposals_parallel_is_deterministic_on_dirty_base_graph) is
+    # the primary race detector and reliably segfaults without the fix.
+    n = 5000
+    edges = np.array([[i, i + 1] for i in range(n - 1)], dtype=np.uint64)
+    costs = np.array([1.0 if i % 3 else -2.0 for i in range(n - 1)], dtype=np.float64)
+    parallel_proposals = 4
+
+    def run(threads):
+        # Fresh graph per run so each multi-threaded run starts from a dirty
+        # (not-yet-built) adjacency, the state that triggers the race.
+        graph = bic.graph.UndirectedGraph.from_edges(n, edges)
+        objective = bic.graph.multicut.MulticutObjective(
+            graph, costs, initial_labels=np.zeros(n, dtype=np.uint64)
+        )
+        solver = bic.graph.multicut.FusionMoveMulticut(
+            proposal_generator=bic.graph.multicut.GreedyAdditiveProposalGenerator(seed=0),
+            number_of_threads=threads,
+            number_of_parallel_proposals=parallel_proposals,
+            number_of_iterations=3,
+        )
+        return solver.optimize(objective)
+
+    reference = run(1)
+    for _ in range(25):
+        np.testing.assert_array_equal(run(4), reference)