test: add unit test to catch caching behavior

dmadisetti · dmadisetti · commit ba68668e1548 · 2025-11-19T14:20:53.000-08:00
diff --git a/marimo/_ast/app.py b/marimo/_ast/app.py
@@ -336,15 +336,15 @@ def cell(
 
         ```
         @app.cell
-        def __(mo):
+        def _(mo):
             # ...
 
         @app.cell()
-        def __(mo):
+        def _(mo):
             # ...
 
         @app.cell(disabled=True)
-        def __(mo):
+        def _(mo):
             # ...
         ```
 
@@ -759,7 +759,6 @@ async def _function_call(
     async def embed(
         self,
         defs: dict[str, Any] | None = None,
-        **kwargs: Any,
     ) -> AppEmbedResult:
         """Embed a notebook into another notebook.
 
@@ -823,16 +822,13 @@ async def embed(
                 arguments. marimo will use these values instead of executing
                 the cells that would normally define them. Cells that depend
                 on these variables will use your provided values.
-            **kwargs (Any):
-                For forward-compatibility with future arguments.
 
         Returns:
             An object `result` with two attributes: `result.output` (visual
             output of the notebook) and `result.defs` (a dictionary mapping
             variable names defined by the notebook to their values).
 
         """
-        del kwargs
         from marimo._plugins.stateless.flex import vstack
         from marimo._runtime.context.utils import running_in_notebook
 
diff --git a/tests/_ast/test_app.py b/tests/_ast/test_app.py
@@ -1150,6 +1150,108 @@ def __(x: int, y: int) -> None:
         assert result.defs["y"] == 10  # y cell still ran
         assert "x=100, y=10" in result.output.text
 
+    async def test_app_embed_with_defs_stale_outputs(self) -> None:
+        """Test that embed() doesn't return stale cached outputs with different defs."""
+        app = App()
+
+        @app.cell
+        def __() -> tuple[int]:
+            x = 10
+            return (x,)
+
+        @app.cell
+        def __(x: int) -> None:
+            "x is small" if x == 10 else "x is large"
+
+        # First call - no override
+        result_initial = await app.embed()
+        assert result_initial.defs["x"] == 10
+        assert "x is small" in result_initial.output.text
+
+        # Second call - with first override
+        result_override = await app.embed(defs={"x": 100})
+        assert result_override.defs["x"] == 100
+        assert "x is large" in result_override.output.text
+        assert "x is small" not in result_override.output.text
+
+        # Third call - with second override
+        result_override2 = await app.embed(defs={"x": 200})
+        assert result_override2.defs["x"] == 200
+        assert "x is large" in result_override2.output.text
+        assert "x is small" not in result_override2.output.text
+
+        # Check that initial result wasn't mutated by subsequent calls
+        assert result_initial.defs["x"] == 10
+        assert "x is small" in result_initial.output.text
+        assert "x is large" not in result_initial.output.text
+
+    async def test_app_embed_with_defs_stale_outputs_kernel(
+        self, k: Kernel, exec_req: ExecReqProvider
+    ) -> None:
+        """Test embed() with different defs through kernel (tests caching code path)."""
+        await k.run(
+            [
+                exec_req.get(
+                    """
+                    from marimo import App
+
+                    app = App()
+
+                    @app.cell
+                    def __() -> tuple[int]:
+                        x = 10
+                        return (x,)
+
+                    @app.cell
+                    def __(x: int) -> None:
+                        "x is small" if x == 10 else "x is large"
+                    """
+                ),
+                exec_req.get(
+                    """
+                    # First call - no override
+                    result_initial = await app.embed()
+                    """
+                ),
+                exec_req.get(
+                    """
+                    # Second call - with first override
+                    result_override = await app.embed(defs={"x": 100})
+                    """
+                ),
+                exec_req.get(
+                    """
+                    # Third call - with second override
+                    result_override2 = await app.embed(defs={"x": 200})
+                    """
+                ),
+            ]
+        )
+        assert not k.errors
+
+        result_initial = k.globals["result_initial"]
+        result_override = k.globals["result_override"]
+        result_override2 = k.globals["result_override2"]
+
+        # Check first result - output then defs
+        assert "x is small" in result_initial.output.text
+        assert result_initial.defs["x"] == 10
+
+        # Check second result with first override - output then defs
+        assert "x is large" in result_override.output.text
+        assert "x is small" not in result_override.output.text
+        assert result_override.defs["x"] == 100
+
+        # Check third result with second override - output then defs
+        assert "x is large" in result_override2.output.text
+        assert "x is small" not in result_override2.output.text
+        assert result_override2.defs["x"] == 200
+
+        # Check that initial result wasn't mutated by subsequent calls
+        assert "x is small" in result_initial.output.text
+        assert "x is large" not in result_initial.output.text
+        assert result_initial.defs["x"] == 10
+
     @pytest.mark.xfail(
         True, reason="Flaky in CI, can't repro locally", strict=False
     )
diff --git a/tests/_runtime/test_dataflow.py b/tests/_runtime/test_dataflow.py
@@ -1445,7 +1445,9 @@ def test_prune_cells_for_overrides_single_cell() -> None:
     execution_order = ["0", "1", "2"]
 
     # Override x - should prune cell 0
-    result = dataflow.prune_cells_for_overrides(graph, execution_order, {"x": 100})
+    result = dataflow.prune_cells_for_overrides(
+        graph, execution_order, {"x": 100}
+    )
     assert result == ["1", "2"]
 
 
@@ -1537,7 +1539,9 @@ def test_prune_cells_for_overrides_partial_override() -> None:
     execution_order = ["0", "1", "2"]
 
     # Override only x - should prune only cell 0
-    result = dataflow.prune_cells_for_overrides(graph, execution_order, {"x": 100})
+    result = dataflow.prune_cells_for_overrides(
+        graph, execution_order, {"x": 100}
+    )
     assert result == ["1", "2"]
 
 
@@ -1581,5 +1585,7 @@ def test_prune_cells_for_overrides_preserves_order() -> None:
     execution_order = ["0", "1", "2", "3"]
 
     # Override b - should prune only cell 1, preserving order
-    result = dataflow.prune_cells_for_overrides(graph, execution_order, {"b": 100})
+    result = dataflow.prune_cells_for_overrides(
+        graph, execution_order, {"b": 100}
+    )
     assert result == ["0", "2", "3"]