OpenFn · hanna-paasivirta · Apr 27, 2026 · May 5, 2026 · May 6, 2026 · May 6, 2026
diff --git a/agent-team-architecture-plan/1-unit-tests.md b/agent-team-architecture-plan/1-unit-tests.md
@@ -64,7 +64,7 @@ Unit-tier import rules:
 
 `testing/fixtures.py` owns (same module, flat — split into files only when this one grows past ~500 lines):
 
-- Pytest fixtures (`sample_workflow_yaml`, `sample_<svc>_chat_payload`, `fake_api_key`, `anthropic_client_no_network`).
+- Pytest fixtures (`sample_workflow_yaml`, `sample_<svc>_chat_payload`, `fake_api_key`).
 - YAML assertion helpers migrated from the currently-duplicated `services/global_chat/tests/test_utils.py` and `services/workflow_chat/tests/test_utils.py` (`path_matches`, `assert_yaml_equal_except`, `assert_yaml_section_contains_all`, `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_no_special_chars`).
 - Payload builders (`make_<svc>_chat_payload`) — shared with integration tier.
 - Fixture loaders (`load_fixture_json`, `load_fixture_yaml`).
@@ -145,7 +145,7 @@ Coverage: generate `--cov=services --cov-report=xml` and upload as artifact for
 
 ## 7. Migration path for existing tests
 
-- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, replace local `client` fixture with `anthropic_client_no_network`. No assertion changes.
+- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, delete the local `client` fixture entirely (unit tests don't need an Anthropic client per §1's "zero LLM calls" rule). No assertion changes.
 - `services/job_chat/tests/test_functions.py` — misclassified. `test_generate_system_message_loads_adaptor_docs_when_missing` hits Postgres → integration. `test_generate_queries_returns_valid_structure` hits real Anthropic → service (with mocked client) or integration. `test_search_docs_returns_general_docs_only` hits Pinecone → integration. A new `test_prompt_unit.py` covers the pure helpers (`build_prompt`, `build_error_correction_prompt`, `extract_page_prefix_from_last_turn`).
 - `services/global_chat/tests/test_utils.py` + `services/workflow_chat/tests/test_utils.py` — YAML helpers migrate to `testing/fixtures.py`. The `call_<svc>_service` subprocess helpers are replaced by the integration tier's `ApolloClient`. Old files are deleted after all callers are updated.
 - `*_pass_fail.py`, `*_qualitative.py`, `*_langfuse_tracing.py`, `*_adaptor_version_passthrough.py`, `*_planner_*.py`, `*_good_morning_*.py` — owned by service/integration/acceptance tiers.

diff --git a/agent-team-architecture-plan/2-service-tests.md b/agent-team-architecture-plan/2-service-tests.md
diff --git a/agent-team-architecture-plan/5-overview.md b/agent-team-architecture-plan/5-overview.md
@@ -40,7 +40,7 @@ apollo/
 │
 ├── testing/                                # Shared test helpers — peer to services/, not a service itself
 │   ├── __init__.py
-│   ├── anthropic_mock.py                   # MockAnthropicClient + canned response builders; documents the `test_hooks` dict keys
+│   ├── anthropic_mock.py                   # MockAnthropic (regex → response) + tool_use helper + record_tool_call; documents the `test_hooks` dict keys
 │   ├── fixtures.py                         # pytest fixtures (mock client, test_hooks factory, payloads, yaml assertions)
 │   ├── server.py                           # apollo_server fixture + ApolloClient (sync / sse / ws)
 │   ├── judge.py                            # small LLM-as-judge helper for acceptance specs
@@ -90,7 +90,7 @@ def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ...
 
 `test_hooks` is a plain Python `dict` (not a formal type). Its recognised keys are documented as a docstring in `testing/anthropic_mock.py`:
 
-- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`; threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory.
+- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`. Built by `MockAnthropic`, which matches each request's latest user message text against test-registered regex → response pairs (no match → `AssertionError`). Threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. The planner's internal tool-use loop (multiple Anthropic calls per `main()`) is covered by the same mechanism — each round has different latest-user-message text, so different regexes match. See `2-service-tests.md` §3.
 - `tool_calls` — a test-allocated `list[dict]` that production code appends to as breadcrumbs when present.
 - `tool_stubs` — a `dict[str, Callable]` keyed by tool name. The planner consults it before dispatching a tool; if a stub is registered, it's called instead of the real tool. Today only used for `search_documentation` (which otherwise hits Pinecone + OpenAI). See `2-service-tests.md` §5.
 
@@ -178,7 +178,7 @@ No changes needed in pyproject, CI, or shared helpers. Discovery is zero-config
 
 1. **Scaffolding.** Create `testing/` (skeleton — `anthropic_mock.py`, `fixtures.py`, `server.py`, `judge.py`), root `apollo/conftest.py`, `[tool.pytest.ini_options]` block in pyproject. One PR — unblocks everything else.
 2. **Unit tier.** Migrate `services/workflow_chat/tests/test_functions.py` → `test_workflow_chat_functions_unit.py` as the worked example. Wire `tests.yaml` with just `-m unit`. Green CI on every push.
-3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropicClient`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content.
+3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropic`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content.
 4. **Integration tier.** Add `testing/server.py` (server fixture + `ApolloClient`). Create `llm-tests.yaml`. Migrate the first cross-service end-to-end test into `services/global_chat/tests/test_global_chat_integration.py`. Secrets wired.
 5. **Acceptance tier.** Add `testing/judge.py` and the markdown collector hook in the root conftest. Drop the first 2–3 hero specs into `services/global_chat/tests/acceptance/`. First manual run green (`workflow_dispatch`).
 

diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,91 @@
+"""Repo-root pytest configuration.
+
+- Auto-applies a tier marker (`unit` / `service` / `integration` /
+  `acceptance`) based on the test's path. The directory IS the marker.
+- For tests marked `unit`, blocks network, subprocess, DB, and LLM client
+  construction so accidental I/O fails loud instead of timing out.
+- Sets dummy env vars before any service module is imported so service-
+  tier tests can construct (mocked) Anthropic clients without real keys.
+"""
+
+import os
+from unittest.mock import patch
+
+import pytest
+
+
+# Set dummy keys before any service module imports. `setdefault` so real
+# keys (for integration / acceptance) win.
+# Placeholder credentials and switches, applied only where the variable is
+# not already present in the process environment.
+# NOTE(review): python-dotenv's `load_dotenv()` does not override existing
+# variables by default, so if real keys live only in a `.env` file loaded
+# after this module, these placeholders would win — confirm how the
+# integration / acceptance tiers supply their keys.
+_DUMMY_ENV = {
+    "ANTHROPIC_API_KEY": "test-dummy",
+    "OPENAI_API_KEY": "test-dummy",
+    "PINECONE_API_KEY": "test-dummy",
+    "LANGFUSE_TRACING": "false",
+    "LANGFUSE_PUBLIC_KEY": "pk-test-dummy",
+    "LANGFUSE_SECRET_KEY": "sk-test-dummy",
+    "SENTRY_DSN": "",
+}
+for _key, _val in _DUMMY_ENV.items():
+    os.environ.setdefault(_key, _val)
+
+# Load the shared fixtures module as a pytest plugin for the whole repo.
+pytest_plugins = ["testing.fixtures"]
+
+
+# Tier names, used both as directory names and as marker names. Order
+# matters: `pytest_collection_modifyitems` applies the first tier found in
+# a test's path and stops.
+_TIER_DIRS = ("unit", "service", "integration", "acceptance")
+
+# (patch target, human-readable label) pairs. `_enforce_unit_isolation`
+# patches each target for unit-tier tests; the label is interpolated into
+# the `UnitTestViolation` message.
+_BLOCKED_TARGETS = (
+    ("socket.socket.connect", "socket.connect()"),
+    ("subprocess.run", "subprocess.run()"),
+    ("subprocess.Popen", "subprocess.Popen()"),
+    ("psycopg2.connect", "psycopg2.connect()"),
+    # Block LLM client construction, not first request — earlier failure,
+    # easier to trace.
+    ("anthropic.Anthropic.__init__", "anthropic.Anthropic()"),
+    ("anthropic.AsyncAnthropic.__init__", "anthropic.AsyncAnthropic()"),
+    ("openai.OpenAI.__init__", "openai.OpenAI()"),
+    ("openai.AsyncOpenAI.__init__", "openai.AsyncOpenAI()"),
+)
+
+
+class UnitTestViolation(RuntimeError):
+    """Raised when a unit test attempts a forbidden operation.
+
+    The blockers built by `_make_blocker` raise this with a message naming
+    the operation that was attempted.
+    """
+
+
+def _make_blocker(operation):
+    """Build a stand-in callable that refuses to run `operation`.
+
+    Args:
+        operation: Human-readable label of the forbidden call; it is
+            interpolated into the error message.
+
+    Returns:
+        A callable that accepts any arguments and always raises
+        `UnitTestViolation`.
+    """
+    message = (
+        f"Unit tests may not perform `{operation}`. Move this test to "
+        "tests/service/ or tests/integration/ if real I/O is needed. "
+        "See conftest.py at the repo root for the policy."
+    )
+
+    def _refuse(*_ignored_args, **_ignored_kwargs):
+        raise UnitTestViolation(message)
+
+    return _refuse
+
+
+def pytest_collection_modifyitems(items):
+    """Tag every collected test with the tier marker implied by its path.
+
+    A test receives the marker named after the first entry of `_TIER_DIRS`
+    that appears as a component of its path *relative to the pytest
+    rootdir*. Matching the relative path keeps directories above the
+    checkout (for example a clone living under `/srv/service/...`) from
+    tagging every test in the repo. Tests whose path contains no tier
+    component are left unmarked.
+
+    Args:
+        items: Collected test items; markers are added in place.
+    """
+    for item in items:
+        try:
+            parts = item.path.relative_to(item.config.rootpath).parts
+        except ValueError:
+            # Collected from outside the rootdir (e.g. an explicit path on
+            # the command line): fall back to the full path.
+            parts = item.path.parts
+        for tier in _TIER_DIRS:
+            if tier in parts:
+                item.add_marker(getattr(pytest.mark, tier))
+                break
+
+
+@pytest.fixture(autouse=True)
+def _enforce_unit_isolation(request):
+    """Make forbidden I/O raise `UnitTestViolation` inside unit-tier tests.
+
+    Tests without the `unit` marker run untouched. For unit tests, every
+    entry of `_BLOCKED_TARGETS` is patched with a blocker for the duration
+    of the test and restored afterwards.
+
+    Args:
+        request: The pytest fixture request for the running test.
+    """
+    # Ask for the marker itself: `request.keywords` also contains node
+    # names and extra keywords, so a membership test there can match
+    # things that are not the `unit` marker.
+    if request.node.get_closest_marker("unit") is None:
+        yield
+        return
+
+    patches = []
+    try:
+        # Setup lives inside the try so that an unexpected error part-way
+        # through still undoes the patches already started.
+        for target, label in _BLOCKED_TARGETS:
+            try:
+                blocker = patch(target, side_effect=_make_blocker(label))
+                blocker.start()
+            except (AttributeError, ModuleNotFoundError):
+                # Target module is not installed, or the attribute does
+                # not exist in this version: nothing to block.
+                continue
+            patches.append(blocker)
+
+        yield
+    finally:
+        # Undo in reverse order of application.
+        for blocker in reversed(patches):
+            blocker.stop()
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,43 @@ pytest = "^8.3.4"
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
+[tool.pytest.ini_options]
+# Make `services/` importable the same way `services/entry.py` does it.
+# Without this, `from workflow_chat.workflow_chat import ...` fails inside tests.
+pythonpath = ["services"]
+
+# Discovery roots. pytest walks these for test_*.py files.
+testpaths = [
+    "services/global_chat/tests",
+    "services/workflow_chat/tests",
+    "services/job_chat/tests",
+    "services/tools",
+]
+
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+
+markers = [
+    "unit: fast, isolated, no I/O. Runs on every PR push.",
+    "service: mocks HTTP/LLM clients; exercises service handlers. Runs on merge.",
+    "integration: hits real external services (LLM, Pinecone, Postgres). Manual/nightly.",
+    "acceptance: end-to-end acceptance criteria. Manual/nightly.",
+]
+
+addopts = [
+    "--strict-markers",
+    "--strict-config",
+    "-ra",
+    "--tb=short",
+]
+
+filterwarnings = [
+    "default::DeprecationWarning",
+    "ignore::DeprecationWarning:anthropic.*",
+    "ignore::DeprecationWarning:pydantic.*",
+]
+
 [tool.black]
 line-length = 120
 

diff --git a/services/entry.py b/services/entry.py
@@ -3,6 +3,7 @@
 import json
 import uuid
 import argparse
+
 from dotenv import load_dotenv
 import sentry_sdk
 from util import set_apollo_port, ApolloError

diff --git a/services/global_chat/global_chat.py b/services/global_chat/global_chat.py
@@ -60,12 +60,13 @@ def get_stream(self) -> bool:
 
 
 @observe(name="global_chat", capture_input=False)
-def main(data_dict: dict) -> dict:
+def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict:
     """
     Main entry point for global agent service.
 
     Args:
         data_dict: Input payload as dictionary
+        test_hooks: Optional test-only dict; see testing/anthropic_mock.py.
 
     Returns:
         Response dictionary with response, attachments, history, usage, meta
@@ -93,7 +94,7 @@ def main(data_dict: dict) -> dict:
             config_loader = ConfigLoader("config.yaml")
 
             # 3. Initialize router
-            router = RouterAgent(config_loader, data.api_key)
+            router = RouterAgent(config_loader, data.api_key, test_hooks=test_hooks)
 
             # 4. Route and execute
             result = router.route_and_execute(

diff --git a/services/global_chat/planner.py b/services/global_chat/planner.py
@@ -4,14 +4,13 @@
 import os
 from typing import List, Dict, Optional
 from dataclasses import dataclass
-from anthropic import Anthropic
 
 import sys
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent.parent))
 
 from langfuse import observe
-from util import create_logger, ApolloError, sum_usage
+from util import create_logger, ApolloError, sum_usage, build_anthropic_client, record_tool_call
 from streaming_util import StreamManager
 from global_chat.config_loader import ConfigLoader
 from models import resolve_model
@@ -38,14 +37,20 @@ class PlannerAgent:
     Planner agent that coordinates subagents and tools for complex multi-step tasks.
     """
 
-    def __init__(self, config_loader: ConfigLoader, api_key: Optional[str] = None):
+    def __init__(
+        self,
+        config_loader: ConfigLoader,
+        api_key: Optional[str] = None,
+        test_hooks: Optional[dict] = None,
+    ):
         self.config_loader = config_loader
         self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+        self._test_hooks = test_hooks
 
         if not self.api_key:
             raise ApolloError(500, "ANTHROPIC_API_KEY not found")
 
-        self.client = Anthropic(api_key=self.api_key)
+        self.client = build_anthropic_client(self.api_key, test_hooks)
         self.tools = TOOL_DEFINITIONS
 
         planner_config = config_loader.config.get("planner", {})
@@ -237,6 +242,14 @@ def _find_all_tool_uses(self, content):
 
     def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str:
         """Execute a single tool call and return the result string."""
+        record_tool_call(self._test_hooks, {"tool": tool_use_block.name, "input": tool_use_block.input})
+
+        stub = (self._test_hooks or {}).get("tool_stubs", {}).get(tool_use_block.name)
+        if stub is not None:
+            tool_result = stub(tool_use_block.input)
+            tool_calls_meta.append({"tool": tool_use_block.name, "input": tool_use_block.input})
+            return tool_result
+
         if tool_use_block.name == "search_documentation":
             tool_result = search_documentation_tool(tool_use_block.input)
 
@@ -252,6 +265,7 @@ def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str:
                 api_key=self.api_key,
                 user=self._user,
                 metrics_opt_in=self._metrics_opt_in,
+                test_hooks=self._test_hooks,
             )
 
             if "usage" in subagent_result:
@@ -296,6 +310,7 @@ def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str:
                 api_key=self.api_key,
                 user=self._user,
                 metrics_opt_in=self._metrics_opt_in,
+                test_hooks=self._test_hooks,
             )
 
             if "usage" in subagent_result:

diff --git a/services/global_chat/router.py b/services/global_chat/router.py
@@ -7,15 +7,14 @@
 import json
 from typing import List, Dict, Optional
 from dataclasses import dataclass
-from anthropic import Anthropic
 
 # Import utilities from parent services directory
 import sys
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent.parent))
 
 from langfuse import observe
-from util import create_logger, ApolloError, sum_usage
+from util import create_logger, ApolloError, sum_usage, build_anthropic_client, record_tool_call
 from global_chat.config_loader import ConfigLoader
 from models import resolve_model
 from global_chat.yaml_utils import get_step_name_from_page, find_job_in_yaml, stitch_job_code
@@ -51,14 +50,20 @@ class RouterAgent:
     - planner (for complex multi-step tasks)
     """
 
-    def __init__(self, config_loader: ConfigLoader, api_key: Optional[str] = None):
+    def __init__(
+        self,
+        config_loader: ConfigLoader,
+        api_key: Optional[str] = None,
+        test_hooks: Optional[dict] = None,
+    ):
         self.config_loader = config_loader
         self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+        self._test_hooks = test_hooks
 
         if not self.api_key:
             raise ApolloError(500, "ANTHROPIC_API_KEY not found")
 
-        self.client = Anthropic(api_key=self.api_key)
+        self.client = build_anthropic_client(self.api_key, test_hooks)
 
         router_config = config_loader.config.get("router", {})
         self.model = resolve_model(router_config.get("model", "claude-haiku"))
@@ -113,6 +118,11 @@ def route_and_execute(
             logger.warning(f"Routing decision failed: {e}. Defaulting to planner for safety.")
             decision = RouterDecision(destination="planner", confidence=1)
 
+        record_tool_call(self._test_hooks, {
+            "tool": "router_decision",
+            "input": {"destination": decision.destination, "job_key": decision.job_key},
+        })
+
         if decision.destination == "workflow_agent":
             result = self._route_to_workflow_chat(content, workflow_yaml, history, stream, decision.confidence)
         elif decision.destination == "job_code_agent":
@@ -245,7 +255,7 @@ def _route_to_workflow_chat(
             "metrics_opt_in": self._metrics_opt_in,
         }
 
-        result = workflow_chat_main(payload)
+        result = workflow_chat_main(payload, test_hooks=self._test_hooks)
         total_usage = sum_usage(self.routing_usage, result["usage"])
 
         attachments = []
@@ -318,7 +328,7 @@ def _route_to_job_chat(
             "metrics_opt_in": self._metrics_opt_in,
         }
 
-        result = job_chat_main(payload)
+        result = job_chat_main(payload, test_hooks=self._test_hooks)
         total_usage = sum_usage(self.routing_usage, result["usage"])
 
         # Stitch suggested_code back into workflow YAML
@@ -365,7 +375,7 @@ def _route_to_planner(
         clean_history = [{"role": t["role"], "content": t["content"]} for t in history]
         enriched_content = self._format_attachments_for_content(content)
 
-        planner = PlannerAgent(self.config_loader, self.api_key)
+        planner = PlannerAgent(self.config_loader, self.api_key, test_hooks=self._test_hooks)
         planner_result = planner.run(
             content=enriched_content,
             workflow_yaml=workflow_yaml,

diff --git a/services/global_chat/subagent_caller.py b/services/global_chat/subagent_caller.py
@@ -24,6 +24,7 @@ def call_workflow_agent(
     api_key: Optional[str] = None,
     user: Optional[Dict] = None,
     metrics_opt_in: Optional[bool] = None,
+    test_hooks: Optional[dict] = None,
 ) -> Dict:
     """
     Call the workflow agent and return its results.
@@ -52,7 +53,7 @@ def call_workflow_agent(
 
     try:
         from workflow_chat.workflow_chat import main as workflow_chat_main
-        result = workflow_chat_main(workflow_payload)
+        result = workflow_chat_main(workflow_payload, test_hooks=test_hooks)
 
         response_preview = result.get("response", "")[:120]
         logger.info(f"workflow_agent response: {response_preview}")
@@ -75,6 +76,7 @@ def call_job_agent(
     api_key: Optional[str] = None,
     user: Optional[Dict] = None,
     metrics_opt_in: Optional[bool] = None,
+    test_hooks: Optional[dict] = None,
 ) -> Dict:
     """
     Call the job code agent and return its results.
@@ -119,7 +121,7 @@ def call_job_agent(
 
     try:
         from job_chat.job_chat import main as job_chat_main
-        result = job_chat_main(job_payload)
+        result = job_chat_main(job_payload, test_hooks=test_hooks)
 
         response_preview = result.get("response", "")[:120]
         logger.info(f"job_agent response: {response_preview}")