diff --git a/agent-team-architecture-plan/1-unit-tests.md b/agent-team-architecture-plan/1-unit-tests.md index 962a321a..903dcd9b 100644 --- a/agent-team-architecture-plan/1-unit-tests.md +++ b/agent-team-architecture-plan/1-unit-tests.md @@ -64,7 +64,7 @@ Unit-tier import rules: `testing/fixtures.py` owns (same module, flat — split into files only when this one grows past ~500 lines): -- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`, `anthropic_client_no_network`). +- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`). - YAML assertion helpers migrated from the currently-duplicated `services/global_chat/tests/test_utils.py` and `services/workflow_chat/tests/test_utils.py` (`path_matches`, `assert_yaml_equal_except`, `assert_yaml_section_contains_all`, `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_no_special_chars`). - Payload builders (`make__chat_payload`) — shared with integration tier. - Fixture loaders (`load_fixture_json`, `load_fixture_yaml`). @@ -145,7 +145,7 @@ Coverage: generate `--cov=services --cov-report=xml` and upload as artifact for ## 7. Migration path for existing tests -- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, replace local `client` fixture with `anthropic_client_no_network`. No assertion changes. +- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, delete the local `client` fixture entirely (unit tests don't need an Anthropic client per §1's "zero LLM calls" rule). No assertion changes. - `services/job_chat/tests/test_functions.py` — misclassified. `test_generate_system_message_loads_adaptor_docs_when_missing` hits Postgres → integration. 
`test_generate_queries_returns_valid_structure` hits real Anthropic → service (with mocked client) or integration. `test_search_docs_returns_general_docs_only` hits Pinecone → integration. A new `test_prompt_unit.py` covers the pure helpers (`build_prompt`, `build_error_correction_prompt`, `extract_page_prefix_from_last_turn`). - `services/global_chat/tests/test_utils.py` + `services/workflow_chat/tests/test_utils.py` — YAML helpers migrate to `testing/fixtures.py`. The `call__service` subprocess helpers are replaced by the integration tier's `ApolloClient`. Old files are deleted after all callers are updated. - `*_pass_fail.py`, `*_qualitative.py`, `*_langfuse_tracing.py`, `*_adaptor_version_passthrough.py`, `*_planner_*.py`, `*_good_morning_*.py` — owned by service/integration/acceptance tiers. diff --git a/agent-team-architecture-plan/2-service-tests.md b/agent-team-architecture-plan/2-service-tests.md index a0c30ce8..1aceee84 100644 --- a/agent-team-architecture-plan/2-service-tests.md +++ b/agent-team-architecture-plan/2-service-tests.md @@ -21,52 +21,29 @@ Service tests verify *logic and information flow*: payload validation, routing, ## 2. The `test_hooks` second argument -### 2.1 Signature change +`main()` gains an optional second positional arg: ```python def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... ``` -`entry.py` keeps calling `m.main(data)` with one positional arg — the HTTP path never sees `test_hooks`. `test_hooks` is a test-only affordance. +`entry.py` keeps calling `m.main(data)` with one arg — the HTTP path never sees `test_hooks`. Production behaviour when `test_hooks is None` is byte-identical to today. -### 2.2 The `test_hooks` dict — minimum viable shape +The dict has three documented keys, all optional (docstring in `services/testing/anthropic_mock.py`): -Plain Python `dict`. No `TypedDict`, no pydantic model — just a dict with documented keys. 
The recognised keys are documented as a docstring in `testing/anthropic_mock.py`: +- `"anthropic_http_client"` — `httpx.Client` backed by `httpx.MockTransport`. Threaded into every `Anthropic(...)` constructor via `build_anthropic_client`. +- `"tool_calls"` — `list[dict]` the test allocates. Production appends breadcrumbs via `record_tool_call`. +- `"tool_stubs"` — `dict[str, Callable]` keyed by tool name. When the planner dispatches a tool, a stub (if registered) is called instead. Today only used for `search_documentation`. -```python -# testing/anthropic_mock.py -""" -The `test_hooks` dict accepts (all optional; all default to absent): - -- "anthropic_http_client": an httpx.Client backed by httpx.MockTransport. - When present, threaded into every Anthropic(...) constructor site. -- "tool_calls": a list[dict] the test allocates. Production code appends - breadcrumbs via record_tool_call(test_hooks, entry). -- "tool_stubs": dict[str, Callable] keyed by tool name. When the planner - dispatches a tool, if a stub exists for that name, the stub is called - with the tool input and its return value used as the tool result. Today - only used for "search_documentation" — see §5. -""" -``` - -Start with three keys. Add more only when a concrete test can't be written without one. Things intentionally left out until needed: - -- Sub-agent stub registry. Default behaviour: the sub-agent's `main()` runs under the same mock HTTP client — that's usually what a test wants. A stub registry (`test_hooks["subagent_stubs"]`) can be added if a test needs to bypass the sub-agent's logic entirely. -- `seed`, `disable_langfuse`, `scratch`. Add when a test fails without them. - -### 2.3 Threading `test_hooks` through +Sub-agent stubbing isn't supported — sub-agents run under the same shared mock client. -Each chat service's `main()` passes `test_hooks` into the agent / client constructors it creates. 
Everywhere that currently calls `Anthropic(api_key=...)` swaps to `build_anthropic_client(api_key, test_hooks)` (new factory — see §3). +Sites that thread `test_hooks` through: -Sites that change: - -- `services/job_chat/job_chat.py` — `AnthropicClient.__init__`. -- `services/workflow_chat/workflow_chat.py` — `AnthropicClient.__init__`. -- `services/global_chat/router.py` — `RouterAgent.__init__`. -- `services/global_chat/planner.py` — `PlannerAgent.__init__`. -- `services/global_chat/subagent_caller.py` — accepts `test_hooks` and forwards to sub-agent `main()` calls. - -Production behaviour when `test_hooks is None` is byte-identical to today. Every new kwarg defaults to `None`. +- `services/job_chat/job_chat.py` — `AnthropicClient.__init__` +- `services/workflow_chat/workflow_chat.py` — `AnthropicClient.__init__` +- `services/global_chat/router.py` — `RouterAgent.__init__` +- `services/global_chat/planner.py` — `PlannerAgent.__init__` +- `services/global_chat/subagent_caller.py` — forwards to sub-agent `main()` --- @@ -85,77 +62,47 @@ def build_anthropic_client(api_key: str, test_hooks: Optional[dict] = None) -> A Every `AnthropicClient` / `RouterAgent` / `PlannerAgent` constructor swaps `Anthropic(api_key=...)` for `build_anthropic_client(api_key, test_hooks)`. -### 3.2 `testing/anthropic_mock.py` +### 3.2 `MockAnthropic` in `services/testing/anthropic_mock.py` -Single file — `MockAnthropicClient` class, canned response-body builders, docstring documenting recognised `test_hooks` keys, and the `record_tool_call(test_hooks, entry)` helper. Split later if it grows unwieldy. - -The `anthropic` Python SDK accepts a custom `http_client` — we build ours from `httpx.MockTransport`: +Thin wrapper over `httpx.MockTransport`. Tests register regex → response pairs; on each Anthropic request the mock matches the latest user message text against registered patterns and returns the first match. 
No new runtime dep — `httpx.MockTransport` is built into httpx (already in `poetry.lock`). ```python -class MockAnthropicClient: - """Thin wrapper over httpx.Client + httpx.MockTransport. - - Usage: - mc = MockAnthropicClient.always(response=text_response("hello")) - mc = MockAnthropicClient.script([resp1, resp2, resp3]) # multi-turn - mc = MockAnthropicClient.streaming(events=[...]) # SSE - - After the call: - mc.requests # list[RecordedRequest] - mc.last_request.json["messages"] - mc.last_request.headers["x-api-key"] - """ - @classmethod - def always(cls, response) -> "MockAnthropicClient": ... - @classmethod - def script(cls, responses) -> "MockAnthropicClient": ... - @classmethod - def streaming(cls, events) -> "MockAnthropicClient": ... - - @property - def httpx_client(self) -> httpx.Client: ... - @property - def requests(self) -> list[RecordedRequest]: ... - @property - def last_request(self) -> RecordedRequest: ... +mock = MockAnthropic() +mock.set_response(r"haiku", "sure, here's a haiku") +mock.set_response(r"create workflow", tool_use("call_workflow_agent", {...})) +test_hooks = test_hooks_factory(anthropic=mock) +main(payload, test_hooks) +assert mock.last_request.headers["x-api-key"] == "sk-test" ``` -Response-body builders in the same file: +Design choices: -- `text_response(text, model=..., usage=...)` -- `tool_use_response(tool_name, tool_input, tool_use_id="toolu_01")` -- `mixed_response(text, tool_uses=[...])` -- `router_decision_response(destination, confidence=4, job_key=None)` -- `stream_events(text="", tool_uses=None)` -- `usage_block(input_tokens=100, output_tokens=50, cache_creation=0, cache_read=0)` +- **First-match-wins.** Order specific patterns before general ones. +- **Loud no-match.** Raises `AssertionError` with the unmatched text + registered patterns. +- **Captured requests.** `mock.requests` (list) and `mock.last_request` for assertions on outbound headers, body shape, system prompt, `cache_control`, etc. 
-No new runtime dep — `httpx.MockTransport` is built into httpx, which is already in `poetry.lock`. +Two private helpers in the same file: -### 3.3 Scripted multi-turn example +- `_latest_user_text(messages)` — last `role=user` message's text + tool_result content. Matching against `tool_result` text is what makes the planner's internal loop work. +- `_build_message_body(response)` — wraps response in standard Anthropic message envelope. -```python -def test_planner_calls_workflow_then_job_agents(test_hooks_factory): - mock = MockAnthropicClient.script([ - router_decision_response("planner", confidence=5), - tool_use_response("call_workflow_agent", {"message": "create workflow"}), - tool_use_response("call_job_code_agent", {"message": "code for step", "job_key": "fetch"}), - text_response("All done."), - ]) - test_hooks = test_hooks_factory(anthropic=mock) - result = global_chat_main(make_global_chat_payload("create a workflow"), test_hooks) - - assert [c["tool"] for c in test_hooks["tool_calls"]] == [ - "router_decision", "call_workflow_agent", "call_job_code_agent", - ] -``` +A `tool_use(name, input)` helper builds tool_use content blocks for `set_response`. + +### 3.3 Planner internal loop + +`main()` is one user turn, but the planner calls Anthropic multiple times within that turn (call → tool_use → run tool → call with tool_result → ... until `end_turn`). Each round has different content in the latest user message, so regex matching naturally resolves it. + +When a sub-agent is invoked inside this loop, its own `main()` runs under the *same* mock client. Tests register regexes covering both parent and child expected user messages on one mock — no sub-agent stub registry needed. -When a test wants to bypass a sub-agent's real code, it scripts responses for the planner's `/v1/messages` calls and lets the sub-agent's own `main()` run under the same mock client. Stub registries aren't needed for the common case. 
+### 3.4 Streaming (deferred) + +Streaming-mock support is out of scope. Defer until the first service test for a streaming code path actually needs it. Integration tier covers stream behaviour. --- -## 4. Tool-call breadcrumbs (`test_hooks["tool_calls"]`) +## 4. Tool-call breadcrumbs -`test_hooks["tool_calls"]` is a list the test allocates and production code appends to. One helper in `testing/anthropic_mock.py`: +`record_tool_call` lives in `services/util.py` next to `build_anthropic_client` (production code never imports from `testing/`): ```python def record_tool_call(test_hooks: Optional[dict], entry: dict) -> None: @@ -166,7 +113,7 @@ def record_tool_call(test_hooks: Optional[dict], entry: dict) -> None: crumbs.append(entry) ``` -Dispatch sites (`planner._execute_tool`, `router.route_and_execute`) call `record_tool_call(test_hooks, {"tool": ..., "input": ...})`. Two dict lookups per call when `test_hooks is None` — negligible. +Dispatch sites (`planner._execute_tool`, `router.route_and_execute`) call `record_tool_call(test_hooks, {"tool": ..., "input": ...})`. Two dict lookups when `test_hooks is None` — negligible. Tests read: @@ -176,112 +123,87 @@ assert [c["tool"] for c in test_hooks["tool_calls"]] == ["router_decision", "cal --- -## 5. Tool stubs (`test_hooks["tool_stubs"]`) +## 5. Tool stubs -Most planner tools don't need stubbing. `call_workflow_agent` and `call_job_code_agent` inherit `test_hooks` and run the sub-agent's own mocked `main()`. `inspect_job_code` is pure local code with no network. The one tool that does need stubbing is **`search_documentation`** — without a stub it would hit Pinecone (vector store) and OpenAI (embeddings) on every service test. +Most planner tools don't need stubbing. `call_workflow_agent` and `call_job_code_agent` inherit `test_hooks` and run the sub-agent's own mocked `main()`. `inspect_job_code` is pure local code. 
The one tool that does need stubbing is **`search_documentation`** — without a stub it would hit Pinecone + OpenAI on every service test. -Production change in `services/global_chat/planner.py::_execute_tool` — one if/else at the top of the dispatch: +`planner._execute_tool` checks for a stub at the top of dispatch: ```python stub = (self._test_hooks or {}).get("tool_stubs", {}).get(tool_use_block.name) if stub is not None: tool_result = stub(tool_use_block.input) else: - # original dispatch by name follows + # original dispatch by name ... ``` -Test usage: - -```python -test_hooks = { - "anthropic_http_client": mock.httpx_client, - "tool_calls": [], - "tool_stubs": { - "search_documentation": lambda tool_input: "Cron triggers run on a schedule...", - }, -} -result = main(payload, test_hooks) -``` - -The stub returns whatever shape the real tool returns (here a string — the planner feeds it back into the next Anthropic call). - -A `build_search_documentation_stub(docs=[...])` helper in `testing/anthropic_mock.py` can emerge once a second test reuses the same shape — not preemptively. - --- ## 6. Directory layout +Tier folders, mirroring the unit-tests branch: + ``` services//tests/ __init__.py - conftest.py # re-exports shared fixtures; auto-marks by filename suffix - test__unit.py # tier 1 (unit-tests-architect) - test__service.py # tier 2 (this tier) - fixtures/ # per-service fixture data (optional) + unit/ # tier 1 + service/ # tier 2 (this tier) + __init__.py + test_.py + integration/ # tier 3 + acceptance/ # tier 4 ``` -Test filenames this tier will add (illustrative, not exhaustive): - -- `services/global_chat/tests/test_router_service.py` — router decisions by intent. -- `services/global_chat/tests/test_planner_service.py` — tool dispatch order, test_hooks propagation. -- `services/global_chat/tests/test_subagent_passthrough_service.py` — global → workflow / job wiring. 
-- `services/workflow_chat/tests/test_workflow_chat_service.py` — YAML extraction, retry loop, streaming events. -- `services/job_chat/tests/test_job_chat_service.py` — RAG injection (with stubbed retriever), suggest-code response shape, page-prefix detection, error-correction loop. - -Cross-service end-to-end flow tests (planner chain over mocks) also live under `services/global_chat/tests/` since `global_chat` owns the planner. - ---- +Tier marker is auto-applied based on which tier directory the test is in (see §8). Filenames are plain `test_*.py` — no `_service` suffix. -## 7. Shared helpers in `testing/` +Component subfolders are fine when a tier folder gets crowded: ``` -testing/ - __init__.py - anthropic_mock.py # MockAnthropicClient, response builders, test_hooks-keys docstring, record_tool_call - fixtures.py # pytest fixtures + YAML assertion helpers + payload builders + loaders - fixtures/ - workflows/*.yaml - histories/*.json +services/global_chat/tests/service/ + test_router.py + test_planner.py + planner/ + test_subagent_passthrough.py + test_search_documentation_stub.py ``` -`fixtures.py` is the flat home for: +Cross-service planner-chain tests live under `services/global_chat/tests/service/` since `global_chat` owns the planner. -- `make_global_chat_payload`, `make_workflow_chat_payload`, `make_job_chat_payload`. -- `get_workflow_yaml_attachment`, `get_suggested_code_attachment`, `get_usage`. -- `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_yaml_equal_except`, `path_matches`, `assert_no_special_chars`. -- Pytest fixtures: `mock_anthropic`, `test_hooks_factory`, `fake_api_key`, `sample_workflow_yaml`, `anthropic_client_no_network`. -- `set_unit_test_env` (dummy keys, disable langfuse/sentry). -- `load_fixture_json`, `load_fixture_yaml`. +--- -One file until it gets unwieldy (~500 lines). Split then, not pre-emptively. +## 7. 
Shared helpers in `services/testing/` -Key fixture: +`services/testing/` is on the import path via `pythonpath = ["services"]` (same as how services do `from util import …`). No path-munging hacks. -```python -@pytest.fixture -def test_hooks_factory(): - def _factory(*, anthropic=None, **overrides): - opts = {"tool_calls": []} - if anthropic is not None: - opts["anthropic_http_client"] = anthropic.httpx_client - opts.update(overrides) - return opts - return _factory ``` +services/testing/ + __init__.py + README.md + anthropic_mock.py # MockAnthropic + tool_use helper + fixtures.py # pytest fixtures + payload builders + env setup + yaml_assertions.py # YAML structural helpers (unit-tier safe; owned by unit tier) +``` + +`fixtures.py` registers pytest fixtures via `pytest_plugins = ["testing.fixtures"]` in the root conftest. Today: just `test_hooks_factory` (builds a `test_hooks` dict, threading in a `MockAnthropic`'s httpx client). Add more fixtures here as service tests need them. -Per-service `conftest.py` just exposes `pytest_plugins = ["testing.fixtures"]` (inherited from root) plus any per-service niche fixtures. +Dummy env vars (Anthropic / OpenAI / Pinecone / Langfuse keys) are set inline in the root `conftest.py` at import time, before any service module loads — `setdefault` so real keys (for integration / acceptance) win. --- ## 8. Pytest configuration -Owned initially by this tier in PR #1 (bootstrap). See overview §5 for the full block. Relevant keys: +Owned by the unit tier in PR #1. Service tier inherits everything; this section just lists what it relies on. 
```toml [tool.pytest.ini_options] -pythonpath = ["services", "."] -testpaths = ["services"] +pythonpath = ["services"] +testpaths = [ + "services/global_chat/tests", + "services/workflow_chat/tests", + "services/job_chat/tests", + "services/tools", +] python_files = ["test_*.py"] markers = [ "unit: ...", @@ -289,10 +211,10 @@ markers = [ "integration: ...", "acceptance: ...", ] -addopts = ["-ra"] +addopts = ["--strict-markers", "--strict-config", "-ra", "--tb=short"] ``` -Markers applied by filename suffix in the root `apollo/conftest.py` — authors don't decorate manually. +The root `conftest.py` walks `item.path.parts` looking for `unit`/`service`/`integration`/`acceptance` — directory IS the marker. Authors don't decorate manually. --- @@ -304,11 +226,11 @@ Service runs in the same `tests.yaml` workflow as unit, via `pytest -m "unit or ## 10. Migration recipe for existing `pass_fail` tests -1. **Classify the assertion.** Content-sensitive (`"response mentions Salesforce"`) → integration or acceptance. Structural (`"workflow_yaml has 2 jobs"`) → service (with a canned mock producing that structure). +1. **Classify.** Content-sensitive (`"response mentions Salesforce"`) → integration / acceptance. Structural (`"workflow_yaml has 2 jobs"`) → service. 2. **Replace the call site.** Swap `subprocess.run([..., "entry.py", ...])` for `from . import main; main(payload, test_hooks)`. -3. **Build the mock.** Hand-craft an Anthropic response fixture (or a script for planner multi-turn) that produces the shape under test. -4. **Assert on structure + breadcrumbs.** Replace content asserts with routing / shape asserts; keep content in acceptance. -5. **Delete the old test** once the new one is stable. +3. **Build the mock.** Register `set_response(pattern, response)` pairs covering each Anthropic call in the path under test. +4. **Assert on structure + breadcrumbs.** Replace content asserts with routing / shape asserts. +5. **Delete the old test** once stable. 
Expect ~50–70% of `pass_fail` tests to become service tests; the rest stay in integration. @@ -324,20 +246,20 @@ Expect ~50–70% of `pass_fail` tests to become service tests; the rest stay in | `services/global_chat/router.py` | accept `test_hooks` in `__init__`; pass into sub-agent calls | | `services/global_chat/planner.py` | accept `test_hooks`; use in `_execute_tool`; thread into subagent_caller | | `services/global_chat/subagent_caller.py` | accept `test_hooks`; pass to sub-agent `main()` | -| `services/util.py` | add `build_anthropic_client()` | +| `services/util.py` | add `build_anthropic_client()` and `record_tool_call()` | -Everywhere: backward-compatible defaults. `test_hooks is None` ⇒ existing behaviour, byte-for-byte. +Backward-compatible defaults everywhere. `test_hooks is None` ⇒ existing behaviour, byte-for-byte. --- -## 12. Extensibility — new sub-agent or tool +## 12. Extensibility **New sub-agent:** 1. `services/my_new_agent/my_new_agent.py` with `def main(data, test_hooks=None)`. 2. Every `Anthropic(...)` site uses `build_anthropic_client(api_key, test_hooks)`. 3. Thread `test_hooks` through internal calls. -4. Add `services/my_new_agent/tests/test_*_service.py`. Conftest auto-inherits. +4. Add `services/my_new_agent/tests/service/test_*.py`. **New tool in the planner:** @@ -349,24 +271,21 @@ Pattern: **one arg, one call to `record_tool_call`, one test**. No framework cha --- -## 13. What this tier deliberately does NOT do +## 13. Deliberately deferred -- **No sub-agent stub registry on day one.** Default behaviour (sub-agent runs under shared mock client) is what tests usually want. -- **No `test_hooks["seed"]` / `["disable_langfuse"]` / `["scratch"]`.** Add when a test fails without them. -- **No `pytest-asyncio`, `pytest-randomly`, or other dev deps.** Add when needed. -- **No frozen public API contract between tiers.** Shared helpers live in one package; rename when the signature improves. 
+- Sub-agent stub registry — sub-agents run under the shared mock client by default. +- `seed`, `disable_langfuse`, `scratch` keys — add when a test fails without them. +- Streaming mock support — covered by integration tier. --- -## 14. What else belongs in this tier - -Good service-test targets: +## 14. Targets for this tier -- **API key threading** — payload `api_key` ends up in `mock.last_request.headers["x-api-key"]`; absent → env var is used. -- **Cache-control regression** — planner system prompt has `cache_control: {"type": "ephemeral"}`; assert on outbound request body. -- **Context-management beta** — planner sets `context-management-2025-06-27` header and `context_management` field on every call. +- **API key threading** — payload `api_key` ends up in `mock.last_request.headers["x-api-key"]`. +- **Cache-control regression** — assert on outbound request body via `json.loads(mock.last_request.content)`. +- **Context-management beta** — header + `context_management` field on every call. - **History round-trip** — returned `history` equals input + this turn's user/assistant messages. -- **`AdaptorSpecifier` propagation** — payload `context.adaptor = "@openfn/language-http@3.1.11"` shows up in the prompt. +- **`AdaptorSpecifier` propagation** — payload `context.adaptor` shows up in the prompt. - **Retry loops** — `workflow_chat` retries once on YAML parse failure; script invalid-then-valid and assert count. - **Negative paths** — missing `content` → `ApolloError(400)`; malformed tool-use response → graceful fallback. @@ -374,4 +293,4 @@ Good service-test targets: ## Summary -`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` factory + `MockAnthropicClient` with `always`/`script`/`streaming` constructors + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs` — the last only used for `search_documentation` today). Three files in `testing/`, filename-suffix markers, shared workflow with the unit tier. 
Add sub-agent stub infrastructure the first time a test can't be written without it. +`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` and `record_tool_call(test_hooks, entry)` in `services/util.py` + `MockAnthropic` (regex → response pairs, `AssertionError` on no match) in `services/testing/anthropic_mock.py` + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs`). Tier folders auto-mark by directory. Streaming and sub-agent stubs deferred. diff --git a/agent-team-architecture-plan/5-overview.md b/agent-team-architecture-plan/5-overview.md index 36d0fa28..05617e05 100644 --- a/agent-team-architecture-plan/5-overview.md +++ b/agent-team-architecture-plan/5-overview.md @@ -40,7 +40,7 @@ apollo/ │ ├── testing/ # Shared test helpers — peer to services/, not a service itself │ ├── __init__.py -│ ├── anthropic_mock.py # MockAnthropicClient + canned response builders; documents the `test_hooks` dict keys +│ ├── anthropic_mock.py # MockAnthropic (regex → response) + tool_use helper + record_tool_call; documents the `test_hooks` dict keys │ ├── fixtures.py # pytest fixtures (mock client, test_hooks factory, payloads, yaml assertions) │ ├── server.py # apollo_server fixture + ApolloClient (sync / sse / ws) │ ├── judge.py # small LLM-as-judge helper for acceptance specs @@ -90,7 +90,7 @@ def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... `test_hooks` is a plain Python `dict` (not a formal type). Its recognised keys are documented as a docstring in `testing/anthropic_mock.py`: -- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`; threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. +- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`. 
Built by `MockAnthropic`, which matches each request's latest user message text against test-registered regex → response pairs (no match → `AssertionError`). Threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. The planner's internal tool-use loop (multiple Anthropic calls per `main()`) is covered by the same mechanism — each round has different latest-user-message text, so different regexes match. See `2-service-tests.md` §3. - `tool_calls` — a test-allocated `list[dict]` that production code appends to as breadcrumbs when present. - `tool_stubs` — a `dict[str, Callable]` keyed by tool name. The planner consults it before dispatching a tool; if a stub is registered, it's called instead of the real tool. Today only used for `search_documentation` (which otherwise hits Pinecone + OpenAI). See `2-service-tests.md` §5. @@ -178,7 +178,7 @@ No changes needed in pyproject, CI, or shared helpers. Discovery is zero-config 1. **Scaffolding.** Create `testing/` (skeleton — `anthropic_mock.py`, `fixtures.py`, `server.py`, `judge.py`), root `apollo/conftest.py`, `[tool.pytest.ini_options]` block in pyproject. One PR — unblocks everything else. 2. **Unit tier.** Migrate `services/workflow_chat/tests/test_functions.py` → `test_workflow_chat_functions_unit.py` as the worked example. Wire `tests.yaml` with just `-m unit`. Green CI on every push. -3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropicClient`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content. +3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropic`. Extend `tests.yaml` to `-m "unit or service"`. 
Migrate the first `pass_fail` test whose assertion doesn't depend on content. 4. **Integration tier.** Add `testing/server.py` (server fixture + `ApolloClient`). Create `llm-tests.yaml`. Migrate the first cross-service end-to-end test into `services/global_chat/tests/test_global_chat_integration.py`. Secrets wired. 5. **Acceptance tier.** Add `testing/judge.py` and the markdown collector hook in the root conftest. Drop the first 2–3 hero specs into `services/global_chat/tests/acceptance/`. First manual run green (`workflow_dispatch`). diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..639f527b --- /dev/null +++ b/conftest.py @@ -0,0 +1,91 @@ +"""Repo-root pytest configuration. + +- Auto-applies a tier marker (`unit` / `service` / `integration` / + `acceptance`) based on the test's path. The directory IS the marker. +- For tests marked `unit`, blocks network, subprocess, DB, and LLM client + construction so accidental I/O fails loud instead of timing out. +- Sets dummy env vars before any service module is imported so service- + tier tests can construct (mocked) Anthropic clients without real keys. +""" + +import os +from unittest.mock import patch + +import pytest + + +# Set dummy keys before any service module imports. `setdefault` so real +# keys (for integration / acceptance) win. 
+for _key, _val in ( + ("ANTHROPIC_API_KEY", "test-dummy"), + ("OPENAI_API_KEY", "test-dummy"), + ("PINECONE_API_KEY", "test-dummy"), + ("LANGFUSE_TRACING", "false"), + ("LANGFUSE_PUBLIC_KEY", "pk-test-dummy"), + ("LANGFUSE_SECRET_KEY", "sk-test-dummy"), + ("SENTRY_DSN", ""), +): + os.environ.setdefault(_key, _val) + +pytest_plugins = ["testing.fixtures"] + + +_TIER_DIRS = ("unit", "service", "integration", "acceptance") + +_BLOCKED_TARGETS = ( + ("socket.socket.connect", "socket.connect()"), + ("subprocess.run", "subprocess.run()"), + ("subprocess.Popen", "subprocess.Popen()"), + ("psycopg2.connect", "psycopg2.connect()"), + # Block LLM client construction, not first request — earlier failure, + # easier to trace. + ("anthropic.Anthropic.__init__", "anthropic.Anthropic()"), + ("anthropic.AsyncAnthropic.__init__", "anthropic.AsyncAnthropic()"), + ("openai.OpenAI.__init__", "openai.OpenAI()"), + ("openai.AsyncOpenAI.__init__", "openai.AsyncOpenAI()"), +) + + +class UnitTestViolation(RuntimeError): + """Raised when a unit test attempts a forbidden operation.""" + + +def _make_blocker(operation): + def _block(*_args, **_kwargs): + raise UnitTestViolation( + f"Unit tests may not perform `{operation}`. Move this test to " + "tests/service/ or tests/integration/ if real I/O is needed. " + "See conftest.py at the repo root for the policy." 
+ ) + + return _block + + +def pytest_collection_modifyitems(items): + for item in items: + for tier in _TIER_DIRS: + if tier in item.path.parts: + item.add_marker(getattr(pytest.mark, tier)) + break + + +@pytest.fixture(autouse=True) +def _enforce_unit_isolation(request): + if "unit" not in request.keywords: + yield + return + + patches = [] + for target, label in _BLOCKED_TARGETS: + try: + p = patch(target, side_effect=_make_blocker(label)) + p.start() + patches.append(p) + except (AttributeError, ModuleNotFoundError): + continue + + try: + yield + finally: + for p in patches: + p.stop() diff --git a/pyproject.toml b/pyproject.toml index 1aa8bc19..ea648bd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,43 @@ pytest = "^8.3.4" requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" +[tool.pytest.ini_options] +# Make `services/` importable the same way `services/entry.py` does it. +# Without this, `from workflow_chat.workflow_chat import ...` fails inside tests. +pythonpath = ["services"] + +# Discovery roots. pytest walks these for test_*.py files. +testpaths = [ + "services/global_chat/tests", + "services/workflow_chat/tests", + "services/job_chat/tests", + "services/tools", +] + +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + +markers = [ + "unit: fast, isolated, no I/O. Runs on every PR push.", + "service: mocks HTTP/LLM clients; exercises service handlers. Runs on merge.", + "integration: hits real external services (LLM, Pinecone, Postgres). Manual/nightly.", + "acceptance: end-to-end acceptance criteria. 
Manual/nightly.", +] + +addopts = [ + "--strict-markers", + "--strict-config", + "-ra", + "--tb=short", +] + +filterwarnings = [ + "default::DeprecationWarning", + "ignore::DeprecationWarning:anthropic.*", + "ignore::DeprecationWarning:pydantic.*", +] + [tool.black] line-length = 120 diff --git a/services/entry.py b/services/entry.py index e9814fe5..886f7824 100644 --- a/services/entry.py +++ b/services/entry.py @@ -3,6 +3,7 @@ import json import uuid import argparse + from dotenv import load_dotenv import sentry_sdk from util import set_apollo_port, ApolloError diff --git a/services/global_chat/global_chat.py b/services/global_chat/global_chat.py index b48ed48b..ed0acac3 100644 --- a/services/global_chat/global_chat.py +++ b/services/global_chat/global_chat.py @@ -60,12 +60,13 @@ def get_stream(self) -> bool: @observe(name="global_chat", capture_input=False) -def main(data_dict: dict) -> dict: +def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: """ Main entry point for global agent service. Args: data_dict: Input payload as dictionary + test_hooks: Optional test-only dict; see testing/anthropic_mock.py. Returns: Response dictionary with response, attachments, history, usage, meta @@ -93,7 +94,7 @@ def main(data_dict: dict) -> dict: config_loader = ConfigLoader("config.yaml") # 3. Initialize router - router = RouterAgent(config_loader, data.api_key) + router = RouterAgent(config_loader, data.api_key, test_hooks=test_hooks) # 4. 
Route and execute result = router.route_and_execute( diff --git a/services/global_chat/planner.py b/services/global_chat/planner.py index a5fe2b46..a82aade8 100644 --- a/services/global_chat/planner.py +++ b/services/global_chat/planner.py @@ -4,14 +4,13 @@ import os from typing import List, Dict, Optional from dataclasses import dataclass -from anthropic import Anthropic import sys from pathlib import Path sys.path.append(str(Path(__file__).parent.parent)) from langfuse import observe -from util import create_logger, ApolloError, sum_usage +from util import create_logger, ApolloError, sum_usage, build_anthropic_client, record_tool_call from streaming_util import StreamManager from global_chat.config_loader import ConfigLoader from models import resolve_model @@ -38,14 +37,20 @@ class PlannerAgent: Planner agent that coordinates subagents and tools for complex multi-step tasks. """ - def __init__(self, config_loader: ConfigLoader, api_key: Optional[str] = None): + def __init__( + self, + config_loader: ConfigLoader, + api_key: Optional[str] = None, + test_hooks: Optional[dict] = None, + ): self.config_loader = config_loader self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY") + self._test_hooks = test_hooks if not self.api_key: raise ApolloError(500, "ANTHROPIC_API_KEY not found") - self.client = Anthropic(api_key=self.api_key) + self.client = build_anthropic_client(self.api_key, test_hooks) self.tools = TOOL_DEFINITIONS planner_config = config_loader.config.get("planner", {}) @@ -237,6 +242,14 @@ def _find_all_tool_uses(self, content): def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str: """Execute a single tool call and return the result string.""" + record_tool_call(self._test_hooks, {"tool": tool_use_block.name, "input": tool_use_block.input}) + + stub = (self._test_hooks or {}).get("tool_stubs", {}).get(tool_use_block.name) + if stub is not None: + tool_result = stub(tool_use_block.input) + tool_calls_meta.append({"tool": 
tool_use_block.name, "input": tool_use_block.input}) + return tool_result + if tool_use_block.name == "search_documentation": tool_result = search_documentation_tool(tool_use_block.input) @@ -252,6 +265,7 @@ def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str: api_key=self.api_key, user=self._user, metrics_opt_in=self._metrics_opt_in, + test_hooks=self._test_hooks, ) if "usage" in subagent_result: @@ -296,6 +310,7 @@ def _execute_tool(self, tool_use_block, total_usage, tool_calls_meta) -> str: api_key=self.api_key, user=self._user, metrics_opt_in=self._metrics_opt_in, + test_hooks=self._test_hooks, ) if "usage" in subagent_result: diff --git a/services/global_chat/router.py b/services/global_chat/router.py index b419ef58..168c2a26 100644 --- a/services/global_chat/router.py +++ b/services/global_chat/router.py @@ -7,7 +7,6 @@ import json from typing import List, Dict, Optional from dataclasses import dataclass -from anthropic import Anthropic # Import utilities from parent services directory import sys @@ -15,7 +14,7 @@ sys.path.append(str(Path(__file__).parent.parent)) from langfuse import observe -from util import create_logger, ApolloError, sum_usage +from util import create_logger, ApolloError, sum_usage, build_anthropic_client, record_tool_call from global_chat.config_loader import ConfigLoader from models import resolve_model from global_chat.yaml_utils import get_step_name_from_page, find_job_in_yaml, stitch_job_code @@ -51,14 +50,20 @@ class RouterAgent: - planner (for complex multi-step tasks) """ - def __init__(self, config_loader: ConfigLoader, api_key: Optional[str] = None): + def __init__( + self, + config_loader: ConfigLoader, + api_key: Optional[str] = None, + test_hooks: Optional[dict] = None, + ): self.config_loader = config_loader self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY") + self._test_hooks = test_hooks if not self.api_key: raise ApolloError(500, "ANTHROPIC_API_KEY not found") - self.client = 
Anthropic(api_key=self.api_key) + self.client = build_anthropic_client(self.api_key, test_hooks) router_config = config_loader.config.get("router", {}) self.model = resolve_model(router_config.get("model", "claude-haiku")) @@ -113,6 +118,11 @@ def route_and_execute( logger.warning(f"Routing decision failed: {e}. Defaulting to planner for safety.") decision = RouterDecision(destination="planner", confidence=1) + record_tool_call(self._test_hooks, { + "tool": "router_decision", + "input": {"destination": decision.destination, "job_key": decision.job_key}, + }) + if decision.destination == "workflow_agent": result = self._route_to_workflow_chat(content, workflow_yaml, history, stream, decision.confidence) elif decision.destination == "job_code_agent": @@ -245,7 +255,7 @@ def _route_to_workflow_chat( "metrics_opt_in": self._metrics_opt_in, } - result = workflow_chat_main(payload) + result = workflow_chat_main(payload, test_hooks=self._test_hooks) total_usage = sum_usage(self.routing_usage, result["usage"]) attachments = [] @@ -318,7 +328,7 @@ def _route_to_job_chat( "metrics_opt_in": self._metrics_opt_in, } - result = job_chat_main(payload) + result = job_chat_main(payload, test_hooks=self._test_hooks) total_usage = sum_usage(self.routing_usage, result["usage"]) # Stitch suggested_code back into workflow YAML @@ -365,7 +375,7 @@ def _route_to_planner( clean_history = [{"role": t["role"], "content": t["content"]} for t in history] enriched_content = self._format_attachments_for_content(content) - planner = PlannerAgent(self.config_loader, self.api_key) + planner = PlannerAgent(self.config_loader, self.api_key, test_hooks=self._test_hooks) planner_result = planner.run( content=enriched_content, workflow_yaml=workflow_yaml, diff --git a/services/global_chat/subagent_caller.py b/services/global_chat/subagent_caller.py index a5dbe435..23c4c6c9 100644 --- a/services/global_chat/subagent_caller.py +++ b/services/global_chat/subagent_caller.py @@ -24,6 +24,7 @@ def 
call_workflow_agent( api_key: Optional[str] = None, user: Optional[Dict] = None, metrics_opt_in: Optional[bool] = None, + test_hooks: Optional[dict] = None, ) -> Dict: """ Call the workflow agent and return its results. @@ -52,7 +53,7 @@ def call_workflow_agent( try: from workflow_chat.workflow_chat import main as workflow_chat_main - result = workflow_chat_main(workflow_payload) + result = workflow_chat_main(workflow_payload, test_hooks=test_hooks) response_preview = result.get("response", "")[:120] logger.info(f"workflow_agent response: {response_preview}") @@ -75,6 +76,7 @@ def call_job_agent( api_key: Optional[str] = None, user: Optional[Dict] = None, metrics_opt_in: Optional[bool] = None, + test_hooks: Optional[dict] = None, ) -> Dict: """ Call the job code agent and return its results. @@ -119,7 +121,7 @@ def call_job_agent( try: from job_chat.job_chat import main as job_chat_main - result = job_chat_main(job_payload) + result = job_chat_main(job_payload, test_hooks=test_hooks) response_preview = result.get("response", "")[:120] logger.info(f"job_agent response: {response_preview}") diff --git a/services/job_chat/job_chat.py b/services/job_chat/job_chat.py index 767253e0..36a44a49 100644 --- a/services/job_chat/job_chat.py +++ b/services/job_chat/job_chat.py @@ -17,7 +17,7 @@ import sentry_sdk from langfuse import observe, propagate_attributes, get_client as get_langfuse_client from langfuse_util import should_track, build_tags -from util import ApolloError, create_logger, AdaptorSpecifier, add_page_prefix +from util import ApolloError, create_logger, AdaptorSpecifier, add_page_prefix, build_anthropic_client from .prompt import build_prompt, build_error_correction_prompt from .old_prompt import build_old_prompt from streaming_util import StreamManager @@ -104,12 +104,12 @@ class ChatResponse: diff: Optional[Dict[str, Any]] = None class AnthropicClient: - def __init__(self, config: Optional[ChatConfig] = None): + def __init__(self, config: Optional[ChatConfig] 
= None, test_hooks: Optional[dict] = None): self.config = config or ChatConfig() self.api_key = self.config.api_key or os.getenv("ANTHROPIC_API_KEY") if not self.api_key: raise ValueError("API key must be provided") - self.client = Anthropic(api_key=self.api_key) + self.client = build_anthropic_client(self.api_key, test_hooks) @staticmethod @@ -507,7 +507,7 @@ def sum_usage(self, *usage_objects): @observe(name="job_chat", capture_input=False) -def main(data_dict: dict) -> dict: +def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: """ Main entry point with improved error handling and input validation. """ @@ -556,7 +556,7 @@ def main(data_dict: dict) -> dict: should_refresh_rag = data.refresh_rag or user_navigated config = ChatConfig(api_key=data.api_key) if data.api_key else None - client = AnthropicClient(config) + client = AnthropicClient(config, test_hooks=test_hooks) with propagate_attributes( session_id=session_id, user_id=user_info.get("id") if tracking else None, diff --git a/services/testing/README.md b/services/testing/README.md new file mode 100644 index 00000000..8e94d6a8 --- /dev/null +++ b/services/testing/README.md @@ -0,0 +1,26 @@ +# services/testing + +Shared helpers for the test suite. + +This directory is on the Python path via `pyproject.toml` +(`pythonpath = ["services"]`), so tests import as +`from testing.anthropic_mock import MockAnthropic`. + +## Modules + +- `anthropic_mock.py` — `MockAnthropic` (httpx.MockTransport-backed) and the + `tool_use(...)` content-block helper. Service tier. +- `fixtures.py` — pytest fixtures registered via `pytest_plugins` in the + root `conftest.py`. Currently just `test_hooks_factory`. +- `yaml_assertions.py` — pure-function YAML structural assertions, safe for + every tier (unit included). Owned by the unit tier. + +## Why under `services/` and not a top-level `tests/`? + +Imports across the codebase resolve relative to `services/` (see +`CLAUDE.md` → "Python Import Patterns"). 
Putting helpers here means tests +import without path-munging hacks, the same way services do `from util +import …`. + +The Bun service auto-discovery in `platform/src/util/describe-modules.ts` +skips this directory because it has no `testing.py` index file. diff --git a/services/testing/__init__.py b/services/testing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/testing/anthropic_mock.py b/services/testing/anthropic_mock.py new file mode 100644 index 00000000..3335bf95 --- /dev/null +++ b/services/testing/anthropic_mock.py @@ -0,0 +1,127 @@ +"""Mock Anthropic HTTP client for service tests. + +The `test_hooks` dict accepts (all optional; all default to absent): + +- "anthropic_http_client": an httpx.Client backed by httpx.MockTransport. + When present, threaded into every Anthropic(...) constructor site via + `services/util.py::build_anthropic_client`. +- "tool_calls": a list[dict] the test allocates. Production code appends + breadcrumbs via `services/util.py::record_tool_call`. +- "tool_stubs": dict[str, Callable] keyed by tool name. When the planner + dispatches a tool, if a stub exists for that name it's called with the + tool input and its return value is used as the tool result. Today only + used for "search_documentation". 
def tool_use(name: str, input: dict, id: str = "toolu_test") -> list[dict]:
    """Return a one-element content list holding a single `tool_use` block.

    The result can be passed directly to `MockAnthropic.set_response`.
    """
    block = {"type": "tool_use", "id": id, "name": name, "input": input}
    return [block]


def _block_texts(block):
    """Yield the text fragments carried by one content block.

    `text` blocks yield their text; `tool_result` blocks yield either their
    string content or the text of each nested `text` sub-block. Anything
    else (including non-dict entries) yields nothing.
    """
    if not isinstance(block, dict):
        return
    kind = block.get("type")
    if kind == "text":
        yield block.get("text", "")
        return
    if kind != "tool_result":
        return
    inner = block.get("content")
    if isinstance(inner, str):
        yield inner
    elif isinstance(inner, list):
        for sub in inner:
            if isinstance(sub, dict) and sub.get("type") == "text":
                yield sub.get("text", "")


def _latest_user_text(messages: list[dict]) -> str:
    """Return the text of the most recent user message, or "" if none.

    String content is returned as-is. List content is flattened: text and
    tool_result fragments are joined with newlines. Any other content type
    yields an empty string.
    """
    latest = next((m for m in reversed(messages) if m.get("role") == "user"), None)
    if latest is None:
        return ""

    content = latest.get("content")
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    return "\n".join(text for block in content for text in _block_texts(block))


def _build_message_body(response: str | list[dict]) -> dict:
    """Build the JSON body of an Anthropic message carrying `response`.

    A string becomes one text block with stop reason `end_turn`. A list is
    used verbatim as the content; the stop reason is `tool_use` when any
    block in it is a `tool_use` block, otherwise `end_turn`.
    """
    if isinstance(response, str):
        content_blocks = [{"type": "text", "text": response}]
        wants_tool = False
    else:
        content_blocks = response
        wants_tool = False
        for block in response:
            if block.get("type") == "tool_use":
                wants_tool = True
                break

    usage = {
        "input_tokens": 1,
        "output_tokens": 1,
        "cache_creation_input_tokens": 0,
        "cache_read_input_tokens": 0,
    }
    message_id = f"msg_{uuid.uuid4().hex[:24]}"
    return {
        "id": message_id,
        "type": "message",
        "role": "assistant",
        "model": "claude-mock",
        "content": content_blocks,
        "stop_reason": "tool_use" if wants_tool else "end_turn",
        "stop_sequence": None,
        "usage": usage,
    }


class MockAnthropic:
    """In-process stand-in for the Anthropic API, served via httpx.MockTransport.

    Tests register (regex, response) pairs. For every incoming request the
    text of the latest user message (tool_result content included) is
    searched with each regex in registration order, and the first hit
    supplies the reply. If nothing matches, the handler raises
    AssertionError.

    Usage:
        mock = MockAnthropic()
        mock.set_response(r"haiku", "sure, here's a haiku")
        mock.set_response(r"create workflow", tool_use("call_workflow_agent", {...}))
        test_hooks = test_hooks_factory(anthropic=mock)
        main(payload, test_hooks)
        assert mock.last_request.headers["x-api-key"] == "sk-test"
    """

    def __init__(self):
        # Registered (compiled pattern, response) pairs, in priority order.
        self._responses: list[tuple[re.Pattern, str | list[dict]]] = []
        # Every request seen by the transport, oldest first.
        self.requests: list[httpx.Request] = []
        transport = httpx.MockTransport(self._handle)
        self.httpx_client = httpx.Client(transport=transport)

    def set_response(self, pattern: str, response: str | list[dict]) -> None:
        """Register the reply for requests whose latest user text matches `pattern`.

        `response` may be:
        - str: sent back as a single text content block.
        - list[dict]: sent back as the content blocks (tool_use, mixed, ...).
        """
        compiled = re.compile(pattern)
        self._responses.append((compiled, response))

    @property
    def last_request(self) -> httpx.Request:
        """The most recently recorded request."""
        return self.requests[-1]

    def _handle(self, request: httpx.Request) -> httpx.Response:
        """Transport handler: record the request and answer from the registry."""
        self.requests.append(request)
        payload = json.loads(request.content)
        user_text = _latest_user_text(payload.get("messages", []))

        for regex, canned in self._responses:
            if regex.search(user_text):
                return httpx.Response(200, json=_build_message_body(canned))

        registered = [regex.pattern for regex, _ in self._responses]
        raise AssertionError(
            f"MockAnthropic: no pattern matched user message {user_text!r}. "
            f"Registered patterns: {registered}"
        )
def record_tool_call(test_hooks: Optional[dict], entry: dict) -> None:
    """Record `entry` in the test's `tool_calls` list, if one was provided.

    Does nothing when `test_hooks` is None or when it holds no `tool_calls`
    value; otherwise `entry` is appended to that list in place.
    """
    sink = None if test_hooks is None else test_hooks.get("tool_calls")
    if sink is None:
        return
    sink.append(entry)
def process_stream_event(self, event, accumulated_response, text_started, sent_l @observe(name="workflow_chat", capture_input=False) -def main(data_dict: dict) -> dict: +def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: """ Main entry point with improved error handling and input validation. """ @@ -585,7 +585,7 @@ def main(data_dict: dict) -> dict: } config = ChatConfig(api_key=data.api_key) if data.api_key else None - client = AnthropicClient(config) + client = AnthropicClient(config, test_hooks=test_hooks) with propagate_attributes( session_id=session_id,