Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions session_memory_summary_diff_report.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"allowed_diff_count": 0,
"allowed_diffs": [],
"backend_pairs": [
"inmemory_vs_sqlite"
],
"backend_statuses": [
{
"name": "inmemory",
"reason": "",
"status": "ok"
},
{
"name": "sqlite",
"reason": "",
"status": "ok"
},
{
"name": "external_sql",
"reason": "TRPC_AGENT_REPLAY_SQL_URL is not set",
"status": "skipped"
},
{
"name": "redis",
"reason": "TRPC_AGENT_REPLAY_REDIS_URL is not set",
"status": "skipped"
}
],
"case_count": 14,
"cases": [
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "single_turn_text",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "multi_turn_append_order",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "tool_call_roundtrip",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "scoped_state_overwrite",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "memory_preference_search",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "memory_multi_session_isolation",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "summary_generation",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "summary_update_overwrite",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "summary_with_event_truncation",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "duplicate_or_error_recovery",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "serialization_order_nested_payload",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "list_sessions_consistency",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "state_temp_key_ignored_but_persistent_key_compared",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
},
{
"allowed_diff_count": 0,
"backend_pair": "inmemory_vs_sqlite",
"elapsed_ms": 0,
"name": "summary_truncation_preserves_recent_context",
"unallowed_diff_count": 0,
"unexpected_diff_count": 0
}
],
"diffs": [],
"false_positive_summary": {
"normal_case_count": 14,
"unexpected_diff_count": 0
},
"generated_at": "deterministic",
"generated_by": "tests/sessions/test_replay_consistency.py",
"mutation_summary": {
"detected_count": 0,
"mutation_count": 0,
"undetected_mutations": []
},
"report_kind": "normal_replay",
"schema_version": 1,
"unallowed_diff_count": 0,
"unallowed_diffs": [],
"unexpected_diff_count": 0
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{"name":"single_turn_text","description":"Two text events preserve order, author, role, and text.","operations":["create_session","append user text","append assistant text","store_session","search_memory"],"expected_risks":["basic event ordering drift","text/content normalization regression"]}
{"name":"multi_turn_append_order","description":"Three user/assistant turns preserve append order and invocation ids.","operations":["create_session","append six text events","store_session","search_memory"],"expected_risks":["event reorder","invocation_id mismatch","author/text drift"]}
{"name":"tool_call_roundtrip","description":"Weather tool call, tool response, and final assistant text round-trip across backends.","operations":["create_session","append user request","append function_call","append function_response","append final text"],"expected_risks":["tool args drift","tool response drift","tool event visibility loss"]}
{"name":"scoped_state_overwrite","description":"Session, user, and app state overwrites are strict while temp state is not persisted.","operations":["create_session with state","append state deltas","assert no persisted temp state"],"expected_risks":["state overwrite drift","temp state leakage","persistent key loss"]}
{"name":"memory_preference_search","description":"Preference text is stored through store_session and found by memory search.","operations":["create_session","append preference events","store_session","search tea/hiking/vegetarian"],"expected_risks":["memory content loss","memory author drift","search result normalization drift"]}
{"name":"memory_multi_session_isolation","description":"User A memory search must not return overlapping User B memory containing city-museums-b.","operations":["create session A","create session B","store both sessions","search A save_key"],"expected_risks":["save_key isolation failure","cross-user memory leak","query overlap leakage"]}
{"name":"summary_generation","description":"Manual summary creation yields summary text, summary event, and manager metadata.","operations":["create_session","append long conversation","create_session_summary","get_session_summary"],"expected_risks":["missing summary","summary event flag loss","summary session mismatch"]}
{"name":"summary_update_overwrite","description":"A later summary overwrites the cached summary and includes new release checklist context.","operations":["create_session","append first summary window","create summary","append new events","create summary again"],"expected_risks":["stale summary reuse","latest summary drift","summary wrong session"]}
{"name":"summary_with_event_truncation","description":"Summary truncation stores historical events and keeps recent plus post-summary events active.","operations":["create_session","append events","create summary","append post-summary event"],"expected_risks":["historical event loss","summary anchor loss","recent context loss"]}
{"name":"duplicate_or_error_recovery","description":"Duplicate content, retry error metadata, and recovery event are preserved.","operations":["create_session","append duplicate text events","append retry error event","append recovery event","store_session"],"expected_risks":["duplicate drop/reorder","error metadata drift","recovery event loss"]}
{"name":"serialization_order_nested_payload","description":"Nested dict/list tool payloads are canonicalized by order but strict on value changes.","operations":["create_session","append nested function_call","append nested function_response","append final text"],"expected_risks":["serialization order false positive","nested value drift","tool response loss"]}
{"name":"list_sessions_consistency","description":"list_sessions returns normalized id, app, user, and state consistently.","operations":["create_session with state","append state update","list_sessions"],"expected_risks":["list_sessions state omission","backend-specific list shape drift"]}
{"name":"state_temp_key_ignored_but_persistent_key_compared","description":"temp:* state is ignored for snapshots but rejected in raw persisted sessions.","operations":["create_session with persistent state","append temp and persistent deltas","assert raw temp not persisted"],"expected_risks":["temp key persistence","business state false allow","state value drift"]}
{"name":"summary_truncation_preserves_recent_context","description":"Summary truncation preserves historical events and recent rainy-day context before a post-summary append.","operations":["create_session","append context events","create summary","append canal walk"],"expected_risks":["recent context truncation","historical_events missing","summary event ordering drift"]}
38 changes: 38 additions & 0 deletions tests/sessions/replay_consistency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Replay Consistency Harness

This harness verifies that session, memory, and summary behavior replays
consistently across storage backends. Python `ReplayCase` fixtures define the
executable operation DSL: create a session, append deterministic events, create
summaries at fixed points, store the final session to memory, run memory
queries, and normalize the resulting snapshot. The JSONL manifest in
`tests/sessions/replay_cases/session_memory_summary_replay_cases.jsonl`
mirrors the registry for review readability and is checked by tests.

The default CI matrix is intentionally light: InMemory plus temporary SQLite
files under `tmp_path`. Optional integration backends are only enabled through
environment variables: `TRPC_AGENT_REPLAY_SQL_URL` for an external SQL backend
and `TRPC_AGENT_REPLAY_REDIS_URL` for Redis. When these variables are absent,
the report records the backend as skipped instead of making CI depend on
external services.

Normalization removes or canonicalizes non-semantic variance: exact timestamps,
summary timestamp values, auto-generated event ids, dict serialization order,
and memory timestamp values. Fixture event ids are preserved so duplicate,
retry, and wrong-id regressions remain visible. Memory search results are
sorted by stable content keys because ranking differs by backend.

Allowed diffs are deliberately narrow: backend name, raw timestamp values, and
timestamp presence. Event order, author/role/text, tool arguments and results,
state values, memory scope/content, summary text, latest-summary overwrite
semantics, summary session id, summary event flags, and historical events are
compared strictly. Summary text is whitespace-normalized but not semantically
relaxed.

Diff reports include backend statuses, normal replay false-positive counts,
mutation detection summaries, and structured diff entries with case, backend,
session, event, memory, summary, path, left, and right fields. Tests write
runtime reports to `tmp_path`; the repository-root
`session_memory_summary_diff_report.json` is only a deterministic schema
example. The checked-in mutation example lives at
`tests/sessions/replay_consistency/session_memory_summary_mutation_report.json`.
It is generated from the same registry that mutates event text, tool
arguments/results, state, memory text, summaries, and retry error/recovery
events on real normalized replay snapshots.
35 changes: 35 additions & 0 deletions tests/sessions/replay_consistency/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Replay consistency harness for session, memory, and summary backends."""

from .backends import BackendBundle
from .backends import build_backends
from .backends import DeterministicSessionSummarizer
from .cases import EventSpec
from .cases import MemoryQuerySpec
from .cases import ReplayCase
from .cases import replay_cases
from .comparator import DiffEntry
from .comparator import compare_snapshot_pair
from .comparator import recursive_diff
from .mutations import mutate_snapshot
from .mutations import mutations_for_case
from .normalizer import Snapshot
from .normalizer import normalize_snapshot
from .report import write_report

# Public names of this package. Every entry is re-exported by one of the
# relative imports above; entries are grouped by the submodule they come from.
__all__ = [
# from .backends
"BackendBundle",
"build_backends",
"DeterministicSessionSummarizer",
# from .cases
"EventSpec",
"MemoryQuerySpec",
"ReplayCase",
"replay_cases",
# from .comparator
"DiffEntry",
"compare_snapshot_pair",
"recursive_diff",
# from .mutations
"mutate_snapshot",
"mutations_for_case",
# from .normalizer
"Snapshot",
"normalize_snapshot",
# from .report
"write_report",
]
Loading
Loading