Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ logs/
.tmp/
ov.conf
result/
!benchmark/openmontage/data/
!benchmark/openmontage/data/fixture.json
!benchmark/openmontage/result/
!benchmark/openmontage/result/.gitkeep

# Jupyter Notebook
.ipynb_checkpoints
Expand Down
105 changes: 105 additions & 0 deletions benchmark/openmontage/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# OpenMontage Benchmark MVP

This benchmark turns the [OpenMontage RFC](../../docs/en/about/03-roadmap.md) idea into a small,
repo-local fixture that stresses stage-scoped context handoff in OpenViking.

The benchmark is intentionally narrow:

- no dependency on the external OpenMontage repository
- one in-repo production fixture
- deterministic scoring
- no UI or long-running orchestration

## Why This Benchmark Matters

OpenMontage-like pipelines are not generic chat. They move across explicit production stages:

1. `brief`
2. `script`
3. `scene_plan`
4. `asset_manifest`
5. `render_report`

Each stage wants a different slice of context. The benchmark checks whether retrieval can stay
focused on the right artifact instead of dragging the full production history into every step.

## Layout

```text
benchmark/openmontage/
├── README.md
├── data/
│ └── fixture.json
├── import_to_ov.py
├── run_eval.py
├── scorer.py
├── test_smoke.py
└── result/
```

## Fixture Model

The fixture contains one synthetic production, `launch-video`, with five stage artifacts:

- `brief.md`
- `script.md`
- `scene_plan.md`
- `asset_manifest.md`
- `render_report.md`

It also contains a compact evaluation set. Each case defines:

- a query
- the target stage
- the artifact URI suffix that should be retrieved
- keywords that should appear in the returned evidence

## Quick Start

### 1. Import the fixture into OpenViking

Embedded mode:

```bash
python benchmark/openmontage/import_to_ov.py --mode embedded --workspace ./data/openmontage-workspace
```

HTTP mode:

```bash
python benchmark/openmontage/import_to_ov.py --mode http --url http://localhost:1933
```

### 2. Run the evaluation

```bash
python benchmark/openmontage/run_eval.py --mode embedded --workspace ./data/openmontage-workspace
```

This writes a JSON report to `benchmark/openmontage/result/openmontage_eval.json`.

### 3. Re-score an existing report (optional)

```bash
python benchmark/openmontage/scorer.py benchmark/openmontage/result/openmontage_eval.json
```

## Evaluation Strategy

The MVP deliberately avoids LLM judges. It scores retrieval deterministically:

- pass if the expected artifact URI suffix appears in the returned hits
- pass if all required keywords appear across the top retrieved evidence

That keeps this benchmark cheap, reproducible, and suitable for CI smoke usage.

## Expected Use

This benchmark is useful when you want to compare:

- stage-local retrieval quality
- hierarchy-aware directory layout decisions
- whether artifact handoff remains coherent as a project accumulates more files

It is not a full production workflow benchmark yet. The next step would be to add multiple
projects, more stage transitions, and latency/cost reporting.
78 changes: 78 additions & 0 deletions benchmark/openmontage/data/fixture.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"project_id": "launch-video",
"project_uri": "viking://resources/openmontage/launch-video",
"artifacts": [
{
"stage": "brief",
"relative_path": "01-brief/brief.md",
"title": "Launch Video Brief",
"content": "# Launch Video Brief\n\nGoal: produce a 45-second launch video for the Northstar workspace release.\n\nCreative intent:\n- calm, editorial motion\n- no handheld camera simulation\n- still frames are allowed only for the final CTA card\n\nProvider lock:\n- after approval, provider must stay on Remotion + ImageGen Alpha\n- sample-before-batch generation is mandatory before full asset production\n"
},
{
"stage": "script",
"relative_path": "02-script/script.md",
"title": "Launch Video Script",
"content": "# Script\n\nScene 1 introduces the pain of tab overload.\nScene 2 shows the workspace timeline.\nScene 3 ends with the CTA line: 'Ship context, not chaos.'\n\nThe narration must keep the phrase 'sample-before-batch' as a production rule reminder.\n"
},
{
"stage": "scene_plan",
"relative_path": "03-scene-plan/scene_plan.md",
"title": "Scene Plan",
"content": "# Scene Plan\n\nScene 1:\n- duration: 10s\n- motion required: yes\n- visual: browser tabs collapsing into a single timeline rail\n\nScene 2:\n- duration: 18s\n- motion required: yes\n- visual: cards snapping into a production board\n\nScene 3:\n- duration: 17s\n- motion required: no\n- visual: still CTA card with product lockup\n"
},
{
"stage": "asset_manifest",
"relative_path": "04-asset-manifest/asset_manifest.md",
"title": "Asset Manifest",
"content": "# Asset Manifest\n\nRequired assets:\n- 3 product UI stills\n- 2 gradient backplates\n- 1 voiceover track\n- 1 logo lockup SVG\n\nBatching rule:\n- generate one approved keyframe sample before producing the full image batch\n- keep provider fixed to ImageGen Alpha after approval\n"
},
{
"stage": "render_report",
"relative_path": "05-render-report/render_report.md",
"title": "Render Report",
"content": "# Render Report\n\nTerminal output: final Remotion render at 1920x1080, 24fps.\n\nChecks:\n- provider remained locked after approval\n- still image used only on the final CTA card\n- render completed without missing assets\n"
}
],
"evaluations": [
{
"id": "brief-provider-lock",
"stage": "brief",
"query": "Which provider stack stays locked after approval for the launch video?",
"target_uri": "viking://resources/openmontage/launch-video/01-brief",
"expected_uri_suffix": "01-brief/brief.md",
"expected_keywords": ["Remotion", "ImageGen Alpha", "provider"]
},
{
"id": "script-cta-line",
"stage": "script",
"query": "What is the final CTA line in the approved script?",
"target_uri": "viking://resources/openmontage/launch-video/02-script",
"expected_uri_suffix": "02-script/script.md",
"expected_keywords": ["Ship context, not chaos"]
},
{
"id": "scene-plan-still-card",
"stage": "scene_plan",
"query": "Which scene is allowed to stay still instead of requiring motion?",
"target_uri": "viking://resources/openmontage/launch-video/03-scene-plan",
"expected_uri_suffix": "03-scene-plan/scene_plan.md",
"expected_keywords": ["Scene 3", "motion required: no", "CTA"]
},
{
"id": "asset-manifest-sample-rule",
"stage": "asset_manifest",
"query": "What batching rule applies before generating the full image set?",
"target_uri": "viking://resources/openmontage/launch-video/04-asset-manifest",
"expected_uri_suffix": "04-asset-manifest/asset_manifest.md",
"expected_keywords": ["approved keyframe sample", "full image batch"]
},
{
"id": "render-terminal-output",
"stage": "render_report",
"query": "What is the terminal output deliverable for the pipeline?",
"target_uri": "viking://resources/openmontage/launch-video/05-render-report",
"expected_uri_suffix": "05-render-report/render_report.md",
"expected_keywords": ["final Remotion render", "1920x1080", "24fps"]
}
]
}
74 changes: 74 additions & 0 deletions benchmark/openmontage/import_to_ov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""Import the OpenMontage benchmark fixture into OpenViking."""

from __future__ import annotations

import argparse
import json
import shutil
import sys
import tempfile
from pathlib import Path


# Directory containing this script; the fixture JSON is resolved relative to it
# so the importer works regardless of the current working directory.
ROOT = Path(__file__).resolve().parent
FIXTURE_PATH = ROOT / "data" / "fixture.json"


def load_fixture() -> dict:
    """Read and parse the benchmark fixture JSON shipped beside this script."""
    raw = FIXTURE_PATH.read_text(encoding="utf-8")
    return json.loads(raw)


def build_fixture_tree(base_dir: Path, fixture: dict) -> Path:
    """Materialize the fixture's stage artifacts on disk.

    Each artifact's ``content`` is written under ``base_dir/<project_id>`` at
    its declared ``relative_path``, creating intermediate directories as
    needed. Returns the project directory.
    """
    project_dir = base_dir / fixture["project_id"]
    for entry in fixture["artifacts"]:
        target = project_dir / entry["relative_path"]
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(entry["content"], encoding="utf-8")
    return project_dir


def build_client(mode: str, workspace: str | None, url: str | None):
    """Construct an OpenViking client for the requested transport.

    ``http`` mode returns a synchronous HTTP client pointed at *url*;
    any other mode opens (and initializes) an embedded workspace at
    *workspace*. Both arguments fall back to local defaults when None.
    """
    import openviking as ov

    if mode == "http":
        return ov.SyncHTTPClient(url=url or "http://localhost:1933")
    embedded = ov.OpenViking(path=workspace or "./data/openmontage-workspace")
    embedded.initialize()
    return embedded


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the fixture importer."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--mode", choices=["embedded", "http"], default="embedded")
    cli.add_argument("--workspace", help="Embedded-mode workspace path")
    cli.add_argument("--url", help="HTTP server base URL")
    cli.add_argument(
        "--keep-temp",
        action="store_true",
        help="Keep the generated fixture directory instead of deleting it",
    )
    return cli.parse_args()


def main() -> int:
    """Build the fixture tree, import it into OpenViking, and print the result.

    Returns 0 on success and 1 on any failure. The temporary fixture tree
    is removed afterwards unless ``--keep-temp`` was given.
    """
    args = parse_args()
    fixture = load_fixture()
    staging = Path(tempfile.mkdtemp(prefix="openmontage_fixture_"))
    try:
        project_dir = build_fixture_tree(staging, fixture)
        client = build_client(args.mode, args.workspace, args.url)
        outcome = client.add_resource(path=str(project_dir), wait=True)
        payload = {"project_uri": fixture["project_uri"], "import_result": outcome}
        print(json.dumps(payload, indent=2))
        return 0
    except Exception as exc:
        print(f"import failed: {exc}", file=sys.stderr)
        return 1
    finally:
        # Clean up the staging tree even on failure, unless the user asked to keep it.
        if not args.keep_temp and staging.exists():
            shutil.rmtree(staging, ignore_errors=True)


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
1 change: 1 addition & 0 deletions benchmark/openmontage/result/.gitkeep
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

97 changes: 97 additions & 0 deletions benchmark/openmontage/run_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Run the OpenMontage retrieval benchmark against an OpenViking instance."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

from scorer import score_report


# Paths are resolved relative to this script so the benchmark runs from any CWD.
ROOT = Path(__file__).resolve().parent
FIXTURE_PATH = ROOT / "data" / "fixture.json"
DEFAULT_OUTPUT = ROOT / "result" / "openmontage_eval.json"


def load_fixture() -> dict:
    """Deserialize the benchmark fixture JSON bundled with this benchmark."""
    text = FIXTURE_PATH.read_text(encoding="utf-8")
    return json.loads(text)


def build_client(mode: str, workspace: str | None, url: str | None):
    """Create the OpenViking client used to answer benchmark queries.

    HTTP mode connects to a running server at *url*; otherwise an embedded
    workspace at *workspace* is opened and initialized. Missing arguments
    fall back to local defaults.
    """
    import openviking as ov

    if mode == "http":
        return ov.SyncHTTPClient(url=url or "http://localhost:1933")
    embedded = ov.OpenViking(path=workspace or "./data/openmontage-workspace")
    embedded.initialize()
    return embedded


def serialize_find_results(results) -> list[dict]:
    """Flatten a ``find()`` response into plain JSON-serializable dicts.

    Missing or falsy attributes degrade gracefully: ``uri`` defaults to
    ``""``, ``score`` to ``None``, and the text fields to ``""`` so the
    resulting report can always be dumped to JSON.

    Args:
        results: A find() response object; only its ``resources`` attribute
            (an iterable of hit objects) is read. Any object lacking it —
            or with a falsy value — yields an empty list.

    Returns:
        One dict per hit with keys ``uri``, ``score``, ``abstract``,
        and ``overview``.
    """
    resources = getattr(results, "resources", []) or []
    # A comprehension replaces the original manual append loop (same output).
    return [
        {
            "uri": getattr(item, "uri", ""),
            "score": getattr(item, "score", None),
            "abstract": getattr(item, "abstract", "") or "",
            "overview": getattr(item, "overview", "") or "",
        }
        for item in resources
    ]


def run_cases(client, fixture: dict, limit: int) -> dict:
    """Execute every evaluation query and collect the raw retrieval hits.

    Each case from ``fixture["evaluations"]`` is copied into the report with
    an added ``hits`` list holding the serialized top-*limit* find() results.
    """
    case_rows = []
    for case in fixture["evaluations"]:
        found = client.find(query=case["query"], target_uri=case["target_uri"], limit=limit)
        row = dict(case)
        row["hits"] = serialize_find_results(found)
        case_rows.append(row)
    return {
        "project_id": fixture["project_id"],
        "project_uri": fixture["project_uri"],
        "cases": case_rows,
    }


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark runner."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--mode", choices=["embedded", "http"], default="embedded")
    cli.add_argument("--workspace", help="Embedded-mode workspace path")
    cli.add_argument("--url", help="HTTP server base URL")
    cli.add_argument("--limit", type=int, default=3, help="Top-k retrieval hits per query")
    cli.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSON path")
    return cli.parse_args()


def main() -> int:
    """Run all benchmark cases, score them, and persist the combined report.

    Writes ``{"report": ..., "score": ...}`` to ``--output`` and echoes the
    score summary to stdout. Returns 0 on success and 1 on any failure.
    """
    args = parse_args()
    fixture = load_fixture()
    try:
        client = build_client(args.mode, args.workspace, args.url)
        report = run_cases(client, fixture, args.limit)
        scored = score_report(report)
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"report": report, "score": scored}, indent=2)
        destination.write_text(payload, encoding="utf-8")
        print(json.dumps(scored, indent=2))
        return 0
    except Exception as exc:
        print(f"benchmark failed: {exc}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
Loading
Loading