diff --git a/.gitignore b/.gitignore index 82a3ca9c7..d6ae1f411 100644 --- a/.gitignore +++ b/.gitignore @@ -153,6 +153,10 @@ logs/ .tmp/ ov.conf result/ +!benchmark/openmontage/data/ +!benchmark/openmontage/data/fixture.json +!benchmark/openmontage/result/ +!benchmark/openmontage/result/.gitkeep # Jupyter Notebook .ipynb_checkpoints diff --git a/benchmark/openmontage/README.md b/benchmark/openmontage/README.md new file mode 100644 index 000000000..4179ffac2 --- /dev/null +++ b/benchmark/openmontage/README.md @@ -0,0 +1,105 @@ +# OpenMontage Benchmark MVP + +This benchmark turns the [OpenMontage RFC](../../docs/en/about/03-roadmap.md) idea into a small, +repo-local fixture that stresses stage-scoped context handoff in OpenViking. + +The benchmark is intentionally narrow: + +- no dependency on the external OpenMontage repository +- one in-repo production fixture +- deterministic scoring +- no UI or long-running orchestration + +## Why This Benchmark Matters + +OpenMontage-like pipelines are not generic chat. They move across explicit production stages: + +1. `brief` +2. `script` +3. `scene_plan` +4. `asset_manifest` +5. `render_report` + +Each stage wants a different slice of context. The benchmark checks whether retrieval can stay +focused on the right artifact instead of dragging the full production history into every step. + +## Layout + +```text +benchmark/openmontage/ +├── README.md +├── data/ +│ └── fixture.json +├── import_to_ov.py +├── run_eval.py +├── scorer.py +├── test_smoke.py +└── result/ +``` + +## Fixture Model + +The fixture contains one synthetic production, `launch-video`, with five stage artifacts: + +- `brief.md` +- `script.md` +- `scene_plan.md` +- `asset_manifest.md` +- `render_report.md` + +It also contains a compact evaluation set. Each case defines: + +- a query +- the target stage +- the artifact URI suffix that should be retrieved +- keywords that should appear in the returned evidence + +## Quick Start + +### 1. 
Import the fixture into OpenViking + +Embedded mode: + +```bash +python benchmark/openmontage/import_to_ov.py --mode embedded --workspace ./data/openmontage-workspace +``` + +HTTP mode: + +```bash +python benchmark/openmontage/import_to_ov.py --mode http --url http://localhost:1933 +``` + +### 2. Run the evaluation + +```bash +python benchmark/openmontage/run_eval.py --mode embedded --workspace ./data/openmontage-workspace +``` + +This writes a JSON report to `benchmark/openmontage/result/openmontage_eval.json`. + +### 3. Score the results again if needed + +```bash +python benchmark/openmontage/scorer.py benchmark/openmontage/result/openmontage_eval.json +``` + +## Evaluation Strategy + +The MVP deliberately avoids LLM judges. It scores retrieval deterministically: + +- pass if the expected artifact URI suffix appears in the returned hits +- pass if all required keywords appear across the top retrieved evidence + +That keeps this benchmark cheap, reproducible, and suitable for CI smoke usage. + +## Expected Use + +This benchmark is useful when you want to compare: + +- stage-local retrieval quality +- hierarchy-aware directory layout decisions +- whether artifact handoff remains coherent as a project accumulates more files + +It is not a full production workflow benchmark yet. The next step would be to add multiple +projects, more stage transitions, and latency/cost reporting. 
diff --git a/benchmark/openmontage/data/fixture.json b/benchmark/openmontage/data/fixture.json new file mode 100644 index 000000000..8e6573cb8 --- /dev/null +++ b/benchmark/openmontage/data/fixture.json @@ -0,0 +1,78 @@ +{ + "project_id": "launch-video", + "project_uri": "viking://resources/openmontage/launch-video", + "artifacts": [ + { + "stage": "brief", + "relative_path": "01-brief/brief.md", + "title": "Launch Video Brief", + "content": "# Launch Video Brief\n\nGoal: produce a 45-second launch video for the Northstar workspace release.\n\nCreative intent:\n- calm, editorial motion\n- no handheld camera simulation\n- still frames are allowed only for the final CTA card\n\nProvider lock:\n- after approval, provider must stay on Remotion + ImageGen Alpha\n- sample-before-batch generation is mandatory before full asset production\n" + }, + { + "stage": "script", + "relative_path": "02-script/script.md", + "title": "Launch Video Script", + "content": "# Script\n\nScene 1 introduces the pain of tab overload.\nScene 2 shows the workspace timeline.\nScene 3 ends with the CTA line: 'Ship context, not chaos.'\n\nThe narration must keep the phrase 'sample-before-batch' as a production rule reminder.\n" + }, + { + "stage": "scene_plan", + "relative_path": "03-scene-plan/scene_plan.md", + "title": "Scene Plan", + "content": "# Scene Plan\n\nScene 1:\n- duration: 10s\n- motion required: yes\n- visual: browser tabs collapsing into a single timeline rail\n\nScene 2:\n- duration: 18s\n- motion required: yes\n- visual: cards snapping into a production board\n\nScene 3:\n- duration: 17s\n- motion required: no\n- visual: still CTA card with product lockup\n" + }, + { + "stage": "asset_manifest", + "relative_path": "04-asset-manifest/asset_manifest.md", + "title": "Asset Manifest", + "content": "# Asset Manifest\n\nRequired assets:\n- 3 product UI stills\n- 2 gradient backplates\n- 1 voiceover track\n- 1 logo lockup SVG\n\nBatching rule:\n- generate one approved keyframe sample 
before producing the full image batch\n- keep provider fixed to ImageGen Alpha after approval\n" + }, + { + "stage": "render_report", + "relative_path": "05-render-report/render_report.md", + "title": "Render Report", + "content": "# Render Report\n\nTerminal output: final Remotion render at 1920x1080, 24fps.\n\nChecks:\n- provider remained locked after approval\n- still image used only on the final CTA card\n- render completed without missing assets\n" + } + ], + "evaluations": [ + { + "id": "brief-provider-lock", + "stage": "brief", + "query": "Which provider stack stays locked after approval for the launch video?", + "target_uri": "viking://resources/openmontage/launch-video/01-brief", + "expected_uri_suffix": "01-brief/brief.md", + "expected_keywords": ["Remotion", "ImageGen Alpha", "provider"] + }, + { + "id": "script-cta-line", + "stage": "script", + "query": "What is the final CTA line in the approved script?", + "target_uri": "viking://resources/openmontage/launch-video/02-script", + "expected_uri_suffix": "02-script/script.md", + "expected_keywords": ["Ship context, not chaos"] + }, + { + "id": "scene-plan-still-card", + "stage": "scene_plan", + "query": "Which scene is allowed to stay still instead of requiring motion?", + "target_uri": "viking://resources/openmontage/launch-video/03-scene-plan", + "expected_uri_suffix": "03-scene-plan/scene_plan.md", + "expected_keywords": ["Scene 3", "motion required: no", "CTA"] + }, + { + "id": "asset-manifest-sample-rule", + "stage": "asset_manifest", + "query": "What batching rule applies before generating the full image set?", + "target_uri": "viking://resources/openmontage/launch-video/04-asset-manifest", + "expected_uri_suffix": "04-asset-manifest/asset_manifest.md", + "expected_keywords": ["approved keyframe sample", "full image batch"] + }, + { + "id": "render-terminal-output", + "stage": "render_report", + "query": "What is the terminal output deliverable for the pipeline?", + "target_uri": 
"viking://resources/openmontage/launch-video/05-render-report", + "expected_uri_suffix": "05-render-report/render_report.md", + "expected_keywords": ["final Remotion render", "1920x1080", "24fps"] + } + ] +} diff --git a/benchmark/openmontage/import_to_ov.py b/benchmark/openmontage/import_to_ov.py new file mode 100644 index 000000000..bbbe1a6d8 --- /dev/null +++ b/benchmark/openmontage/import_to_ov.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +"""Import the OpenMontage benchmark fixture into OpenViking.""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +import tempfile +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +FIXTURE_PATH = ROOT / "data" / "fixture.json" + + +def load_fixture() -> dict: + return json.loads(FIXTURE_PATH.read_text(encoding="utf-8")) + + +def build_fixture_tree(base_dir: Path, fixture: dict) -> Path: + project_dir = base_dir / fixture["project_id"] + for artifact in fixture["artifacts"]: + path = project_dir / artifact["relative_path"] + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(artifact["content"], encoding="utf-8") + return project_dir + + +def build_client(mode: str, workspace: str | None, url: str | None): + import openviking as ov + + if mode == "http": + client = ov.SyncHTTPClient(url=url or "http://localhost:1933") + else: + client = ov.OpenViking(path=workspace or "./data/openmontage-workspace") + client.initialize() + return client + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["embedded", "http"], default="embedded") + parser.add_argument("--workspace", help="Embedded-mode workspace path") + parser.add_argument("--url", help="HTTP server base URL") + parser.add_argument( + "--keep-temp", + action="store_true", + help="Keep the generated fixture directory instead of deleting it", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + fixture 
= load_fixture() + temp_root = Path(tempfile.mkdtemp(prefix="openmontage_fixture_")) + try: + project_dir = build_fixture_tree(temp_root, fixture) + client = build_client(args.mode, args.workspace, args.url) + result = client.add_resource(path=str(project_dir), wait=True) + print(json.dumps({"project_uri": fixture["project_uri"], "import_result": result}, indent=2)) + return 0 + except Exception as exc: + print(f"import failed: {exc}", file=sys.stderr) + return 1 + finally: + if not args.keep_temp and temp_root.exists(): + shutil.rmtree(temp_root, ignore_errors=True) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/openmontage/result/.gitkeep b/benchmark/openmontage/result/.gitkeep new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/benchmark/openmontage/result/.gitkeep @@ -0,0 +1 @@ + diff --git a/benchmark/openmontage/run_eval.py b/benchmark/openmontage/run_eval.py new file mode 100644 index 000000000..279f83267 --- /dev/null +++ b/benchmark/openmontage/run_eval.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Run the OpenMontage retrieval benchmark against an OpenViking instance.""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from scorer import score_report + + +ROOT = Path(__file__).resolve().parent +FIXTURE_PATH = ROOT / "data" / "fixture.json" +DEFAULT_OUTPUT = ROOT / "result" / "openmontage_eval.json" + + +def load_fixture() -> dict: + return json.loads(FIXTURE_PATH.read_text(encoding="utf-8")) + + +def build_client(mode: str, workspace: str | None, url: str | None): + import openviking as ov + + if mode == "http": + client = ov.SyncHTTPClient(url=url or "http://localhost:1933") + else: + client = ov.OpenViking(path=workspace or "./data/openmontage-workspace") + client.initialize() + return client + + +def serialize_find_results(results) -> list[dict]: + resources = getattr(results, "resources", []) or [] + serialized = [] + for item in 
resources: + serialized.append( + { + "uri": getattr(item, "uri", ""), + "score": getattr(item, "score", None), + "abstract": getattr(item, "abstract", "") or "", + "overview": getattr(item, "overview", "") or "", + } + ) + return serialized + + +def run_cases(client, fixture: dict, limit: int) -> dict: + report = { + "project_id": fixture["project_id"], + "project_uri": fixture["project_uri"], + "cases": [], + } + for case in fixture["evaluations"]: + results = client.find(query=case["query"], target_uri=case["target_uri"], limit=limit) + report["cases"].append( + { + **case, + "hits": serialize_find_results(results), + } + ) + return report + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["embedded", "http"], default="embedded") + parser.add_argument("--workspace", help="Embedded-mode workspace path") + parser.add_argument("--url", help="HTTP server base URL") + parser.add_argument("--limit", type=int, default=3, help="Top-k retrieval hits per query") + parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSON path") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + fixture = load_fixture() + try: + client = build_client(args.mode, args.workspace, args.url) + report = run_cases(client, fixture, args.limit) + scored = score_report(report) + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps({"report": report, "score": scored}, indent=2), + encoding="utf-8", + ) + print(json.dumps(scored, indent=2)) + return 0 + except Exception as exc: + print(f"benchmark failed: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/openmontage/scorer.py b/benchmark/openmontage/scorer.py new file mode 100644 index 000000000..b11b8dd68 --- /dev/null +++ b/benchmark/openmontage/scorer.py @@ -0,0 +1,74 @@ +#!/usr/bin/env 
python3
+"""Deterministic scorer for the OpenMontage benchmark MVP."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
+def normalize_text(value: str) -> str:
+    return value.lower().strip()
+
+
+def score_case(case: dict) -> dict:
+    expected_uri_suffix = case["expected_uri_suffix"]
+    expected_keywords = [normalize_text(keyword) for keyword in case["expected_keywords"]]
+    hits = case.get("hits", [])
+
+    hit_uris = [hit.get("uri", "") for hit in hits]
+    combined_evidence = "\n".join(
+        [
+            hit.get("uri", "")
+            + "\n"
+            + hit.get("abstract", "")
+            + "\n"
+            + hit.get("overview", "")
+            for hit in hits
+        ]
+    ).lower()
+
+    uri_match = any(uri.endswith(expected_uri_suffix) for uri in hit_uris)
+    keyword_match = all(keyword in combined_evidence for keyword in expected_keywords)
+    passed = uri_match and keyword_match
+
+    return {
+        "id": case["id"],
+        "passed": passed,
+        "uri_match": uri_match,
+        "keyword_match": keyword_match,
+        "expected_uri_suffix": expected_uri_suffix,
+        "returned_uris": hit_uris,
+    }
+
+
+def score_report(report: dict) -> dict:
+    scored_cases = [score_case(case) for case in report["cases"]]
+    passed = sum(1 for case in scored_cases if case["passed"])
+    total = len(scored_cases)
+    return {
+        "project_id": report["project_id"],
+        "passed": passed,
+        "total": total,
+        "score": passed / total if total else 0.0,
+        "cases": scored_cases,
+    }
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("report", help="Path to run_eval.py output JSON")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    report_path = Path(args.report)
+    payload = json.loads(report_path.read_text(encoding="utf-8"))  # run_eval.py wraps the report under "report"
+    print(json.dumps(score_report(payload.get("report", payload)), indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmark/openmontage/test_smoke.py b/benchmark/openmontage/test_smoke.py
new file mode 100644
index 
000000000..e09ae596f --- /dev/null +++ b/benchmark/openmontage/test_smoke.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from scorer import score_case + + +ROOT = Path(__file__).resolve().parent + + +def test_fixture_has_expected_stage_and_eval_counts(): + fixture = json.loads((ROOT / "data" / "fixture.json").read_text(encoding="utf-8")) + assert len(fixture["artifacts"]) == 5 + assert len(fixture["evaluations"]) == 5 + assert {artifact["stage"] for artifact in fixture["artifacts"]} == { + "brief", + "script", + "scene_plan", + "asset_manifest", + "render_report", + } + + +def test_score_case_requires_uri_and_keywords(): + case = { + "id": "brief-provider-lock", + "expected_uri_suffix": "01-brief/brief.md", + "expected_keywords": ["Remotion", "ImageGen Alpha"], + "hits": [ + { + "uri": "viking://resources/openmontage/launch-video/01-brief/brief.md", + "abstract": "provider lock uses Remotion with ImageGen Alpha", + "overview": "", + } + ], + } + + result = score_case(case) + + assert result["passed"] is True + assert result["uri_match"] is True + assert result["keyword_match"] is True