Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ logs/
.tmp/
ov.conf
result/
!benchmark/openmontage/data/
!benchmark/openmontage/data/fixture.json
!benchmark/openmontage/result/
!benchmark/openmontage/result/.gitkeep

# Jupyter Notebook
.ipynb_checkpoints
Expand Down
105 changes: 105 additions & 0 deletions benchmark/openmontage/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# OpenMontage Benchmark MVP

This benchmark turns the [OpenMontage RFC](../../docs/en/about/03-roadmap.md) idea into a small,
repo-local fixture that stresses stage-scoped context handoff in OpenViking.

The benchmark is intentionally narrow:

- no dependency on the external OpenMontage repository
- one in-repo production fixture
- deterministic scoring
- no UI or long-running orchestration

## Why This Benchmark Matters

OpenMontage-like pipelines are not generic chat. They move across explicit production stages:

1. `brief`
2. `script`
3. `scene_plan`
4. `asset_manifest`
5. `render_report`

Each stage wants a different slice of context. The benchmark checks whether retrieval can stay
focused on the right artifact instead of dragging the full production history into every step.

## Layout

```text
benchmark/openmontage/
├── README.md
├── data/
│ └── fixture.json
├── import_to_ov.py
├── run_eval.py
├── scorer.py
├── test_smoke.py
└── result/
```

## Fixture Model

The fixture contains one synthetic production, `launch-video`, with five stage artifacts:

- `brief.md`
- `script.md`
- `scene_plan.md`
- `asset_manifest.md`
- `render_report.md`

It also contains a compact evaluation set. Each case defines:

- a query
- the target stage
- the artifact URI suffix that should be retrieved
- keywords that should appear in the returned evidence

## Quick Start

### 1. Import the fixture into OpenViking

Embedded mode:

```bash
python benchmark/openmontage/import_to_ov.py --mode embedded --workspace ./data/openmontage-workspace
```

HTTP mode:

```bash
python benchmark/openmontage/import_to_ov.py --mode http --url http://localhost:1933
```

### 2. Run the evaluation

```bash
python benchmark/openmontage/run_eval.py --mode embedded --workspace ./data/openmontage-workspace
```

This writes a JSON report to `benchmark/openmontage/result/openmontage_eval.json`.

### 3. Re-score an existing report (optional)

```bash
python benchmark/openmontage/scorer.py benchmark/openmontage/result/openmontage_eval.json
```

## Evaluation Strategy

The MVP deliberately avoids LLM judges. It scores retrieval deterministically:

- pass if the expected artifact URI suffix appears in the returned hits
- pass if all required keywords appear across the top retrieved evidence

That keeps this benchmark cheap, reproducible, and suitable for CI smoke usage.

## Expected Use

This benchmark is useful when you want to compare:

- stage-local retrieval quality
- hierarchy-aware directory layout decisions
- whether artifact handoff remains coherent as a project accumulates more files

It is not a full production workflow benchmark yet. The next step would be to add multiple
projects, more stage transitions, and latency/cost reporting.
78 changes: 78 additions & 0 deletions benchmark/openmontage/data/fixture.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"project_id": "launch-video",
"project_uri": "viking://resources/openmontage/launch-video",
"artifacts": [
{
"stage": "brief",
"relative_path": "01-brief/brief.md",
"title": "Launch Video Brief",
"content": "# Launch Video Brief\n\nGoal: produce a 45-second launch video for the Northstar workspace release.\n\nCreative intent:\n- calm, editorial motion\n- no handheld camera simulation\n- still frames are allowed only for the final CTA card\n\nProvider lock:\n- after approval, provider must stay on Remotion + ImageGen Alpha\n- sample-before-batch generation is mandatory before full asset production\n"
},
{
"stage": "script",
"relative_path": "02-script/script.md",
"title": "Launch Video Script",
"content": "# Script\n\nScene 1 introduces the pain of tab overload.\nScene 2 shows the workspace timeline.\nScene 3 ends with the CTA line: 'Ship context, not chaos.'\n\nThe narration must keep the phrase 'sample-before-batch' as a production rule reminder.\n"
},
{
"stage": "scene_plan",
"relative_path": "03-scene-plan/scene_plan.md",
"title": "Scene Plan",
"content": "# Scene Plan\n\nScene 1:\n- duration: 10s\n- motion required: yes\n- visual: browser tabs collapsing into a single timeline rail\n\nScene 2:\n- duration: 18s\n- motion required: yes\n- visual: cards snapping into a production board\n\nScene 3:\n- duration: 17s\n- motion required: no\n- visual: still CTA card with product lockup\n"
},
{
"stage": "asset_manifest",
"relative_path": "04-asset-manifest/asset_manifest.md",
"title": "Asset Manifest",
"content": "# Asset Manifest\n\nRequired assets:\n- 3 product UI stills\n- 2 gradient backplates\n- 1 voiceover track\n- 1 logo lockup SVG\n\nBatching rule:\n- generate one approved keyframe sample before producing the full image batch\n- keep provider fixed to ImageGen Alpha after approval\n"
},
{
"stage": "render_report",
"relative_path": "05-render-report/render_report.md",
"title": "Render Report",
"content": "# Render Report\n\nTerminal output: final Remotion render at 1920x1080, 24fps.\n\nChecks:\n- provider remained locked after approval\n- still image used only on the final CTA card\n- render completed without missing assets\n"
}
],
"evaluations": [
{
"id": "brief-provider-lock",
"stage": "brief",
"query": "Which provider stack stays locked after approval for the launch video?",
"target_uri": "viking://resources/openmontage/launch-video/01-brief",
"expected_uri_suffix": "01-brief/brief.md",
"expected_keywords": ["Remotion", "ImageGen Alpha", "provider"]
},
{
"id": "script-cta-line",
"stage": "script",
"query": "What is the final CTA line in the approved script?",
"target_uri": "viking://resources/openmontage/launch-video/02-script",
"expected_uri_suffix": "02-script/script.md",
"expected_keywords": ["Ship context, not chaos"]
},
{
"id": "scene-plan-still-card",
"stage": "scene_plan",
"query": "Which scene is allowed to stay still instead of requiring motion?",
"target_uri": "viking://resources/openmontage/launch-video/03-scene-plan",
"expected_uri_suffix": "03-scene-plan/scene_plan.md",
"expected_keywords": ["Scene 3", "motion required: no", "CTA"]
},
{
"id": "asset-manifest-sample-rule",
"stage": "asset_manifest",
"query": "What batching rule applies before generating the full image set?",
"target_uri": "viking://resources/openmontage/launch-video/04-asset-manifest",
"expected_uri_suffix": "04-asset-manifest/asset_manifest.md",
"expected_keywords": ["approved keyframe sample", "full image batch"]
},
{
"id": "render-terminal-output",
"stage": "render_report",
"query": "What is the terminal output deliverable for the pipeline?",
"target_uri": "viking://resources/openmontage/launch-video/05-render-report",
"expected_uri_suffix": "05-render-report/render_report.md",
"expected_keywords": ["final Remotion render", "1920x1080", "24fps"]
}
]
}
74 changes: 74 additions & 0 deletions benchmark/openmontage/import_to_ov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""Import the OpenMontage benchmark fixture into OpenViking."""

from __future__ import annotations

import argparse
import json
import shutil
import sys
import tempfile
from pathlib import Path


# Directory containing this script; the fixture JSON is resolved relative to it
# so the importer works regardless of the current working directory.
ROOT = Path(__file__).resolve().parent
FIXTURE_PATH = ROOT / "data" / "fixture.json"


def load_fixture() -> dict:
    """Read and parse the benchmark fixture JSON shipped beside this script."""
    raw = FIXTURE_PATH.read_text(encoding="utf-8")
    return json.loads(raw)


def build_fixture_tree(base_dir: Path, fixture: dict) -> Path:
    """Materialize the fixture's stage artifacts on disk.

    Each artifact's ``content`` is written under ``base_dir/<project_id>`` at
    its declared ``relative_path``, creating intermediate directories as
    needed. Returns the project directory.
    """
    project_dir = base_dir / fixture["project_id"]
    for entry in fixture["artifacts"]:
        target = project_dir / entry["relative_path"]
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(entry["content"], encoding="utf-8")
    return project_dir


def build_client(mode: str, workspace: str | None, url: str | None):
    """Construct an OpenViking client for the requested transport.

    ``http`` mode returns a synchronous HTTP client pointed at *url*;
    any other mode opens (and initializes) an embedded workspace at
    *workspace*. Both arguments fall back to local defaults when None.
    """
    import openviking as ov

    if mode == "http":
        return ov.SyncHTTPClient(url=url or "http://localhost:1933")
    embedded = ov.OpenViking(path=workspace or "./data/openmontage-workspace")
    embedded.initialize()
    return embedded


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the fixture importer."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--mode", choices=["embedded", "http"], default="embedded")
    cli.add_argument("--workspace", help="Embedded-mode workspace path")
    cli.add_argument("--url", help="HTTP server base URL")
    cli.add_argument(
        "--keep-temp",
        action="store_true",
        help="Keep the generated fixture directory instead of deleting it",
    )
    return cli.parse_args()


def main() -> int:
    """Build the fixture tree, import it into OpenViking, and print the result.

    Returns 0 on success and 1 on any failure. The temporary fixture tree
    is removed afterwards unless ``--keep-temp`` was given.
    """
    args = parse_args()
    fixture = load_fixture()
    staging = Path(tempfile.mkdtemp(prefix="openmontage_fixture_"))
    try:
        project_dir = build_fixture_tree(staging, fixture)
        client = build_client(args.mode, args.workspace, args.url)
        outcome = client.add_resource(path=str(project_dir), wait=True)
        payload = {"project_uri": fixture["project_uri"], "import_result": outcome}
        print(json.dumps(payload, indent=2))
        return 0
    except Exception as exc:
        print(f"import failed: {exc}", file=sys.stderr)
        return 1
    finally:
        # Clean up the staging tree even on failure, unless the user asked to keep it.
        if not args.keep_temp and staging.exists():
            shutil.rmtree(staging, ignore_errors=True)


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
1 change: 1 addition & 0 deletions benchmark/openmontage/result/.gitkeep
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

97 changes: 97 additions & 0 deletions benchmark/openmontage/run_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Run the OpenMontage retrieval benchmark against an OpenViking instance."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

from scorer import score_report


# Paths are resolved relative to this script so the benchmark runs from any CWD.
ROOT = Path(__file__).resolve().parent
FIXTURE_PATH = ROOT / "data" / "fixture.json"
DEFAULT_OUTPUT = ROOT / "result" / "openmontage_eval.json"


def load_fixture() -> dict:
    """Deserialize the benchmark fixture JSON bundled with this benchmark."""
    text = FIXTURE_PATH.read_text(encoding="utf-8")
    return json.loads(text)


def build_client(mode: str, workspace: str | None, url: str | None):
    """Create the OpenViking client used to answer benchmark queries.

    HTTP mode connects to a running server at *url*; otherwise an embedded
    workspace at *workspace* is opened and initialized. Missing arguments
    fall back to local defaults.
    """
    import openviking as ov

    if mode == "http":
        return ov.SyncHTTPClient(url=url or "http://localhost:1933")
    embedded = ov.OpenViking(path=workspace or "./data/openmontage-workspace")
    embedded.initialize()
    return embedded


def serialize_find_results(results) -> list[dict]:
    """Flatten a ``find()`` response into plain JSON-serializable dicts.

    Missing or falsy attributes degrade gracefully: ``uri`` defaults to
    ``""``, ``score`` to ``None``, and the text fields to ``""`` so the
    resulting report can always be dumped to JSON.

    Args:
        results: A find() response object; only its ``resources`` attribute
            (an iterable of hit objects) is read. Any object lacking it —
            or with a falsy value — yields an empty list.

    Returns:
        One dict per hit with keys ``uri``, ``score``, ``abstract``,
        and ``overview``.
    """
    resources = getattr(results, "resources", []) or []
    # A comprehension replaces the original manual append loop (same output).
    return [
        {
            "uri": getattr(item, "uri", ""),
            "score": getattr(item, "score", None),
            "abstract": getattr(item, "abstract", "") or "",
            "overview": getattr(item, "overview", "") or "",
        }
        for item in resources
    ]


def run_cases(client, fixture: dict, limit: int) -> dict:
    """Execute every evaluation query and collect the raw retrieval hits.

    Each case from ``fixture["evaluations"]`` is copied into the report with
    an added ``hits`` list holding the serialized top-*limit* find() results.
    """
    case_rows = []
    for case in fixture["evaluations"]:
        found = client.find(query=case["query"], target_uri=case["target_uri"], limit=limit)
        row = dict(case)
        row["hits"] = serialize_find_results(found)
        case_rows.append(row)
    return {
        "project_id": fixture["project_id"],
        "project_uri": fixture["project_uri"],
        "cases": case_rows,
    }


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark runner."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--mode", choices=["embedded", "http"], default="embedded")
    cli.add_argument("--workspace", help="Embedded-mode workspace path")
    cli.add_argument("--url", help="HTTP server base URL")
    cli.add_argument("--limit", type=int, default=3, help="Top-k retrieval hits per query")
    cli.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSON path")
    return cli.parse_args()


def main() -> int:
    """Run all benchmark cases, score them, and persist the combined report.

    Writes ``{"report": ..., "score": ...}`` to ``--output`` and echoes the
    score summary to stdout. Returns 0 on success and 1 on any failure.
    """
    args = parse_args()
    fixture = load_fixture()
    try:
        client = build_client(args.mode, args.workspace, args.url)
        report = run_cases(client, fixture, args.limit)
        scored = score_report(report)
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"report": report, "score": scored}, indent=2)
        destination.write_text(payload, encoding="utf-8")
        print(json.dumps(scored, indent=2))
        return 0
    except Exception as exc:
        print(f"benchmark failed: {exc}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
Loading
Loading