AlaRduTP · skhuang · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.github/workflows/pilot-linux.yml b/.github/workflows/pilot-linux.yml
@@ -0,0 +1,84 @@
+name: pilot-linux
+
+# Stage 2 PILOT: a small-scale, offline (--mock, no API) end-to-end run of the
+# whole SHS evaluation pipeline -- kofta-campaign launches a tiny matrix
+# (kofta/kshs/kshsng x the toy smoke target x 2 runs), post-processing extracts
+# edges/opts/cost, and kofta-stats turns those real artifacts into real (non
+# [\;]) LaTeX table rows. It validates the orchestration + extraction + stats
+# wiring before the expensive Stage 3 campaigns spend real compute + API budget.
+#
+# Like smoke-linux, this MUST run on a native x86_64 runner in an Ubuntu 20.04
+# container (glibc 2.31): KOFTA's __args_leak forkserver needs glibc <=2.33 AND
+# real (non-emulated) x86_64 -- QEMU-emulated x86_64 (e.g. Colima on Apple
+# Silicon) breaks the forkserver the same way glibc 2.34+ does, so the real
+# pilot can only be exercised here, not on a developer's arm64 laptop.
+
+# The push and pull_request path filters are identical; keep them in sync.
+on:
+  push:
+    paths:
+      - "**.c"
+      - "**.h"
+      - "llvm_mode/**"
+      - "Makefile"
+      - "docker/**"
+      - "kofta-shs"
+      - "kofta-campaign"
+      - "kofta-stats"
+      - "kofta-opts"
+      - "shs/**"
+      - ".github/workflows/pilot-linux.yml"
+  pull_request:
+    paths:
+      - "**.c"
+      - "**.h"
+      - "llvm_mode/**"
+      - "Makefile"
+      - "docker/**"
+      - "kofta-shs"
+      - "kofta-campaign"
+      - "kofta-stats"
+      - "kofta-opts"
+      - "shs/**"
+      - ".github/workflows/pilot-linux.yml"
+
+# Least privilege: this job only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  pilot:
+    runs-on: ubuntu-latest
+    # The pilot matrix is only 6 short runs (3 configs x 1 target x 2 runs), so
+    # cap the job: a hung build or forkserver must not occupy the runner for
+    # GitHub's 360-minute default job timeout.
+    timeout-minutes: 30
+    # glibc 2.31 + clang-12; see the header comment and smoke-linux.yml for why
+    # the OS version is load-bearing.
+    container: ubuntu:20.04
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install clang/llvm 12 + tooling
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            clang-12 llvm-12-dev make libc6-dev python3 ca-certificates
+
+      # Host-agnostic wiring test: drives shs.campaign with fake fuzzer/showmap/
+      # opts commands (no forkserver), so it would catch orchestration/extraction
+      # regressions even if the real run below were skipped. Uses the module's
+      # built-in runner so no pytest install is needed.
+      - name: Orchestrator wiring test (no forkserver)
+        run: python3 -m shs.tests.test_campaign
+
+      # Real end-to-end: builds in place and runs the actual kofta-campaign with
+      # the mock SHS client. Asserts kofta-stats produced a populated smoke row,
+      # so a green job means the whole real pipeline round-tripped.
+      - name: Run SHS pilot campaign (mock, no API)
+        run: |
+          PILOT_REPO="$GITHUB_WORKSPACE" \
+          PILOT_BUILD="$GITHUB_WORKSPACE" \
+            bash docker/run-pilot.sh
diff --git a/docker/run-pilot.sh b/docker/run-pilot.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# Stage 2 PILOT: a small-scale, offline (--mock, no API) dry run of the whole
+# SHS evaluation pipeline. It builds KOFTA, instruments the toy docker/smoke.c
+# target, lays out the directory shape a campaign spec expects, runs a tiny
+# kofta-campaign matrix (kofta/kshs/kshsng x 1 target x 2 runs, ~25s each), then
+# feeds the artifacts to kofta-stats and asserts it produced REAL (non-[\;])
+# table rows. The point is to shake out orchestration/extraction/stats wiring
+# bugs before spending real compute + API budget on the Stage 3 campaigns -- not
+# to measure fuzzing dynamics.
+#
+# Must run on a native x86_64 Linux host with glibc <=2.33 (Ubuntu 20.04): same
+# constraints as run-smoke.sh (KOFTA's __args_leak + LLVM-12 legacy passes).
+#
+# Overridable via env (defaults match the container layout):
+#   PILOT_REPO   source tree              (default /repo)
+#   PILOT_BUILD  writable build dir       (default /build; set == repo to build in place)
+# Both are canonicalized before use; PILOT_BUILD must not contain PILOT_REPO.
+#
+# PASS criterion: kofta-stats prints a coverage-facts line reporting at least
+# one compared target, AND the smoke coverage row carries real kofta and kshs
+# cells (not the [\;] placeholder) -- proving the campaign->postprocess->
+# loaders->tables round-trip works on real artifacts. SHS cost records are
+# printed at the end for inspection only; their absence does not fail the run.
+set -euo pipefail
+
+REPO="${PILOT_REPO:-/repo}"
+BUILD="${PILOT_BUILD:-/build}"
+
+# Canonicalize both paths before comparing them: the comparison below guards an
+# `rm -rf "$BUILD"`, so spellings like "/repo/" vs "/repo", a relative path, or
+# a symlink must not make the source tree look like a separate build dir. It
+# also makes $BUILD absolute, which the "$BUILD/..." paths used after `cd`
+# rely on. REPO must already exist (-e); BUILD may not exist yet (-m).
+REPO="$(realpath -e -- "$REPO")"
+BUILD="$(realpath -m -- "$BUILD")"
+
+if [ "$BUILD" != "$REPO" ]; then
+  # Refuse to wipe a build dir that contains the source tree (e.g. BUILD=/).
+  case "$REPO/" in
+    "${BUILD%/}"/*)
+      echo "==> FAIL: PILOT_BUILD ($BUILD) contains PILOT_REPO ($REPO); refusing to rm -rf it" >&2
+      exit 1
+      ;;
+  esac
+  echo "==> copying repo ($REPO) into writable $BUILD"
+  rm -rf "$BUILD"
+  cp -r "$REPO" "$BUILD"
+else
+  echo "==> building in place at $BUILD (no copy)"
+fi
+cd "$BUILD"
+
+echo "==> building afl-fuzz / afl-showmap"
+make clean >/dev/null
+AFL_NO_X86=1 make CC=clang-12
+
+echo "==> building llvm_mode instrumentation"
+make -C llvm_mode LLVM_CONFIG=llvm-config-12 CC=clang-12 CXX=clang++-12
+
+echo "==> laying out the campaign input tree (seeds/, opts/, bin/, srcmap.txt)"
+mkdir -p seeds/smoke opts bin
+printf 'hello\n' > seeds/smoke/seed
+rm -f opts/smoke.txt srcmap.txt
+KOFTA_OPTSAVE="$BUILD/opts/smoke.txt" \
+KOFTA_SRCMAP="$BUILD/srcmap.txt" \
+AFL_PATH="$BUILD" AFL_CC=clang-12 \
+  "$BUILD/afl-clang-fast" -g "$BUILD/docker/smoke.c" -o "$BUILD/bin/smoke"
+
+echo "==> opts discovered by the LLVM pass (-k file for kofta-fuzz):"
+cat opts/smoke.txt || true
+
+echo "==> running the pilot campaign (mock SHS, no API)"
+rm -rf pilot-campaign
+python3 ./kofta-campaign shs/campaign.pilot.json
+
+echo "----- campaign tree -----------------------------------------------"
+find pilot-campaign -maxdepth 4 -type f | sort || true
+
+echo "==> generating tables from the pilot artifacts (--targets smoke)"
+# kofta-stats defaults to the paper's eval binaries; the pilot target is "smoke",
+# so we must override the target list or every row is a placeholder.
+# NOTE: kofta-stats emits every table row + facts line via debug.psay(), which
+# writes to STDERR (not stdout). So we must fold stderr into the pipe (2>&1) or
+# pilot-stats.out captures nothing and every assertion below fails spuriously.
+python3 ./kofta-stats pilot-campaign --targets smoke 2>&1 | tee pilot-stats.out
+
+echo "==> asserting kofta-stats produced real (non-placeholder) rows"
+# The kshs vs kofta comparison (RQ5 facts) only prints when both configs have
+# coverage for the smoke target -- i.e. the whole campaign->edges->loaders chain
+# worked for at least kshs and kofta.
+if ! grep -Eq "targets compared[[:space:]]+= [1-9]" pilot-stats.out; then
+  echo "==> FAIL: no coverage facts -- the cov table is empty/placeholder" >&2
+  echo "----- edges.txt files found ---------------------------------------" >&2
+  find pilot-campaign -name edges.txt -print -exec cat {} \; >&2 || true
+  exit 1
+fi
+# Show the smoke cov row for the log. Its weifuzz/llmonly cells are legitimately
+# [\;] (the pilot doesn't run those configs); the kofta and kshs cells (the 4th
+# and 5th "&"-separated fields) must be real medians, not placeholders.
+smoke_cov="$(grep -E '^smoke' pilot-stats.out | head -1 || true)"
+echo "smoke cov row: $smoke_cov"
+ko_cell="$(printf '%s' "$smoke_cov" | awk -F'&' '{gsub(/ /,"",$4); print $4}')"
+ks_cell="$(printf '%s' "$smoke_cov" | awk -F'&' '{gsub(/ /,"",$5); print $5}')"
+if [ -z "$ko_cell" ] || [ "$ko_cell" = '[\;]' ] || [ -z "$ks_cell" ] || [ "$ks_cell" = '[\;]' ]; then
+  echo "==> FAIL: smoke cov row missing kofta/kshs edge medians (ko='$ko_cell' ks='$ks_cell')" >&2
+  exit 1
+fi
+
+echo "==> PASS: pilot pipeline produced real tables from real artifacts"
+echo "----- cost records ------------------------------------------------"
+find pilot-campaign/cost -name shs_cost.json -print -exec cat {} \; 2>/dev/null || \
+  echo "  (no cost records -- kshs/kshsng cost did not flush)"
diff --git a/kofta-opts b/kofta-opts
@@ -1,6 +1,12 @@
 #!python3
 # -*- coding: utf-8 -*-
 
+# Defer annotation evaluation so the list[...]/dict[...] type hints below are
+# not evaluated at definition time on Ubuntu 20.04's system python3 (3.8),
+# which predates PEP 585 subscription of builtin types. Without this,
+# loading the script raises "TypeError: 'type' object is not subscriptable".
+from __future__ import annotations
+
 import argparse
 import csv
 import xml.etree.ElementTree as ET

diff --git a/kofta-stats b/kofta-stats
@@ -14,6 +14,10 @@ compilable, incomplete table -- numbers are never invented.
   ./kofta-stats <campaign_root> --table cov     # one table only
 """
 
+# Defer annotation evaluation so the list[...] hints below are not evaluated at
+# definition time on Ubuntu 20.04's system python3 (3.8), which predates PEP 585.
+from __future__ import annotations
+
 import argparse
 from pathlib import Path
 
@@ -30,6 +34,10 @@ def parse_args() -> argparse.Namespace:
                    help="coverage metric for tab:cov (default: edges)")
     p.add_argument("--table", choices=["cov", "magic", "undoc", "cost", "all"],
                    default="all", help="which table to emit (default: all)")
+    p.add_argument("--targets", default=None,
+                   help="comma-separated target subset (default: the paper's "
+                        "eval binaries). Use this to score a pilot/custom target "
+                        "the hardcoded table lists don't include, e.g. --targets smoke")
     return p.parse_args()
 
 
@@ -80,8 +88,16 @@ def main() -> None:
         pok("metric=paths (plot_data paths_total -- a queue-size proxy)")
     psay("")
 
+    # Optional target override. The table emitters default to the paper's
+    # hardcoded eval binaries; a pilot or ad-hoc run uses a different target
+    # (e.g. "smoke"), which would otherwise render as all-placeholder rows.
+    targets = args.targets.split(",") if args.targets else None
+    cov_targets = targets or tables.COV_TARGETS
+    undoc_targets = targets or tables.UNDOC_TARGETS
+    cost_targets = targets or tables.COST_TARGETS
+
     if args.table in ("cov", "all"):
-        rows, f = tables.table_cov(root, args.metric)
+        rows, f = tables.table_cov(root, args.metric, targets=cov_targets)
         _emit("Table A: Edge Coverage (tab:cov)",
               "8.SemanticHintSynthesis_ENG.tex", rows)
         _facts_cov(f)
@@ -91,11 +107,11 @@ def main() -> None:
               "8.SemanticHintSynthesis_ENG.tex", rows)
         _facts_magic(f)
     if args.table in ("undoc", "all"):
-        rows, _ = tables.table_undoc(root)
+        rows, _ = tables.table_undoc(root, targets=undoc_targets)
         _emit("Table C: Undocumented Optargs (tab:undoc)",
               "8.SemanticHintSynthesis_ENG.tex", rows)
     if args.table in ("cost", "all"):
-        rows, _ = tables.table_cost(root)
+        rows, _ = tables.table_cost(root, targets=cost_targets)
         _emit("Table D: SHS Cost (tab:cost)",
               "8.SemanticHintSynthesis_ENG.tex", rows)
 

diff --git a/shs/campaign.pilot.json b/shs/campaign.pilot.json
@@ -0,0 +1,23 @@
+{
+  "_comment": "Stage 2 PILOT spec -- NOT the real evaluation. Validates the campaign->postprocess->kofta-stats pipeline end-to-end with the toy docker/smoke.c target and the offline --mock SHS client (no API, no cost). Only KOFTA-buildable configs (kofta/kshs/kshsng); weifuzz/llmonly need an external wei-fuzz binary not in this repo. Must run inside the ubuntu:20.04 container (glibc<=2.33) via docker/run-pilot.sh, which sets up bin/{target}, seeds/{target}, opts/{target}.txt, srcmap.txt under cwd. Durations are tiny on purpose -- the goal is a clean artifact->table round-trip, not real fuzzing dynamics.",
+
+  "root": "pilot-campaign",
+  "targets": ["smoke"],
+  "runs": 2,
+  "duration_s": 25,
+
+  "env": {
+    "AFL_NO_UI": "1",
+    "AFL_SKIP_CPUFREQ": "1",
+    "AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES": "1"
+  },
+
+  "commands": {
+    "kofta":  "timeout {duration} env KOFTA_SHS_BUDGET=0 ./kofta-fuzz -i seeds/{target} -o {out} -m none -t 5000 -k opts/{target}.txt -- bin/{target}",
+    "kshs":   "timeout {duration} env PYTHONPATH=. KOFTA_SRCMAP=srcmap.txt KOFTA_SHS=1 KOFTA_SHS_BIN=./kofta-shs KOFTA_SHS_CACHE={cache} KOFTA_SHS_COST={cost} ./kofta-fuzz -i seeds/{target} -o {out} -m none -t 5000 -k opts/{target}.txt -- bin/{target}",
+    "kshsng": "timeout {duration} env PYTHONPATH=. KOFTA_SRCMAP=srcmap.txt KOFTA_SHS=1 KOFTA_SHS_NOSLICE=1 KOFTA_SHS_BIN=./kofta-shs KOFTA_SHS_CACHE={cache} KOFTA_SHS_COST={cost} ./kofta-fuzz -i seeds/{target} -o {out} -m none -t 5000 -k opts/{target}.txt -- bin/{target}"
+  },
+
+  "showmap": "./afl-showmap -q -m none -o {output} -- bin/{target} {input}",
+  "opts_cmd": "python3 kofta-opts {out} -c -d {state} -q"
+}
diff --git a/shs/campaign.py b/shs/campaign.py
@@ -104,7 +104,12 @@ def launch_one(spec: Spec, config: str, target: str, i: int,
     t0 = time.time()
     proc = subprocess.run(cmd, shell=True, env=env)
     wall = time.time() - t0
-    if proc.returncode != 0:
+    # A fuzzer runs until its wall-clock budget; the command templates cap it
+    # with `timeout {duration}`, and GNU timeout reports 124 when it had to send
+    # the signal (the normal end of a campaign, not an error). AFL itself exits
+    # 0 on SIGTERM, but timeout's own status masks that. Treat both as success;
+    # anything else is a real failure (build/seed/forkserver problem).
+    if proc.returncode not in (0, 124):
         pwarn(f"run exited {proc.returncode}: {config}/{target}/run-{i:02d}")
         return False
 
@@ -124,6 +129,14 @@ def _postprocess(spec: Spec, config: str, target: str, run_dir: Path,
     if spec.opts_cmd:
         cmd = _subst(spec.opts_cmd, target=target, out=run_dir, state=run_dir)
         subprocess.run(cmd, shell=True)
+        # kofta-opts writes opts.csv into {out} (= the cov run dir), but the
+        # undoc table loader reads <root>/undoc/<target>/<config>/<run>/opts.csv.
+        # Mirror it there so tab:undoc is populated (same pattern as cost below).
+        src = run_dir / "opts.csv"
+        if src.is_file():
+            dst = spec.root / "undoc" / target / config / run_dir.name
+            dst.mkdir(parents=True, exist_ok=True)
+            (dst / "opts.csv").write_text(src.read_text())
     cost = run_dir / "shs_cost.json"
     if cost.is_file():
         try: