diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..1f20d76
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,66 @@
+name: Benchmarks
+on:
+  push:
+    tags: ['v*']
+  pull_request:
+    paths:
+      - 'src/**'
+      - 'benchmark/**'
+      - '.github/workflows/benchmark.yml'
+  workflow_dispatch:
+
+# One concurrency group per workflow and ref; only pull-request runs are cancelled when superseded.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+  benchmark:
+    name: Benchmark suite
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions:
+      # julia-actions/cache needs actions: write to delete superseded caches.
+      actions: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: '1.11'
+          arch: x64
+
+      - uses: julia-actions/cache@v2
+
+      - name: Instantiate benchmark environment
+        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
+
+      - name: Run benchmarks
+        env:
+          # Read by benchmark/benchmarks.jl (defaults to "local" when unset).
+          BENCHMARK_RUNNER: github-actions
+        run: |
+          julia --project=benchmark -t auto -e '
+            using TestItemRunner
+            TestItemRunner.run_tests("benchmark/")
+          '
+
+      # Branch names may contain "/" (and other characters) that
+      # upload-artifact rejects in artifact names, so sanitize the label.
+      - name: Compute artifact name
+        id: artifact
+        if: always()
+        env:
+          REF_LABEL: ${{ github.event.pull_request.number || github.ref_name }}
+        run: |
+          label="$(printf '%s' "$REF_LABEL" | tr -c 'A-Za-z0-9._-' '-')"
+          echo "name=benchmark-${label}-${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
+
+      - name: Upload benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ steps.artifact.outputs.name }}
+          path: benchmark/results/
+          retention-days: 90
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000..ca28c11
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1,2 @@
+results/
+Manifest.toml
diff --git a/benchmark/BenchmarkUtils.jl b/benchmark/BenchmarkUtils.jl
deleted file mode 100644
index 8b13789..0000000
--- a/benchmark/BenchmarkUtils.jl
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..b219215
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,20 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
+ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HarmoniqsBenchmarks = "f45d0b76-2d23-4568-9599-481e0da131db"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
+TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
+
+[sources] # where Pkg should obtain the packages listed below instead of a registry
+DirectTrajOpt = {path = ".."} # local checkout one directory above this benchmark/ environment
+HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl"} # NOTE(review): no rev is given, so the resolved commit is presumably not pinned — confirm
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..c0737c9
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,33 @@
+# DirectTrajOpt Benchmarks
+
+Benchmark suite for DirectTrajOpt.jl: evaluator micro-benchmarks, an Ipopt vs MadNLP solve comparison, and a memory scaling study.
+
+## Running locally
+
+```bash
+# From DirectTrajOpt.jl root
+julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
+
+julia --project=benchmark -t auto -e '
+    using TestItemRunner
+    TestItemRunner.run_tests("benchmark/")
+'
+```
+
+Artifacts are saved as JLD2 files in `benchmark/results/` (gitignored).
+
+## Benchmark suites
+
+- **Evaluator micro-benchmarks** — `BenchmarkTools.@benchmark` timings for each MOI eval function (objective, gradient, constraint, jacobian, hessian_lagrangian) on bilinear N=51
+- **Ipopt vs MadNLP** — full solve comparison on bilinear N=51
+- **Memory scaling study** — N ∈ {25, 51, 101} × state_dim ∈ {4, 8, 16}
+
+## Schema
+
+Results use `BenchmarkResult` / `MicroBenchmarkResult` from [HarmoniqsBenchmarks.jl](https://github.com/harmoniqs/HarmoniqsBenchmarks.jl).
+
+Load with:
+```julia
+using HarmoniqsBenchmarks
+results = load_results("benchmark/results/ipopt_vs_madnlp_N51_<sha>.jld2")
+```
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
new file mode 100644
index 0000000..28e6ee4
--- /dev/null
+++ b/benchmark/benchmarks.jl
@@ -0,0 +1,272 @@
+using TestItems
+
+@testitem "Evaluator micro-benchmarks: bilinear N=51" begin
+    using HarmoniqsBenchmarks, BenchmarkTools, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, MathOptInterface, Random, Dates, Printf
+    const MOI = MathOptInterface
+    # Times each MOI evaluator callback on a seeded bilinear problem and saves the results.
+    Random.seed!(42)
+    N = 51;
+    Δt = 0.1;
+    u_bound = 0.1;
+    ω = 0.1
+    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+    # Random initial guess (reproducible via the seed above) with fixed boundary conditions.
+    traj = NamedTrajectory(
+        (
+            x = 2rand(4, N) .- 1,
+            u = u_bound*(2rand(2, N) .- 1),
+            du = randn(2, N),
+            ddu = randn(2, N),
+            Δt = fill(Δt, N),
+        );
+        controls = (:ddu, :Δt),
+        timestep = :Δt,
+        bounds = (u = u_bound, Δt = (0.01, 0.5)),
+        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+        final = (u = zeros(2),),
+        goal = (x = [0.0, 1.0, 0.0, 0.0],),
+    )
+    integrators = [
+        BilinearIntegrator(G, :x, :u, traj),
+        DerivativeIntegrator(:u, :du, traj),
+        DerivativeIntegrator(:du, :ddu, traj),
+    ]
+    J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+    prob = DirectTrajOptProblem(traj, J, integrators)
+    # Build the evaluator once; every benchmark below reuses it and the same input vector.
+    evaluator, Z_vec = build_evaluator(prob)
+    dims = evaluator_dims(evaluator)
+    # Preallocated output buffers and multipliers handed to the MOI callbacks.
+    g = zeros(dims.n_constraints)
+    grad = zeros(dims.n_variables)
+    H = zeros(dims.n_hessian_entries)
+    Jac = zeros(dims.n_jacobian_entries)
+    sigma = 1.0
+    mu = ones(dims.n_constraints)
+    # One BenchmarkTools trial per callback; `$` interpolates the arguments into the benchmark.
+    benchmarks = Dict{Symbol,EvalBenchmark}(
+        :eval_objective =>
+            trial_to_eval_benchmark(@benchmark(MOI.eval_objective($evaluator, $Z_vec))),
+        :eval_gradient => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))
+        ),
+        :eval_constraint => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))
+        ),
+        :eval_jacobian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))
+        ),
+        :eval_hessian_lagrangian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))
+        ),
+    )
+    # The version is read from the loaded package so the record cannot go stale after a release.
+    result = MicroBenchmarkResult(
+        package = "DirectTrajOpt",
+        package_version = string(something(pkgversion(DirectTrajOpt), "unknown")),
+        commit = (
+            try
+                String(strip(read(`git rev-parse --short HEAD`, String)))
+            catch
+                "unknown" # git is unavailable or the command failed
+            end
+        ),
+        benchmark_name = "evaluator_micro_bilinear_N51",
+        N = N,
+        state_dim = 4,
+        control_dim = 2,
+        eval_benchmarks = benchmarks,
+        julia_version = string(VERSION),
+        timestamp = Dates.now(),
+        runner = get(ENV, "BENCHMARK_RUNNER", "local"),
+        n_threads = Threads.nthreads(),
+    )
+    # Print one summary row per callback, sorted by callback name.
+    println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
+    for (name, eb) in sort(collect(result.eval_benchmarks), by = first)
+        @printf(
+            "  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n",
+            name,
+            eb.median_ns,
+            eb.allocs,
+            eb.memory_bytes
+        )
+    end
+    # Save into the results/ directory that sits beside this file.
+    results_dir = joinpath(@__DIR__, "results")
+    save_micro_results(results_dir, result.benchmark_name, result)
+    println("  Saved to $results_dir/")
+end
+
+@testitem "Ipopt vs MadNLP: bilinear N=51" begin
+    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, Random, Dates
+    import MadNLP
+    # Solves the same seeded bilinear problem with Ipopt and MadNLP and saves both results.
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
+    ][1] # first match in reverse load order; errors if no module of that name is loaded
+    # Builds the bilinear problem; reseeding on every call makes repeated calls draw the same random data.
+    function make_bilinear_problem(; seed = 42)
+        Random.seed!(seed)
+        N = 51;
+        Δt = 0.1;
+        u_bound = 0.1;
+        ω = 0.1
+        Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+        Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+        Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+        G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+        traj = NamedTrajectory(
+            (
+                x = 2rand(4, N) .- 1,
+                u = u_bound*(2rand(2, N) .- 1),
+                du = randn(2, N),
+                ddu = randn(2, N),
+                Δt = fill(Δt, N),
+            );
+            controls = (:ddu, :Δt),
+            timestep = :Δt,
+            bounds = (u = u_bound, Δt = (0.01, 0.5)),
+            initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+            final = (u = zeros(2),),
+            goal = (x = [0.0, 1.0, 0.0, 0.0],),
+        )
+        integrators = [
+            BilinearIntegrator(G, :x, :u, traj),
+            DerivativeIntegrator(:u, :du, traj),
+            DerivativeIntegrator(:du, :ddu, traj),
+        ]
+        J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+    # Ipopt run on a freshly built problem (at most 200 iterations).
+    prob_ipopt = make_bilinear_problem()
+    result_ipopt = benchmark_solve!(
+        prob_ipopt,
+        IpoptOptions(max_iter = 200, print_level = 0);
+        benchmark_name = "bilinear_N51_ipopt",
+    )
+    # MadNLP run on a second, separately built problem with the same seed and iteration cap.
+    prob_madnlp = make_bilinear_problem()
+    result_madnlp = benchmark_solve!(
+        prob_madnlp,
+        MadNLPSolverExt.MadNLPOptions(max_iter = 200, print_level = 1);
+        benchmark_name = "bilinear_N51_madnlp",
+    )
+    # Report wall time (rounded to 3 digits) and allocated bytes ÷ 1024 for each solver.
+    println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
+    println(
+        "  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc",
+    )
+    println(
+        "  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc",
+    )
+    # Save both results together into the results/ directory beside this file.
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
+end
+
+@testitem "Memory scaling: N and state_dim sweep" begin
+    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, Random, Dates, Printf
+    import MadNLP
+    # Sweeps N × state_dim, solving each seeded random problem with Ipopt and MadNLP.
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
+    ][1] # first match in reverse load order; errors if no module of that name is loaded
+    # Builds a seeded random bilinear problem with N timesteps and the given state dimension.
+    function make_scaled_problem(; N, state_dim, n_controls = 2, seed = 42)
+        Random.seed!(seed)
+        G_drift = sparse(randn(state_dim, state_dim))
+        G_drives = [sparse(randn(state_dim, state_dim)) for _ = 1:n_controls]
+        G(u) = G_drift + sum(u[i] * G_drives[i] for i = 1:n_controls)
+        # Start at the first unit vector; the goal is the second (or the first when state_dim == 1).
+        x_init = zeros(state_dim);
+        x_init[1] = 1.0
+        x_goal = zeros(state_dim);
+        x_goal[min(2, state_dim)] = 1.0
+
+        traj = NamedTrajectory(
+            (
+                x = randn(state_dim, N),
+                u = 0.1*randn(n_controls, N),
+                du = randn(n_controls, N),
+                Δt = fill(0.1, N),
+            );
+            controls = (:du, :Δt),
+            timestep = :Δt,
+            bounds = (u = 1.0, Δt = (0.01, 0.5)),
+            initial = (x = x_init, u = zeros(n_controls)),
+            final = (u = zeros(n_controls),),
+            goal = (x = x_goal,),
+        )
+        integrators =
+            [BilinearIntegrator(G, :x, :u, traj), DerivativeIntegrator(:u, :du, traj)]
+        J = QuadraticRegularizer(:u, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+    # Sweep grid: every N paired with every state dimension; two results are pushed per pair.
+    N_values = [25, 51, 101]
+    dim_values = [4, 8, 16]
+    results = BenchmarkResult[]
+    # Table header followed by a separator row.
+    println("\n=== Memory Scaling Study ===")
+    @printf(
+        "  %5s | %5s | %12s | %12s | %12s | %12s\n",
+        "N",
+        "dim",
+        "Ipopt (s)",
+        "Ipopt (KB)",
+        "MadNLP (s)",
+        "MadNLP (KB)"
+    )
+    @printf(
+        "  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n",
+        "-"^5,
+        "-"^5,
+        "-"^12,
+        "-"^12,
+        "-"^12,
+        "-"^12
+    )
+
+    for N in N_values
+        for dim in dim_values
+            prob = make_scaled_problem(; N = N, state_dim = dim)
+            r_ipopt = benchmark_solve!(
+                prob,
+                IpoptOptions(max_iter = 50, print_level = 0);
+                benchmark_name = "scaling_N$(N)_d$(dim)_ipopt",
+            )
+            push!(results, r_ipopt)
+            # Rebuild with the same arguments and seed so MadNLP gets a freshly constructed problem.
+            prob = make_scaled_problem(; N = N, state_dim = dim)
+            r_madnlp = benchmark_solve!(
+                prob,
+                MadNLPSolverExt.MadNLPOptions(max_iter = 50, print_level = 1);
+                benchmark_name = "scaling_N$(N)_d$(dim)_madnlp",
+            )
+            push!(results, r_madnlp)
+            # One table row per (N, dim): wall time in seconds and allocated bytes ÷ 1024.
+            @printf(
+                "  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
+                N,
+                dim,
+                r_ipopt.wall_time_s,
+                r_ipopt.total_allocations_bytes ÷ 1024,
+                r_madnlp.wall_time_s,
+                r_madnlp.total_allocations_bytes ÷ 1024
+            )
+        end
+    end
+    # Save every collected result into the results/ directory beside this file.
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "memory_scaling", results)
+    println("\n  Saved $(length(results)) results to $results_dir/")
+end
diff --git a/test/compare_solvers.jl b/test/compare_solvers.jl
index 77ac9a1..7dca12c 100644
--- a/test/compare_solvers.jl
+++ b/test/compare_solvers.jl
@@ -7,16 +7,12 @@ using SparseArrays
 using NamedTrajectories
 using DirectTrajOpt
 
-const MadNLPSolverExt = [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
-
-function get_seeded_trajectory(seed;
-    N = 10,
-    Δt = 0.1,
-    u_bound = 0.1,
-    ω = 0.1,
-)
+const MadNLPSolverExt =
+    [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
+
+function get_seeded_trajectory(seed; N = 10, Δt = 0.1, u_bound = 0.1, ω = 0.1)
     Random.seed!(seed)
-    
+
     Gx = sparse(Float64[
         0 0 0 1;
         0 0 1 0;
@@ -59,7 +55,7 @@ function get_seeded_trajectory(seed;
         );
         controls = (:ddu, :Δt),
         timestep = :Δt,
-        bounds = (u = (-u_bound, u_bound), Δt = (1., 1.)), # timestep variability is a major source of error as in the "multiple comparisons problem" so we make them constant here
+        bounds = (u = (-u_bound, u_bound), Δt = (1.0, 1.0)), # timestep variability is a major source of error as in the "multiple comparisons problem" so we make them constant here
         initial = (x = x_init, u = zeros(2)),
         final = (u = zeros(2),),
         goal = (x = x_goal,),
@@ -144,7 +140,7 @@ function get_solver_comparison(seed)
     return err, (ti, tm)
 end
 
-wins = Dict(:ipopt => 0, :madnlp => 0,)
+wins = Dict(:ipopt => 0, :madnlp => 0)
 for seed = 0:99
     err, (ti, tm) = get_solver_comparison(seed)
     (err < 1e-3) || exit(1)
diff --git a/test/runtests.jl b/test/runtests.jl
index 9f95075..d57ccc9 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,5 +2,5 @@ using DirectTrajOpt
 using TestItemRunner
 
 
-# Run all testitem tests in package
-@run_package_tests
+# Exclude testitems under <package root>/benchmark/ (separate project environment); compare the first path component relative to the root so "benchmark" elsewhere in the path does not skip tests
+@run_package_tests filter=ti -> (first(splitpath(relpath(ti.filename, dirname(@__DIR__)))) != "benchmark")