harmoniqs · jack-champagne · Apr 18, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -33,10 +33,13 @@ Manifest.toml
 docs/Manifest.toml
 
 # Project specific ignores below
-# generated example artifacts 
+# generated example artifacts
 /examples/**/plots/
 /examples/**/trajectories/
 
+# benchmark output artifacts
+/benchmark/results/
+
 # external pkgs and configs
 pardiso.lic
 /.CondaPkg/

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -0,0 +1,17 @@
+[deps]
+DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
+ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HarmoniqsBenchmarks = "f45d0b76-2d23-4568-9599-481e0da131db"
+Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[sources]
+DirectTrajOpt = {path = ".."}
+# TODO: drop rev pin once HarmoniqsBenchmarks.jl#1 (feat/alloc-profile) merges
+HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl", rev = "feat/alloc-profile"}
diff --git a/benchmark/alloc_profile.jl b/benchmark/alloc_profile.jl
@@ -0,0 +1,133 @@
+# =============================================================================
+# Ipopt + MadNLP allocation profile — bilinear toy problem
+#
+# Runs `solve!` once per solver under Profile.Allocs via benchmark_memory!
+# from HarmoniqsBenchmarks.jl and saves the sampled trace to
+# benchmark/results/allocs/ for hot-path triage. The Piccolissimo alloc-
+# profile testitem covers the Altissimo side; this script is the sibling
+# for the in-tree NLP solvers.
+#
+# Uses the same `bilinear_dynamics_and_trajectory` fixture the main test
+# suite uses, so the profiled problem is deterministic and small (N=10,
+# 4-state × 2-control) — we care about allocation *patterns*, not absolute
+# counts on a production-size problem.
+#
+# Run:
+#   julia --project=benchmark benchmark/alloc_profile.jl
+# =============================================================================
+
+using Random
+using NamedTrajectories
+using SparseArrays
+using LinearAlgebra
+using DirectTrajOpt
+using MathOptInterface
+const MOI = MathOptInterface
+using Ipopt
+using MadNLP
+using HarmoniqsBenchmarks
+
+# Resolve the MadNLPSolverExt extension module so MadNLPOptions is accessible
+# (matches the pattern used in Piccolissimo.jl/benchmark/benchmarks.jl).
+# The most recently loaded module with that name is used. If no such module
+# is loaded we stop with an explicit message rather than the bare BoundsError
+# that indexing an empty result with `[1]` would raise.
+const MadNLPSolverExt = let loaded = Base.loaded_modules_order
+    idx = findlast(m -> nameof(m) === :MadNLPSolverExt, loaded)
+    idx === nothing && error(
+        "MadNLPSolverExt is not loaded; make sure `using MadNLP` ran and the " *
+        "solver extension was activated before this point.",
+    )
+    loaded[idx]
+end
+
+# Pull in the bilinear fixture without duplicating it.
+# NOTE(review): `include` evaluates test/test_utils.jl into this module, so any
+# top-level code in that file runs here as well — confirm that is intended.
+include(joinpath(@__DIR__, "..", "test", "test_utils.jl"))
+
+# Seed the global RNG once, after the fixture file is loaded and before any
+# problem is built.
+Random.seed!(42)
+
+# Saved profiles go under results/allocs/ next to this script; the directory
+# is created eagerly as soon as this file is loaded.
+const RESULTS_DIR = joinpath(@__DIR__, "results", "allocs")
+mkpath(RESULTS_DIR)
+
+# ----------------------------------------------------------------------------
+# Problem builder — takes the shared bilinear fixture and attaches a terminal
+# goal-distance cost on :x plus a QuadraticRegularizer on :u, so that Ipopt
+# and MadNLP are both handed an NLP built the same way.
+#
+# Keyword `N` is forwarded to the fixture. Returns `(problem, trajectory)`.
+# ----------------------------------------------------------------------------
+function build_problem(; N = 10)
+    fixture_G, trajectory = bilinear_dynamics_and_trajectory(; N = N)
+
+    dynamics = [
+        BilinearIntegrator(fixture_G, :x, :u, trajectory),
+        DerivativeIntegrator(:u, :du, trajectory),
+        DerivativeIntegrator(:du, :ddu, trajectory),
+    ]
+
+    # Squared distance of the terminal state from the trajectory's goal.
+    goal_cost = TerminalObjective(x -> norm(x - trajectory.goal.x)^2, :x, trajectory)
+    objective = goal_cost + QuadraticRegularizer(:u, trajectory, 1.0)
+
+    return DirectTrajOptProblem(trajectory, objective, dynamics), trajectory
+end
+
+# ----------------------------------------------------------------------------
+# Profile one solver.
+#
+# Two problems are built from scratch: the first is solved purely as a JIT
+# warmup and then discarded, the second is solved under the allocation
+# profiler, so compile-time allocations stay out of the recorded trace.
+#
+# The global RNG is re-seeded before each build. If the fixture draws random
+# numbers, this makes the warmup and profiled problems — and the problems
+# handed to different solvers across calls — start from identical data instead
+# of from successive draws of a stream seeded once at load time.
+#
+# Keywords:
+#   solver_name  — label used in log lines, the benchmark name and metadata
+#   options_ctor — zero-argument callable returning a fresh options object;
+#                  called once for the warmup and once for the profiled solve
+#   N            — forwarded to `build_problem`
+#   sample_rate  — forwarded to `benchmark_memory!`
+#   seed         — RNG seed applied before each problem build
+#
+# Returns `(profile, path)`: the object produced by `benchmark_memory!` and
+# the value returned by `save_alloc_profile`.
+# ----------------------------------------------------------------------------
+function profile_solver(; solver_name, options_ctor, N = 10, sample_rate = 1.0, seed = 42)
+    Random.seed!(seed)
+    prob_warmup, traj = build_problem(; N = N)
+    Random.seed!(seed)
+    prob_profiled, _ = build_problem(; N = N)
+
+    state_dim = traj.dims[:x]
+    # Total width of all control components, excluding the timestep component.
+    ctrl_dim  = sum(traj.dims[cn] for cn in traj.control_names if cn != traj.timestep; init = 0)
+
+    println("\n[$(solver_name)] JIT warmup on throwaway problem copy...")
+    DirectTrajOpt.solve!(prob_warmup; options = options_ctor())
+
+    println("[$(solver_name)] Profiling allocations (sample_rate=$(sample_rate))...")
+    # `warmup = false`: the warmup solve above already ran on a separate problem.
+    profile = benchmark_memory!(
+        package        = "DirectTrajOpt",
+        solver         = solver_name,
+        benchmark_name = "bilinear_N$(N)_$(lowercase(solver_name))",
+        N              = traj.N,
+        state_dim      = state_dim,
+        control_dim    = ctrl_dim,
+        sample_rate    = sample_rate,
+        warmup         = false,
+        runner         = "local",
+    ) do
+        DirectTrajOpt.solve!(prob_profiled; options = options_ctor())
+    end
+
+    mb = profile.total_bytes / (1024 * 1024)
+    println("[$(solver_name)] captured $(profile.total_count) samples, $(round(mb; digits=2)) MB total")
+
+    path = save_alloc_profile(RESULTS_DIR, profile.benchmark_name, profile)
+    println("[$(solver_name)] saved to $(path)")
+    return profile, path
+end
+
+# ----------------------------------------------------------------------------
+# Entry points
+#
+# The default sample_rate is 0.01: Ipopt/MadNLP produce so many fine-grained
+# allocations that recording every one (sample_rate=1.0) blows past any
+# reasonable wall-time budget — an N=10 bilinear toy can hang for 15+ minutes.
+# Sampling 1% still yields statistically useful traces for hot-path triage.
+#
+# Profiles Ipopt first, then MadNLP, with the same N and sample_rate, prints a
+# short summary and returns `(ipopt = ..., madnlp = ...)`.
+# ----------------------------------------------------------------------------
+function main(; N = 10, sample_rate = 0.01)
+    shared = (; N = N, sample_rate = sample_rate)
+
+    # Fresh options are constructed on every call so runs do not share state.
+    ipopt_opts() = IpoptOptions(max_iter = 50, print_level = 0)
+    madnlp_opts() =
+        MadNLPSolverExt.MadNLPOptions(max_iter = 50, print_level = Int(MadNLP.ERROR))
+
+    ipopt_profile, ipopt_path =
+        profile_solver(; solver_name = "Ipopt", options_ctor = ipopt_opts, shared...)
+    madnlp_profile, madnlp_path =
+        profile_solver(; solver_name = "MadNLP", options_ctor = madnlp_opts, shared...)
+
+    println("\nDone.")
+    println("  Ipopt  profile: $(ipopt_path)  ($(ipopt_profile.total_count) samples)")
+    println("  MadNLP profile: $(madnlp_path)  ($(madnlp_profile.total_count) samples)")
+    return (ipopt = ipopt_profile, madnlp = madnlp_profile)
+end
+
+# Run `main` only when this file is the script handed to `julia`, not when it
+# is `include`d from elsewhere.
+(abspath(PROGRAM_FILE) == @__FILE__()) && main()
diff --git a/benchmark/analyze_allocs.jl b/benchmark/analyze_allocs.jl
@@ -0,0 +1,136 @@
+using HarmoniqsBenchmarks
+using Printf
+
+# Default location of saved profiles: results/allocs/ next to this script.
+const DEFAULT_RESULTS_DIR = joinpath(@__DIR__, "results", "allocs")
+
+# Directory to scan: the first command-line argument when one is given,
+# otherwise the default above.
+function results_dir()
+    return get(ARGS, 1, DEFAULT_RESULTS_DIR)
+end
+
+# Noise filters — frames / types from Profile.Allocs itself or the Julia
+# toplevel/runtime that do not tell us anything about user-code hotpaths.
+#
+# Each entry is a plain substring, not a regex: a frame counts as noise when
+# any entry occurs anywhere in the frame's string (see `_is_noise_frame`).
+# Whitespace and punctuation inside an entry are therefore significant, e.g.
+# the leading space in " at Base.jl:" and the trailing colon in "gc.c:".
+# NOTE(review): the entries look tuned to how frames are rendered by the
+# profile serializer — confirm against a saved profile before editing.
+const NOISE_FRAME_PATTERNS = [
+    "Profile.Allocs",
+    "gc-alloc-profiler",
+    "gc-stock.c",
+    "gc.c:",
+    "jl_apply",
+    "jl_toplevel_",
+    "ijl_toplevel_",
+    "jl_interpret_toplevel_thunk",
+    "jl_repl_entrypoint",
+    "interpreter.c",
+    "_include(",
+    "include_string(",
+    "loading.jl",
+    "client.jl",
+    "_start() at sys.so",
+    "ip:0x",
+    "_start at ",
+    " at Base.jl:",
+    "true_main at jlapi.c",
+    "__libc_start_main",
+    "loader_exe.c",
+    "jl_system_image_data",
+    "macro expansion at Allocs.jl",
+    "boot.jl:",
+    "jl_f__call_latest",
+]
+
+# Substrings identifying frames that belong to the benchmark harness rather
+# than to the code being profiled. They are kept separate from the noise list
+# because `top_frames` can optionally keep them (`drop_wrappers = false`).
+const WRAPPER_FRAME_PATTERNS = [
+    "alloc_profile.jl",
+    "benchmark_memory!",
+    "HarmoniqsBenchmarks",
+]
+
+# Substrings of allocated type names to discard; samples whose `type_name`
+# contains one of these are skipped by every report.
+const NOISE_TYPE_PATTERNS = [
+    "Profile.Allocs",
+]
+
+# True when frame string `f` contains any entry of NOISE_FRAME_PATTERNS.
+function _is_noise_frame(f)
+    return any(occursin(f), NOISE_FRAME_PATTERNS)
+end
+
+# True when type name `t` contains any entry of NOISE_TYPE_PATTERNS.
+function _is_noise_type(t)
+    return any(occursin(t), NOISE_TYPE_PATTERNS)
+end
+
+"""
+    _first_user_frame(stack)
+
+Return the first frame of `stack` that is neither runtime noise
+(`_is_noise_frame`) nor part of the benchmark harness (`_is_wrapper_frame`).
+
+If every frame is filtered out, fall back to the last frame of `stack`, or to
+the placeholder `"<empty>"` when `stack` has no frames at all.
+"""
+function _first_user_frame(stack)
+    # Reuse the shared wrapper predicate instead of repeating its pattern scan.
+    idx = findfirst(f -> !_is_noise_frame(f) && !_is_wrapper_frame(f), stack)
+    idx === nothing || return stack[idx]
+    return isempty(stack) ? "<empty>" : stack[end]
+end
+
+# True when frame string `f` contains any entry of WRAPPER_FRAME_PATTERNS.
+function _is_wrapper_frame(f)
+    return any(occursin(f), WRAPPER_FRAME_PATTERNS)
+end
+
+"""
+    top_frames(profile; k = 25, scale_to_total = true, drop_wrappers = true)
+
+Print the `k` stack frames with the most attributed bytes.
+
+Each sample's `size_bytes` is credited to every distinct non-noise frame on its
+stack, so the figures are inclusive: one allocation contributes to several
+rows, and the rows do not sum to the profile total.
+
+# Keywords
+- `k`: maximum number of rows to print.
+- `scale_to_total`: multiply byte totals by `1 / profile.sample_rate` to
+  estimate the unsampled total; sample counts are printed unscaled.
+- `drop_wrappers`: also skip frames matched by `_is_wrapper_frame`.
+"""
+function top_frames(profile; k = 25, scale_to_total = true, drop_wrappers = true)
+    by_frame = Dict{String, Tuple{Int, Int}}()
+    for s in profile.samples
+        _is_noise_type(s.type_name) && continue
+        # `unique` stops a frame that recurs in one stack (recursion) from being
+        # credited with the same allocation more than once.
+        for frame in unique(s.stacktrace)
+            _is_noise_frame(frame) && continue
+            drop_wrappers && _is_wrapper_frame(frame) && continue
+            cnt, bytes = get(by_frame, frame, (0, 0))
+            by_frame[frame] = (cnt + 1, bytes + s.size_bytes)
+        end
+    end
+    ranked = first(sort!(collect(by_frame); by = x -> x[2][2], rev = true), max(k, 0))
+    scale = scale_to_total ? (1 / profile.sample_rate) : 1.0
+    # `Int(scale)` throws InexactError when the reciprocal is fractional (e.g.
+    # sample_rate = 0.03), so only integral factors are printed as integers.
+    scale_label = isinteger(scale) ? string(Int(scale)) : @sprintf("%.4g", scale)
+    println("\nTop $(length(ranked)) user frames by allocated bytes (scaled ×$(scale_label)):")
+    println(rpad("  bytes", 14), rpad("samples", 10), "frame")
+    for (frame, (cnt, bytes)) in ranked
+        @printf "  %-12s %-8d %s\n" _fmt_bytes(bytes * scale) cnt _truncate(frame, 140)
+    end
+end
+
+"""
+    top_leaf_callsites(profile; k = 25, scale_to_total = true)
+
+Print the `k` call sites with the most allocated bytes, where each sample is
+attributed to exactly one frame: the one chosen by `_first_user_frame`.
+
+# Keywords
+- `k`: maximum number of rows to print.
+- `scale_to_total`: multiply byte totals by `1 / profile.sample_rate` to
+  estimate the unsampled total; sample counts are printed unscaled.
+"""
+function top_leaf_callsites(profile; k = 25, scale_to_total = true)
+    by_leaf = Dict{String, Tuple{Int, Int}}()
+    for s in profile.samples
+        _is_noise_type(s.type_name) && continue
+        leaf = _first_user_frame(s.stacktrace)
+        cnt, bytes = get(by_leaf, leaf, (0, 0))
+        by_leaf[leaf] = (cnt + 1, bytes + s.size_bytes)
+    end
+    ranked = first(sort!(collect(by_leaf); by = x -> x[2][2], rev = true), max(k, 0))
+    scale = scale_to_total ? (1 / profile.sample_rate) : 1.0
+    # `Int(scale)` throws InexactError when the reciprocal is fractional (e.g.
+    # sample_rate = 0.03), so only integral factors are printed as integers.
+    scale_label = isinteger(scale) ? string(Int(scale)) : @sprintf("%.4g", scale)
+    println("\nTop $(length(ranked)) leaf call sites by allocated bytes (scaled ×$(scale_label)):")
+    println(rpad("  bytes", 14), rpad("samples", 10), "leaf")
+    for (leaf, (cnt, bytes)) in ranked
+        @printf "  %-12s %-8d %s\n" _fmt_bytes(bytes * scale) cnt _truncate(leaf, 140)
+    end
+end
+
+"""
+    top_types(profile; k = 15, scale_to_total = true)
+
+Print the `k` allocated types with the most bytes, grouping samples by their
+`type_name` and skipping types matched by `_is_noise_type`.
+
+# Keywords
+- `k`: maximum number of rows to print.
+- `scale_to_total`: multiply byte totals by `1 / profile.sample_rate` to
+  estimate the unsampled total; sample counts are printed unscaled.
+"""
+function top_types(profile; k = 15, scale_to_total = true)
+    by_type = Dict{String, Tuple{Int, Int}}()
+    for s in profile.samples
+        _is_noise_type(s.type_name) && continue
+        cnt, bytes = get(by_type, s.type_name, (0, 0))
+        by_type[s.type_name] = (cnt + 1, bytes + s.size_bytes)
+    end
+    ranked = first(sort!(collect(by_type); by = x -> x[2][2], rev = true), max(k, 0))
+    scale = scale_to_total ? (1 / profile.sample_rate) : 1.0
+    # `Int(scale)` throws InexactError when the reciprocal is fractional (e.g.
+    # sample_rate = 0.03), so only integral factors are printed as integers.
+    scale_label = isinteger(scale) ? string(Int(scale)) : @sprintf("%.4g", scale)
+    println("\nTop $(length(ranked)) allocated types (scaled ×$(scale_label)):")
+    println(rpad("  bytes", 14), rpad("samples", 10), "type")
+    for (t, (cnt, bytes)) in ranked
+        @printf "  %-12s %-8d %s\n" _fmt_bytes(bytes * scale) cnt _truncate(t, 120)
+    end
+end
+
+"""
+    _fmt_bytes(b)
+
+Format the byte count `b` as a short string using 1024-based units: two
+decimals for KB, MB and GB, and a rounded whole number below 1 KB.
+"""
+function _fmt_bytes(b)
+    kb, mb, gb = 1 << 10, 1 << 20, 1 << 30
+    # Largest unit first, so each value is reported in the biggest unit it reaches.
+    b >= gb && return @sprintf("%.2f GB", b / gb)
+    b >= mb && return @sprintf("%.2f MB", b / mb)
+    b >= kb && return @sprintf("%.2f KB", b / kb)
+    return @sprintf("%d B", round(Int, b))
+end
+
+# Shorten `s` to at most `n` characters; when it is cut, the last kept
+# position is taken by a single "…" so the result is exactly `n` long.
+function _truncate(s, n)
+    length(s) > n || return s
+    return string(first(s, n - 1), "…")
+end
+
+"""
+    main()
+
+Print an allocation report for every `*_allocs.jld2` file found directly in
+`results_dir()`, in sorted path order. For each file a one-line header is
+followed by the top types, top leaf call sites and top frames.
+
+Prints a message and returns without error when the directory is missing or
+holds no matching files.
+"""
+function main()
+    dir = results_dir()
+    # `readdir` throws on a missing directory; report that case explicitly,
+    # since it is what a fresh checkout looks like before any profile is saved.
+    if !isdir(dir)
+        println("no such directory: $dir (generate profiles with alloc_profile.jl or pass a directory as the first argument)")
+        return
+    end
+    files = sort(filter(f -> endswith(f, "_allocs.jld2"), readdir(dir; join = true)))
+    isempty(files) && (println("no *_allocs.jld2 files under $dir"); return)
+    for path in files
+        profile = load_alloc_profile(path)
+        println("=" ^ 100)
+        println(basename(path))
+        @printf "  solver=%s  N=%d  sample_rate=%g  samples=%d  total=%s\n" profile.solver profile.N profile.sample_rate profile.total_count _fmt_bytes(profile.total_bytes)
+        top_types(profile; k = 10)
+        top_leaf_callsites(profile; k = 20)
+        top_frames(profile; k = 20)
+    end
+end
+
+main()