Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
59a2ccf
Add Metal 4 M5 scaffold
ivanfioravanti May 10, 2026
04bc09a
Improve Metal MPP diagnostics and safe defaults
ivanfioravanti May 10, 2026
2239241
Tune Metal MPP defaults and thinking checkpoints
ivanfioravanti May 11, 2026
2fa510f
Improve Metal MPP prefill throughput
ivanfioravanti May 11, 2026
95762cf
Add low-power Metal MPP Q8 profile
ivanfioravanti May 12, 2026
5d549e9
Add M5 Max drift-patch macro plumbing and --dump-logits tooling
ivanfioravanti May 13, 2026
97d966e
Stabilize HC mixer sigmoid behind DS4_METAL_HC_STABLE (default on)
ivanfioravanti May 13, 2026
ef4b2cc
Unify RMSNorm scale formula behind DS4_METAL_NORM_RSQRT_DISABLE (defa…
ivanfioravanti May 13, 2026
4ac218f
Add diagnostic DS4_METAL_KV_RAW_F32 to skip FP16 KV round-trip
ivanfioravanti May 13, 2026
2562846
Add diagnostic DS4_METAL_ROPE_EXP2_LOG2 RoPE angle path
ivanfioravanti May 13, 2026
63a35db
Fix DS4_METAL_TENSOR_MATMUL_DISABLE host dispatch
ivanfioravanti May 13, 2026
b78ae9c
Default Metal Tensor Q8_0 matmul OFF on M5 Max
ivanfioravanti May 13, 2026
9f1380c
Add DS4_METAL_MATH_SAFE diagnostic to pin shader library to IEEE-754
ivanfioravanti May 13, 2026
5c6a460
Fix: F16 compressor Tensor matmul incorrectly coupled to Q8 default
ivanfioravanti May 13, 2026
779fa5a
Fix Q8 MPP kernel test: reference must take the legacy path
ivanfioravanti May 13, 2026
568ae1b
Update README to match new M5 Tensor defaults and refreshed drift num…
ivanfioravanti May 13, 2026
7455051
Establish Metal Tensor prefill drift baseline
ivanfioravanti May 14, 2026
374df30
Tune routed MoE Tensor default window
ivanfioravanti May 14, 2026
5814d0c
Tune routed MoE down Tensor window
ivanfioravanti May 14, 2026
38cce28
Tune routed MoE gate up Tensor window
ivanfioravanti May 14, 2026
941f7c4
Document latest Tensor prefill candidate results
ivanfioravanti May 14, 2026
7312587
Record experimental MoE layout drift check
ivanfioravanti May 14, 2026
650851b
Document route-specific MoE Tensor sweep
ivanfioravanti May 14, 2026
96aa8fc
Document dense Q8 Tensor prototype results
ivanfioravanti May 14, 2026
ad32365
Document attention output direct RHS check
ivanfioravanti May 14, 2026
eaba5b8
Document wide F16 Tensor rejection
ivanfioravanti May 14, 2026
3ecbf46
Document Tensor prefill baseline tooling
ivanfioravanti May 16, 2026
7d878db
Fix Tensor drift test naming and vector path
ivanfioravanti May 16, 2026
61345a1
Tune routed MoE Tensor default window
ivanfioravanti May 16, 2026
b84dd2d
Expand safe routed MoE Tensor window
ivanfioravanti May 17, 2026
56c8c80
Use private Metal scratch on M5
ivanfioravanti May 17, 2026
1da4fc7
Document eval token-count drift gate
ivanfioravanti May 17, 2026
068d8dd
Move routed MoE up Tensor default to layer 37
ivanfioravanti May 17, 2026
18e3190
Lower routed MoE Tensor default layers
ivanfioravanti May 18, 2026
b109d85
Add ds4-eval trace regrading
ivanfioravanti May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
/gguf/
*.o
*.dSYM/
__pycache__/
*.pyc
/misc/
.*.swp
.DS_Store
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
endif

test: ds4_test
test: ds4_test ds4-eval
./ds4-eval --self-test-extractors
./ds4_test

clean:
Expand Down
300 changes: 295 additions & 5 deletions README.md

Large diffs are not rendered by default.

715 changes: 427 additions & 288 deletions ds4.c

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions ds4.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ typedef enum {
DS4_BACKEND_CPU,
} ds4_backend;

/*
 * Tri-state selector carried in the engine options as `mpp_mode`.
 * The zero value is AUTO, so zero-initialized options select AUTO.
 * NOTE(review): AUTO presumably lets the engine pick the route itself,
 * while ON/OFF force it; confirm against the engine implementation.
 */
typedef enum {
    DS4_MPP_AUTO = 0, /* default / zero-initialized value */
    DS4_MPP_ON   = 1,
    DS4_MPP_OFF  = 2,
} ds4_mpp_mode;

typedef enum {
DS4_THINK_NONE,
DS4_THINK_HIGH,
Expand Down Expand Up @@ -71,6 +77,7 @@ typedef struct {
float directional_steering_ffn;
bool warm_weights;
bool quality;
ds4_mpp_mode mpp_mode;
} ds4_engine_options;

typedef void (*ds4_token_emit_fn)(void *ud, int token);
Expand All @@ -95,7 +102,9 @@ typedef struct {
int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt);
void ds4_engine_close(ds4_engine *e);
void ds4_engine_summary(ds4_engine *e);
int ds4_engine_vocab_size(ds4_engine *e);
const char *ds4_backend_name(ds4_backend backend);
const char *ds4_mpp_mode_name(ds4_mpp_mode mode);
bool ds4_think_mode_enabled(ds4_think_mode mode);
const char *ds4_think_mode_name(ds4_think_mode mode);
const char *ds4_think_max_prefix(void);
Expand Down Expand Up @@ -174,6 +183,7 @@ int ds4_session_argmax_excluding(ds4_session *s, int excluded_id);
int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p, float min_p, uint64_t *rng);
int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k);
int ds4_session_token_logprob(ds4_session *s, int token, ds4_token_score *out);
int ds4_session_copy_logits(ds4_session *s, float *out, int cap);
int ds4_session_eval(ds4_session *s, int token, char *err, size_t errlen);
int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token,
int max_tokens, int eos_token,
Expand Down
123 changes: 123 additions & 0 deletions ds4_bench.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ typedef struct {
int step_incr;
int gen_tokens;
double step_mul;
ds4_mpp_mode mpp_mode;
const char *dump_frontier_logits_dir;
bool warm_weights;
bool quality;
} bench_config;
Expand Down Expand Up @@ -67,6 +69,8 @@ static void usage(FILE *fp) {
" Select backend explicitly. Defaults to Metal on macOS, CUDA elsewhere.\n"
" -t, --threads N CPU helper threads.\n"
" --quality Prefer exact kernels where applicable.\n"
" -mt MODE, --mt MODE Metal Tensor route mode: auto, on, or off.\n"
" Legacy alias: --mpp MODE.\n"
" --warm-weights Touch mapped tensor pages before benchmarking.\n"
"\n"
"Sweep:\n"
Expand All @@ -79,6 +83,8 @@ static void usage(FILE *fp) {
"\n"
"Output:\n"
" --csv FILE Write CSV there instead of stdout.\n"
" --dump-frontier-logits-dir DIR\n"
" Write one full-logit JSON file per measured frontier. DIR must exist.\n"
" -h, --help Show this help.\n");
}

Expand Down Expand Up @@ -119,6 +125,15 @@ static ds4_backend parse_backend(const char *s, const char *opt) {
exit(2);
}

/*
 * Translate the textual mode given on the command line into a ds4_mpp_mode.
 *
 * s   - the option value; must be exactly "auto", "on", or "off".
 * opt - the option spelling the user typed, used only in the error message.
 *
 * Does not return on invalid input: prints a diagnostic to stderr and
 * terminates the process with exit status 2.
 */
static ds4_mpp_mode parse_mpp_mode(const char *s, const char *opt) {
    static const struct {
        const char *name;
        ds4_mpp_mode mode;
    } known[] = {
        { "auto", DS4_MPP_AUTO },
        { "on",   DS4_MPP_ON   },
        { "off",  DS4_MPP_OFF  },
    };

    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
        if (strcmp(s, known[k].name) == 0) {
            return known[k].mode;
        }
    }

    fprintf(stderr, "ds4-bench: invalid value for %s: %s\n", opt, s);
    fprintf(stderr, "ds4-bench: valid Metal Tensor modes are: auto, on, off\n");
    exit(2);
}

static ds4_backend default_backend(void) {
#ifdef DS4_NO_GPU
return DS4_BACKEND_CPU;
Expand Down Expand Up @@ -178,6 +193,7 @@ static bench_config parse_options(int argc, char **argv) {
.step_incr = 2048,
.gen_tokens = 128,
.step_mul = 1.0,
.mpp_mode = DS4_MPP_AUTO,
};

for (int i = 1; i < argc; i++) {
Expand Down Expand Up @@ -207,6 +223,8 @@ static bench_config parse_options(int argc, char **argv) {
c.gen_tokens = parse_int(need_arg(&i, argc, argv, arg), arg);
} else if (!strcmp(arg, "--csv")) {
c.csv_path = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "--dump-frontier-logits-dir")) {
c.dump_frontier_logits_dir = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "-t") || !strcmp(arg, "--threads")) {
c.threads = parse_int(need_arg(&i, argc, argv, arg), arg);
} else if (!strcmp(arg, "--backend")) {
Expand All @@ -219,6 +237,8 @@ static bench_config parse_options(int argc, char **argv) {
c.backend = DS4_BACKEND_CPU;
} else if (!strcmp(arg, "--quality")) {
c.quality = true;
} else if (!strcmp(arg, "-mt") || !strcmp(arg, "--mt") || !strcmp(arg, "--mpp")) {
c.mpp_mode = parse_mpp_mode(need_arg(&i, argc, argv, arg), arg);
} else if (!strcmp(arg, "--warm-weights")) {
c.warm_weights = true;
} else {
Expand Down Expand Up @@ -256,6 +276,103 @@ static bench_config parse_options(int argc, char **argv) {
return c;
}

/*
 * Write `s` to `fp` as a double-quoted JSON string.
 *
 * The quote, backslash, and the \b \f \n \r \t control characters use their
 * two-character escapes; any other byte below 0x20 is written as \u00XX.
 * All remaining bytes are copied through unchanged. A NULL `s` produces an
 * empty string ("").
 */
static void json_write_string(FILE *fp, const char *s) {
    const unsigned char *cur = (const unsigned char *)s;

    fputc('"', fp);
    while (cur != NULL && *cur != '\0') {
        const unsigned char ch = *cur++;
        const char *escape = NULL;

        if (ch == '"') {
            escape = "\\\"";
        } else if (ch == '\\') {
            escape = "\\\\";
        } else if (ch == '\b') {
            escape = "\\b";
        } else if (ch == '\f') {
            escape = "\\f";
        } else if (ch == '\n') {
            escape = "\\n";
        } else if (ch == '\r') {
            escape = "\\r";
        } else if (ch == '\t') {
            escape = "\\t";
        }

        if (escape != NULL) {
            fputs(escape, fp);
        } else if (ch < 0x20) {
            /* Remaining control characters have no short escape form. */
            fprintf(fp, "\\u%04x", (unsigned)ch);
        } else {
            fputc((char)ch, fp);
        }
    }
    fputc('"', fp);
}

/*
 * Dump the session's current logits as JSON to
 * "<cfg->dump_frontier_logits_dir>/frontier_NNNNNN.logits.json".
 *
 * cfg      - benchmark configuration; when dump_frontier_logits_dir is NULL
 *            the function does nothing and returns 0.
 * engine   - queried for the vocabulary size and routed quant bits.
 * session  - source of the logits and of the argmax token id.
 * frontier - written as both "prompt_tokens" and "frontier_tokens", and used
 *            in the file name.
 * previous - "prefill_tokens" is written as frontier - previous.
 *
 * Non-finite values (in the logits array and in "argmax_logit") are written
 * as JSON null, because JSON has no representation for NaN or infinity.
 *
 * Returns 0 on success (or when dumping is disabled) and 1 on any failure,
 * after printing a diagnostic to stderr. The directory is not created here.
 */
static int write_frontier_logits_json(
        const bench_config *cfg,
        ds4_engine *engine,
        ds4_session *session,
        int frontier,
        int previous) {
    if (!cfg->dump_frontier_logits_dir) return 0;

    int rc = 1;
    int argmax = -1;
    int path_len = 0;
    int write_failed = 0;
    float *logits = NULL;
    FILE *fp = NULL;
    char path[PATH_MAX];
    char argmax_logit[32];

    const int vocab = ds4_engine_vocab_size(engine);
    if (vocab <= 0) {
        /* A negative value would wrap to a huge size_t in the malloc below. */
        fprintf(stderr, "ds4-bench: invalid vocab size %d for frontier logits\n", vocab);
        return 1;
    }

    logits = malloc((size_t)vocab * sizeof(logits[0]));
    if (!logits) {
        fprintf(stderr, "ds4-bench: out of memory copying frontier logits\n");
        return 1;
    }
    if (ds4_session_copy_logits(session, logits, vocab) != vocab) {
        fprintf(stderr, "ds4-bench: failed to copy frontier logits at %d\n", frontier);
        goto done;
    }

    /* Validate before indexing: an out-of-range id would read past the buffer. */
    argmax = ds4_session_argmax(session);
    if (argmax < 0 || argmax >= vocab) {
        fprintf(stderr,
                "ds4-bench: argmax %d out of range for vocab %d at frontier %d\n",
                argmax, vocab, frontier);
        goto done;
    }
    if (isfinite(logits[argmax])) {
        snprintf(argmax_logit, sizeof(argmax_logit), "%.9g", logits[argmax]);
    } else {
        /* Keep the document valid JSON, matching the logits array below. */
        snprintf(argmax_logit, sizeof(argmax_logit), "null");
    }

    path_len = snprintf(path,
                        sizeof(path),
                        "%s/frontier_%06d.logits.json",
                        cfg->dump_frontier_logits_dir,
                        frontier);
    if (path_len <= 0 || (size_t)path_len >= sizeof(path)) {
        fprintf(stderr, "ds4-bench: frontier logits path is too long\n");
        goto done;
    }

    fp = fopen(path, "wb");
    if (!fp) {
        fprintf(stderr, "ds4-bench: failed to open %s: %s\n", path, strerror(errno));
        goto done;
    }

    fprintf(fp, "{\n  \"source\":\"ds4-bench\",\n  \"model\":");
    json_write_string(fp, cfg->model_path);
    fprintf(fp,
            ",\n  \"backend\":\"%s\",\n  \"mt\":\"%s\",\n  \"quality\":%s,\n"
            "  \"quant_bits\":%d,\n  \"prompt_tokens\":%d,\n"
            "  \"frontier_tokens\":%d,\n  \"prefill_tokens\":%d,\n"
            "  \"ctx\":%d,\n  \"vocab\":%d,\n"
            "  \"argmax_id\":%d,\n  \"argmax_logit\":%s,\n  \"logits\":[",
            ds4_backend_name(cfg->backend),
            ds4_mpp_mode_name(cfg->mpp_mode),
            cfg->quality ? "true" : "false",
            ds4_engine_routed_quant_bits(engine),
            frontier,
            frontier,
            frontier - previous,
            cfg->ctx_alloc,
            vocab,
            argmax,
            argmax_logit);
    for (int i = 0; i < vocab; i++) {
        if (i) fputc(',', fp);
        /* Eight values per line keeps the file diff-friendly. */
        if ((i % 8) == 0) fputs("\n    ", fp);
        if (isfinite(logits[i])) fprintf(fp, "%.9g", logits[i]);
        else fputs("null", fp);
    }
    fputs("\n  ]\n}\n", fp);

    /* Check the sticky error flag first; fclose only reports the final flush. */
    write_failed = ferror(fp);
    if (fclose(fp) != 0) {
        fprintf(stderr, "ds4-bench: failed to close %s\n", path);
        goto done;
    }
    if (write_failed) {
        fprintf(stderr, "ds4-bench: failed to write %s\n", path);
        goto done;
    }
    rc = 0;

done:
    free(logits);
    return rc;
}

static int next_frontier(const bench_config *c, int cur) {
if (cur >= c->ctx_max) return c->ctx_max;
int next;
Expand Down Expand Up @@ -293,6 +410,7 @@ int main(int argc, char **argv) {
.n_threads = cfg.threads,
.warm_weights = cfg.warm_weights,
.quality = cfg.quality,
.mpp_mode = cfg.mpp_mode,
};
ds4_engine *engine = NULL;
if (ds4_engine_open(&engine, &opt) != 0) return 1;
Expand Down Expand Up @@ -361,6 +479,11 @@ int main(int argc, char **argv) {
const double prefill_sec = prefill_t1 - prefill_t0;
const int prefill_tokens = frontier - previous;

if (write_frontier_logits_json(&cfg, engine, session, frontier, previous) != 0) {
rc = 1;
break;
}

if (ds4_session_save_snapshot(session, &snap, err, sizeof(err)) != 0) {
fprintf(stderr, "ds4-bench: snapshot at %d failed: %s\n", frontier, err);
rc = 1;
Expand Down
Loading