init

aolemila · aolemila · commit a21ab8c43346 · 2025-10-28T10:56:07.000+08:00
diff --git a/docs/evaluations/evals.md b/docs/evaluations/evals.md
@@ -0,0 +1,176 @@
+# Reproduce
+
+## Environment
+
+TPU v6e-4.
+
+## Commands
+
+## deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+
+### Commands
+
+**launch server**
+
+```bash
+# main: 8315531c7bb852b37934611deee051e22726a0ce
+JAX_COMPILATION_CACHE_DIR=/tmp/jit_cache python3 -u -m sgl_jax.launch_server \
+--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--trust-remote-code  \
+--tp-size=4 \
+--device=tpu \
+--mem-fraction-static=0.8 \
+--chunked-prefill-size=2048 \
+--download-dir=/tmp \
+--dtype=bfloat16 \
+--max-running-requests 256 \
+--skip-server-warmup \
+--page-size=128  \
+--disable-radix-cache
+```
+
+**eval**
+```bash
+# MATH-500 Pass@1
+# Note: 
+# 1. Sampling parameters refer to https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B.
+# 2. Quoting the model card: "For all our models, the maximum generation length is set to 32,768 tokens. For benchmarks requiring sampling, we use a temperature of 0.6, a top-p value of 0.95, and generate 64 responses per query to estimate pass@1."
+# 3. This evaluation run takes about 1 hour.
+
+evalscope eval  \
+--model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--api-url http://127.0.0.1:30000/v1/chat/completions \
+--api-key EMPTY \
+--eval-type service \
+--datasets math_500  \
+--eval-batch-size 64 \
+--dataset-args '{"math_500":{"metric_list":["Pass@1"], "few_shot_num": 4}}' \
+--generation-config '{"max_tokens": 32768, "temperature": 0.6, "top_p": 0.95, "n":64}' \
+--timeout 120000
+
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| Model                         | Dataset   | Metric   | Subset   |   Num |   Score | Cat.0   |
++===============================+===========+==========+==========+=======+=========+=========+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 1  |    43 |  0.8837 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 2  |    90 |  0.8222 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 3  |   105 |  0.7619 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 4  |   128 |  0.6328 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 5  |   134 |  0.4328 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | OVERALL  |   500 |  0.662  | -       |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+```
+
+```bash
+# main: 8315531c7bb852b37934611deee051e22726a0ce
+# Note: this evaluation run takes about 1.5 hours.
+JAX_COMPILATION_CACHE_DIR=/tmp/jit_cache python3 -u -m sgl_jax.launch_server \
+--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--trust-remote-code  \
+--tp-size=4 \
+--device=tpu \
+--mem-fraction-static=0.8 \
+--chunked-prefill-size=2048 \
+--download-dir=/tmp \
+--dtype=bfloat16 \
+--max-running-requests 256 \
+--skip-server-warmup \
+--page-size=128  \
+--disable-radix-cache \
+--precompile-token-paddings 2048 \
+--precompile-bs-paddings 256
+
+evalscope eval  \
+--model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--api-url http://127.0.0.1:30000/v1/chat/completions \
+--api-key EMPTY \
+--eval-type service \
+--datasets math_500  \
+--eval-batch-size 64 \
+--dataset-args '{"math_500":{"metric_list":["Pass@1"], "few_shot_num": 4}}' \
+--generation-config '{"max_tokens": 32768, "temperature": 0.6, "top_p": 0.95, "n":64}' \
+--timeout 120000
+
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| Model                         | Dataset   | Metric   | Subset   |   Num |   Score | Cat.0   |
++===============================+===========+==========+==========+=======+=========+=========+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 1  |    43 |  0.814  | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 2  |    90 |  0.8111 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 3  |   105 |  0.8476 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 4  |   128 |  0.7266 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 5  |   134 |  0.4328 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | OVERALL  |   500 |  0.696  | -       |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+
+```
+
+```bash
+# main: 4fa24afba321579fb17cc813883f8ea9614b4c36
+JAX_COMPILATION_CACHE_DIR=/tmp/jit_cache python3 -u -m sgl_jax.launch_server \
+--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--trust-remote-code  \
+--tp-size=4 \
+--device=tpu \
+--mem-fraction-static=0.8 \
+--chunked-prefill-size=2048 \
+--download-dir=/tmp \
+--dtype=bfloat16 \
+--max-running-requests 256 \
+--skip-server-warmup \
+--page-size=128  \
+--disable-radix-cache \
+--precompile-token-paddings 2048 \
+--precompile-bs-paddings 256
+
+evalscope eval  \
+--model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+--api-url http://127.0.0.1:30000/v1/chat/completions \
+--api-key EMPTY \
+--eval-type service \
+--datasets math_500  \
+--eval-batch-size 64 \
+--dataset-args '{"math_500":{"metric_list":["Pass@1"], "few_shot_num": 4}}' \
+--generation-config '{"max_tokens": 32768, "temperature": 0.6, "top_p": 0.95, "n":64}' \
+--timeout 120000
+
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| Model                         | Dataset   | Metric   | Subset   |   Num |   Score | Cat.0   |
++===============================+===========+==========+==========+=======+=========+=========+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 1  |    43 |  0.9302 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 2  |    90 |  0.9222 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 3  |   105 |  0.9143 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 4  |   128 |  0.8047 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | Level 5  |   134 |  0.7164 | default |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+| DeepSeek-R1-Distill-Qwen-1.5B | math_500  | Pass@1   | OVERALL  |   500 |  0.836  | -       |
++-------------------------------+-----------+----------+----------+-------+---------+---------+
+
+```
+
+
+
+
+
+
+
+# Temp: single-line launch and eval commands (eval restricted with `--limit 15`)
+
+```bash
+JAX_COMPILATION_CACHE_DIR=/tmp/jit_cache python3 -u -m sgl_jax.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --trust-remote-code  --tp-size=4 --device=tpu --mem-fraction-static=0.8 --chunked-prefill-size=2048 --download-dir=/tmp --dtype=bfloat16 --max-running-requests 256 --skip-server-warmup --page-size=128  --disable-radix-cache --precompile-token-paddings 2048 --precompile-bs-paddings 256
+
+
+evalscope eval  --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --api-url http://127.0.0.1:30000/v1/chat/completions --api-key EMPTY --eval-type service --datasets math_500  --eval-batch-size 64 --dataset-args '{"math_500":{"metric_list":["Pass@1"], "few_shot_num": 4}}' --generation-config '{"max_tokens": 32768, "temperature": 0.6, "top_p": 0.95, "n":64}' --timeout 120000 --limit 15
+```