
Commit 6ae8ca6

Merge branch 'main' into achatter/fp8_scaled_mm
2 parents: 2b7d4a1 + 1bb6c78

File tree: 13 files changed, +570 −76 lines

.github/workflows/pr-test-xpu.yml
Lines changed: 4 additions & 4 deletions

@@ -27,15 +27,15 @@ jobs:
           docker build \
             --build-arg SG_LANG_KERNEL_BRANCH=${{ github.head_ref }} \
             --build-arg SG_LANG_KERNEL_REPO=${{ github.event.pull_request.head.repo.clone_url }} \
-            --no-cache --progress=plain -f Dockerfile.xpu_kernel -t xpu_sglang:pvc .
+            --no-cache --progress=plain -f Dockerfile.xpu_kernel -t xpu_sglang:kernel .

       - name: Run container
         run: |
           docker run -dt \
             --device /dev/dri/ \
             --name ci_sglang_xpu \
             -e HF_TOKEN=$(cat ~/huggingface_token.txt) \
-            xpu_sglang:pvc
+            xpu_sglang:kernel

       - name: Install Dependency
         timeout-minutes: 20
@@ -49,13 +49,13 @@ jobs:
         timeout-minutes: 20
         run: |
           docker exec -w /root/sglang ci_sglang_xpu \
-            /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/tests && python3 -m pytest -v -s test_awq_dequant.py test_topk_softmax.py test_flash_attention.py"
+            /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/tests && python3 run_suite.py --suite per-commit"

       - name: Run Sglang Kernel Benchmarks
         timeout-minutes: 20
         run: |
           docker exec -w /root/sglang ci_sglang_xpu \
-            /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py"
+            /bin/bash -c "cd /root/sglang/sgl-kernel-xpu/benchmark && python3 bench_flash_attn.py && python3 bench_moe_topk_softmax.py"

       - name: Run E2E Bfloat16 tests
         timeout-minutes: 20
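The per-commit test step now calls run_suite.py --suite per-commit instead of listing pytest files inline, so the workflow no longer has to change when tests are added to the suite. Below is a minimal sketch of what such a suite driver might look like, assuming it simply maps suite names to pytest file lists; the suite contents, option name, and structure are assumptions for illustration only, not the actual sgl-kernel-xpu/tests/run_suite.py:

```python
# Hypothetical sketch of a suite driver (not the actual run_suite.py).
import argparse
import subprocess
import sys

# Assumed mapping: suite name -> pytest files run for that suite.
SUITES = {
    "per-commit": [
        "test_awq_dequant.py",
        "test_topk_softmax.py",
        "test_flash_attention.py",
    ],
}


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--suite", choices=sorted(SUITES), required=True)
    args = parser.parse_args()
    # Equivalent to running the listed files with pytest, as the old
    # workflow step did inline.
    cmd = [sys.executable, "-m", "pytest", "-v", "-s", *SUITES[args.suite]]
    return subprocess.run(cmd).returncode


if __name__ == "__main__":
    sys.exit(main())
```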

CMakeLists.txt
Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable headers only mode in cutla
 FetchContent_Declare(
   repo-cutlass-sycl
   GIT_REPOSITORY https://github.com/intel/sycl-tla.git
-  GIT_TAG d2292f0071125c32f92e8963f8dfba8ec3e491f7
+  GIT_TAG 5a0b7a8b7024175f223f4a47535650f317bcbbf3
   GIT_SHALLOW OFF
 )

Dockerfile.xpu_kernel
Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ RUN --mount=type=secret,id=github_token \
     cd sgl-kernel-xpu && \
     pip install -v . &&\
     # Install required packages for sglang workloads
-    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops matplotlib pandas --root-user-action=ignore && \
+    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops matplotlib pandas --root-user-action=ignore aiohttp && \
     conda install libsqlite=3.48.0 -y && \
     echo ". /miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /root/" >> /root/.bashrc;

benchmark/bench_flash_attn.py
Lines changed: 8 additions & 10 deletions

@@ -12,7 +12,7 @@ def flash_attn_baseline(
     causal,
     window_size,
     softmax_scale,
-    softmax_sink,
+    sinks,
     cache_seqlens,
     page_table,
     cu_seqlens_q,
@@ -24,7 +24,7 @@ def flash_attn_baseline(
         k_cache,
         v_cache,
         causal=causal,
-        softmax_sink=softmax_sink,
+        sinks=sinks,
         window_size=window_size,
         softmax_scale=softmax_scale,
         page_table=page_table,
@@ -39,7 +39,7 @@ def flash_attn_baseline(
 # Benchmark configurations
 causal = [True, False]
 local = [True, False]
-use_softmax_sink = [True, False]
+use_sinks = [True, False]
 batch_size = [1, 16]
 q_seq_length_range = [1, 512, 1024]
 kv_seq_length_range = [512, 1024, 2048, 4096, 8192, 16384]
@@ -50,7 +50,7 @@ def flash_attn_baseline(
     product(
         causal,
         local,
-        use_softmax_sink,
+        use_sinks,
         batch_size,
         q_seq_length_range,
         kv_seq_length_range,
@@ -65,7 +65,7 @@ def flash_attn_baseline(
         x_names=[
             "causal",
             "local",
-            "use_softmax_sink",
+            "use_sinks",
             "batch_size",
             "q_seq_length",
             "kv_seq_length",
@@ -84,7 +84,7 @@ def flash_attn_baseline(
 def benchmark(
     causal,
     local,
-    use_softmax_sink,
+    use_sinks,
     batch_size,
     q_seq_length,
     kv_seq_length,
@@ -127,9 +127,7 @@ def benchmark(
     max_seqlen_q = q_seq_length
     window_size = (-1, -1) if not local else torch.randint(0, kv_seq_length, (2,))

-    softmax_sink = (
-        torch.randn(num_heads, device=device, dtype=dtype) if use_softmax_sink else None
-    )
+    sinks = torch.randn(num_heads, device=device, dtype=dtype) if use_sinks else None

     softmax_scale = 1.0 / (head_dim**0.5)

@@ -144,7 +142,7 @@ def benchmark(
         causal=causal,
         window_size=window_size,
         softmax_scale=softmax_scale,
-        softmax_sink=softmax_sink,
+        sinks=sinks,
         cache_seqlens=cache_seqlens,
         page_table=page_table,
         cu_seqlens_q=cu_seqlens_q,
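This file only tracks the kernel API rename from softmax_sink to sinks, a per-head tensor of sink logits. As background, here is a minimal PyTorch sketch of how per-head attention sinks are commonly folded into the softmax normalization; this is an assumption about the kernel's semantics based on the usual formulation, not code from this commit:

```python
import torch


def attention_with_sinks(q, k, v, sinks, softmax_scale):
    # q: [heads, q_len, d], k/v: [heads, kv_len, d], sinks: [heads]
    # Common formulation: the sink logit joins the softmax denominator
    # but contributes no value, draining attention mass from real tokens.
    scores = torch.einsum("hqd,hkd->hqk", q, k) * softmax_scale
    sink_logits = sinks.view(-1, 1, 1).expand(-1, scores.shape[1], 1)
    logits = torch.cat([scores, sink_logits], dim=-1)        # [heads, q_len, kv_len + 1]
    probs = torch.softmax(logits.float(), dim=-1)[..., :-1]  # drop the sink column
    return torch.einsum("hqk,hkd->hqd", probs, v.float()).to(q.dtype)
```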

benchmark/bench_moe_topk_softmax.py
Lines changed: 90 additions & 46 deletions

@@ -3,6 +3,7 @@
 import torch
 import triton
 from sgl_kernel import topk_softmax
+from utils import get_model_config, parse_args


 def vllm_topk_softmax(gating_output, topk):
@@ -23,7 +24,35 @@ def vllm_topk_softmax(gating_output, topk):
     return topk_weights, topk_indices


-def sglang_topk_softmax(gating_output, topk):
+def navtive_topk_softmax(
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    num_tokens, num_experts = gating_output.shape
+
+    import torch.nn.functional as F
+
+    topk_weights = torch.empty(
+        (num_tokens, topk), device=gating_output.device, dtype=torch.float32
+    )
+    topk_indices = torch.empty(
+        (num_tokens, topk), dtype=torch.int32, device=gating_output.device
+    )
+    topk_weights = F.softmax(gating_output.float(), dim=-1)
+    topk_weights, topk_indices = torch.topk(topk_weights, topk, dim=-1)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+    return topk_weights, topk_indices
+
+
+def sglang_topk_softmax(
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
     num_tokens, num_experts = gating_output.shape

     topk_weights = torch.empty(
@@ -37,18 +66,18 @@ def sglang_topk_softmax(gating_output, topk):
     )

     topk_softmax(
-        topk_weights=topk_weights,
-        topk_ids=topk_indices,
-        token_expert_indices=token_expert_indices,
-        gating_output=gating_output,
+        topk_weights,
+        topk_indices,
+        gating_output,
+        renormalize=renormalize,
     )

     return topk_weights, topk_indices


 def calculate_diff(num_tokens, num_experts, topk):
     gating_output = torch.randn(
-        (num_tokens, num_experts), device="cuda", dtype=torch.float32
+        (num_tokens, num_experts), device=gating_output.device, dtype=torch.float32
     )
     weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk)
     weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk)
@@ -67,52 +96,67 @@ def calculate_diff(num_tokens, num_experts, topk):
     )


-num_tokens_range = [128, 512, 1024, 2048, 4096, 8192, 16384, 32768]
-num_experts_range = [32, 64, 128, 256, 12, 512]
-topk_range = [1, 2, 4, 8]
+def get_benchmark(device="xpu"):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["num_tokens", "num_experts", "topk", "dtype", "renormalize"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["sglang", "native"],
+            line_names=["SGLang", "native"],
+            styles=[("blue", "-"), ("green", "-")],
+            ylabel="Latency (us)",
+            plot_name="topk-softmax-performance",
+            args={},
+        )
+    )
+    def benchmark(num_tokens, num_experts, topk, dtype, renormalize, provider):

-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+        gating_output = torch.randn(
+            (num_tokens, num_experts), device=device, dtype=dtype
+        )

+        if provider == "sglang" or provider == "sglang1":
+            fn = lambda: sglang_topk_softmax(gating_output, topk, renormalize)
+        elif provider == "native":
+            fn = lambda: navtive_topk_softmax(gating_output, topk, renormalize)

-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["sglang", "vllm"],
-        line_names=["SGLang", "VLLM"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="Latency (us)",
-        plot_name="topk-softmax-performance",
-        args={},
-    )
-)
-def benchmark(num_tokens, num_experts, topk, provider):
+        quantiles = [0.5, 0.2, 0.8]
+        ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)

-    gating_output = torch.randn(
-        (num_tokens, num_experts), device="cuda", dtype=torch.float32
-    )
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark

-    if provider == "vllm" or provider == "vllm1":
-        fn = lambda: vllm_topk_softmax(gating_output, topk)
-    elif provider == "sglang" or provider == "sglang1":
-        fn = lambda: sglang_topk_softmax(gating_output, topk)

-    quantiles = [0.5, 0.2, 0.8]
-    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+if __name__ == "__main__":
+    # Run correctness test on small configs if not using a real model
+    args = parse_args()
+    params = get_model_config(args)
+
+    sweep_params = {
+        "num_tokens": args.num_tokens,
+        "num_experts": params["num_experts"] or [64],
+        "top_k": params["top_k"] or [2, 4],
+        "dtype": [torch.bfloat16],
+        "renormalize": [False],
+    }
+
+    keys = sweep_params.keys()
+    configs = list(itertools.product(*sweep_params.values()))
+    print(f"Testing {len(configs)} configurations...")
+    for config in configs:
+        num_tokens, num_experts, topk, dtype, renormalize = config
+        print(
+            f"Config: num_tokens={num_tokens}, num_experts={num_experts}, topk={topk}, dtype={dtype}, renormalize={renormalize}"
+        )

-    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+        # calculate_diff(num_tokens, num_experts, topk)

+    global benchmark_configs
+    benchmark_configs = configs

-if __name__ == "__main__":
-    configs = [
-        (20, 256, 4),
-        (20, 256, 8),
-        (20, 12, 4),
-        (20, 12, 1),
-        (20, 512, 4),
-        (20, 512, 1),
-    ]
-    for num_tokens, num_experts, topk in configs:
-        calculate_diff(num_tokens, num_experts, topk)
-    benchmark.run(print_data=True)
+    # Run benchmark
+    print("Starting performance benchmark...")
+    benchmark = get_benchmark()
+    benchmark.run(print_data=True, show_plots=False, save_path=".")
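The benchmark now compares the sgl_kernel topk_softmax against a native PyTorch reference (navtive_topk_softmax, named as in the commit) instead of the vLLM kernel, and sweeps dtype and renormalize from the model config. For reference, a small standalone sketch of the same math with illustrative sizes, showing the expected output shapes:

```python
import torch
import torch.nn.functional as F

# Illustrative sizes; the benchmark sweeps these from the model config.
num_tokens, num_experts, topk = 4, 8, 2
gating_output = torch.randn(num_tokens, num_experts)

# Same steps as the native reference in the benchmark:
# softmax over experts, top-k selection, optional renormalization.
weights = F.softmax(gating_output.float(), dim=-1)
topk_weights, topk_indices = torch.topk(weights, topk, dim=-1)
renormalize = True
if renormalize:
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

print(topk_weights.shape, topk_indices.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```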
