
Commit 63b9300

chore: bump sgl-kernel version to 0.3.18.post2 (#14244)
1 parent 21ec99b commit 63b9300

File tree: 4 files changed, +13 −19 lines


docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ ARG GRACE_BLACKWELL=0
 ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
 ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG BUILD_AND_DOWNLOAD_PARALLEL=8
-ARG SGL_KERNEL_VERSION=0.3.18.post1
+ARG SGL_KERNEL_VERSION=0.3.18.post2
 ARG SGL_VERSION=0.5.5.post3
 ARG USE_LATEST_SGLANG=0
 ARG GDRCOPY_VERSION=2.5.1

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ dependencies = [
   "scipy",
   "sentencepiece",
   "setproctitle",
-  "sgl-kernel==0.3.18.post1",
+  "sgl-kernel==0.3.18.post2",
   "soundfile==0.13.1",
   "tiktoken",
   "timm==1.0.16",

python/sglang/srt/entrypoints/engine.py

Lines changed: 1 addition & 1 deletion
@@ -737,7 +737,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.18.post1",
+            "0.3.18.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
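For context, this guard can be bypassed by setting SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK. A minimal sketch of what such a version guard does, using only stdlib importlib.metadata (the real assert_pkg_version lives in sglang's utils and may differ):

    # Hypothetical stand-in for assert_pkg_version; illustrative only.
    from importlib.metadata import version

    def check_sgl_kernel(expected: str = "0.3.18.post2") -> None:
        installed = version("sgl-kernel")  # raises PackageNotFoundError if absent
        if installed != expected:
            raise RuntimeError(
                f"sgl-kernel {installed} installed, {expected} required. "
                "Please reinstall the latest version with "
                "`pip install sgl-kernel --force-reinstall`"
            )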

python/sglang/srt/layers/moe/fused_moe_triton/fused_marlin_moe.py

Lines changed: 10 additions & 16 deletions
@@ -1,4 +1,3 @@
-import functools
 from typing import Optional
 
 import torch
@@ -71,10 +70,7 @@ def fused_marlin_moe(
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
     """
-    from sglang.srt.layers.moe.fused_moe_triton import (
-        moe_align_block_size,
-        try_get_optimal_moe_config,
-    )
+    from sglang.srt.layers.moe.fused_moe_triton import moe_align_block_size
 
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
@@ -98,17 +94,11 @@ def fused_marlin_moe(
     N = w2.shape[1] * 16
     topk = topk_ids.shape[1]
 
-    get_config_func = functools.partial(
-        try_get_optimal_moe_config,
-        w1.shape,
-        w2.shape,
-        topk_ids.shape[1],
-        None,
-        is_marlin=True,
-    )
-    config = get_config_func(M)
-
-    block_size_m = config["BLOCK_SIZE_M"]
+    # M block size selection logic
+    # TODO: tune this further for specific models
+    for block_size_m in [8, 16, 32, 48, 64]:
+        if M * topk / E / block_size_m < 0.9:
+            break
 
     if global_num_experts == -1:
         global_num_experts = E
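The new loop replaces the try_get_optimal_moe_config lookup with a simple occupancy heuristic: walk block sizes from small to large and stop at the first one where the average tokens-per-expert would fill less than ~90% of a block. A self-contained sketch of the same selection logic (variable names mirror the diff; only the standalone function wrapper is new):

    # Standalone restatement of the block-size heuristic above.
    # M: number of tokens, topk: experts per token, E: number of experts.
    def select_block_size_m(M: int, topk: int, E: int) -> int:
        for block_size_m in [8, 16, 32, 48, 64]:
            # M * topk / E is the average token count routed to each expert.
            if M * topk / E / block_size_m < 0.9:
                break
        return block_size_m

    # e.g. 128 tokens with top-2 routing over 64 experts averages 4 tokens
    # per expert, so the smallest block size (8) is chosen.
    assert select_block_size_m(128, 2, 64) == 8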
@@ -154,7 +144,9 @@ def fused_marlin_moe(
         hidden_states,
         intermediate_cache1,
         w1,
+        None,  # b_bias_or_none
         w1_scale,
+        None,  # global_scale_or_none
         w1_zeros,
         g_idx1,
         sort_indices1,
@@ -186,7 +178,9 @@ def fused_marlin_moe(
         intermediate_cache2,
         intermediate_cache3,
         w2,
+        None,  # b_bias_or_none
         w2_scale,
+        None,  # global_scale_or_none
         w2_zeros,
         g_idx2,
         sort_indices2,
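The two new None positional slots in both GEMM calls suggest the kernel entry point in sgl-kernel 0.3.18.post2 gained optional bias and global-scale parameters, placed after the weights and the group scales respectively; this commit forwards None for both. A hypothetical wrapper that names those slots (the kernel function itself sits outside the hunks, so marlin_gemm_fn below is a stand-in, not the library's API):

    # Illustrative only; marlin_gemm_fn stands in for the actual
    # sgl-kernel GEMM entry point, whose name is not shown in the diff.
    def call_marlin_gemm(marlin_gemm_fn, x, out_cache, w, w_scale, w_zeros,
                         g_idx, sort_indices, *rest,
                         bias=None, global_scale=None):
        return marlin_gemm_fn(
            x, out_cache, w,
            bias,          # b_bias_or_none
            w_scale,
            global_scale,  # global_scale_or_none
            w_zeros, g_idx, sort_indices, *rest,
        )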
