
Commit 63b9300

chore: bump sgl-kernel version to 0.3.18.post2 (#14244)
1 parent 21ec99b commit 63b9300

File tree: 4 files changed, +13 −19 lines


docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ ARG GRACE_BLACKWELL=0
 ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
 ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG BUILD_AND_DOWNLOAD_PARALLEL=8
-ARG SGL_KERNEL_VERSION=0.3.18.post1
+ARG SGL_KERNEL_VERSION=0.3.18.post2
 ARG SGL_VERSION=0.5.5.post3
 ARG USE_LATEST_SGLANG=0
 ARG GDRCOPY_VERSION=2.5.1

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ dependencies = [
   "scipy",
   "sentencepiece",
   "setproctitle",
-  "sgl-kernel==0.3.18.post1",
+  "sgl-kernel==0.3.18.post2",
   "soundfile==0.13.1",
   "tiktoken",
   "timm==1.0.16",

python/sglang/srt/entrypoints/engine.py

Lines changed: 1 addition & 1 deletion
@@ -737,7 +737,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.18.post1",
+            "0.3.18.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
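For context, this guard can be bypassed by setting SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK. A minimal sketch of what such a version guard does, using only stdlib importlib.metadata (the real assert_pkg_version lives in sglang's utils and may differ):

    # Hypothetical stand-in for assert_pkg_version; illustrative only.
    from importlib.metadata import version

    def check_sgl_kernel(expected: str = "0.3.18.post2") -> None:
        installed = version("sgl-kernel")  # raises PackageNotFoundError if absent
        if installed != expected:
            raise RuntimeError(
                f"sgl-kernel {installed} installed, {expected} required. "
                "Please reinstall the latest version with "
                "`pip install sgl-kernel --force-reinstall`"
            )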

python/sglang/srt/layers/moe/fused_moe_triton/fused_marlin_moe.py

Lines changed: 10 additions & 16 deletions
@@ -1,4 +1,3 @@
-import functools
 from typing import Optional
 
 import torch
@@ -71,10 +70,7 @@ def fused_marlin_moe(
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
     """
-    from sglang.srt.layers.moe.fused_moe_triton import (
-        moe_align_block_size,
-        try_get_optimal_moe_config,
-    )
+    from sglang.srt.layers.moe.fused_moe_triton import moe_align_block_size
 
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
@@ -98,17 +94,11 @@ def fused_marlin_moe(
     N = w2.shape[1] * 16
     topk = topk_ids.shape[1]
 
-    get_config_func = functools.partial(
-        try_get_optimal_moe_config,
-        w1.shape,
-        w2.shape,
-        topk_ids.shape[1],
-        None,
-        is_marlin=True,
-    )
-    config = get_config_func(M)
-
-    block_size_m = config["BLOCK_SIZE_M"]
+    # M block size selection logic
+    # TODO: tune this further for specific models
+    for block_size_m in [8, 16, 32, 48, 64]:
+        if M * topk / E / block_size_m < 0.9:
+            break
 
     if global_num_experts == -1:
         global_num_experts = E
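The new loop replaces the try_get_optimal_moe_config lookup with a simple occupancy heuristic: walk block sizes from small to large and stop at the first one where the average tokens-per-expert would fill less than ~90% of a block. A self-contained sketch of the same selection logic (variable names mirror the diff; only the standalone function wrapper is new):

    # Standalone restatement of the block-size heuristic above.
    # M: number of tokens, topk: experts per token, E: number of experts.
    def select_block_size_m(M: int, topk: int, E: int) -> int:
        for block_size_m in [8, 16, 32, 48, 64]:
            # M * topk / E is the average token count routed to each expert.
            if M * topk / E / block_size_m < 0.9:
                break
        return block_size_m

    # e.g. 128 tokens with top-2 routing over 64 experts averages 4 tokens
    # per expert, so the smallest block size (8) is chosen.
    assert select_block_size_m(128, 2, 64) == 8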
@@ -154,7 +144,9 @@ def fused_marlin_moe(
         hidden_states,
         intermediate_cache1,
         w1,
+        None,  # b_bias_or_none
         w1_scale,
+        None,  # global_scale_or_none
         w1_zeros,
         g_idx1,
         sort_indices1,
@@ -186,7 +178,9 @@ def fused_marlin_moe(
         intermediate_cache2,
         intermediate_cache3,
         w2,
+        None,  # b_bias_or_none
         w2_scale,
+        None,  # global_scale_or_none
         w2_zeros,
         g_idx2,
         sort_indices2,
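The two new None positional slots in both GEMM calls suggest the kernel entry point in sgl-kernel 0.3.18.post2 gained optional bias and global-scale parameters, placed after the weights and the group scales respectively; this commit forwards None for both. A hypothetical wrapper that names those slots (the kernel function itself sits outside the hunks, so marlin_gemm_fn below is a stand-in, not the library's API):

    # Illustrative only; marlin_gemm_fn stands in for the actual
    # sgl-kernel GEMM entry point, whose name is not shown in the diff.
    def call_marlin_gemm(marlin_gemm_fn, x, out_cache, w, w_scale, w_zeros,
                         g_idx, sort_indices, *rest,
                         bias=None, global_scale=None):
        return marlin_gemm_fn(
            x, out_cache, w,
            bias,          # b_bias_or_none
            w_scale,
            global_scale,  # global_scale_or_none
            w_zeros, g_idx, sort_indices, *rest,
        )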
