Commit f82c54d

ch-wanssssnow authored and committed
[hotfix] fix mixtral with tensor-level compressed-tensor quantization (#8721)
1 parent b59d39e commit f82c54d

File tree

1 file changed: +3 -2 lines changed

python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 3 additions & 2 deletions
@@ -23,6 +23,7 @@
 from sglang.srt.utils import is_cpu, is_cuda, is_hip, is_npu, set_weight_attrs

 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
     from sglang.srt.layers.moe.topk import TopKOutput
     from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
         CompressedTensorsConfig,
@@ -189,7 +190,7 @@ def create_weights(
         layer.w13_input_scale = None
         layer.w2_input_scale = None

-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+    def process_weights_after_loading(self, layer: FusedMoE) -> None:
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
         if self.static_input_scales:
@@ -246,7 +247,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert layer.w13_weight_scale is not None
         shard_size = layer.intermediate_size_per_partition
         max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-        for expert_id in range(layer.local_num_experts):
+        for expert_id in range(layer.num_local_experts):
             start = 0
             for shard_id in range(2):
                 dq_weight = per_tensor_dequantize(
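
The loop in the last hunk implements a requantization pattern: each expert's two w13 shards carry their own per-tensor weight scale, but the fused MoE kernel expects a single scale per expert, so every shard is dequantized with its original scale and requantized with the per-expert maximum. Below is a minimal standalone sketch of that pattern, not the library code: the helpers per_tensor_dequantize/per_tensor_quantize are plain-PyTorch stand-ins (int8 instead of FP8 for portability), the tensor shapes are made up, and only the loop structure mirrors the patched code, including the renamed num_local_experts attribute.

import torch

def per_tensor_dequantize(q_weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Stand-in for sglang's helper: recover float values from the quantized shard.
    return q_weight.to(torch.float32) * scale

def per_tensor_quantize(weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Stand-in: requantize with the shared per-expert scale (int8 here, FP8 in sglang).
    return torch.clamp(torch.round(weight / scale), -128, 127).to(torch.int8)

# Made-up sizes for illustration only.
num_local_experts, num_shards, shard_size, hidden = 2, 2, 4, 8
w13_weight = torch.randint(
    -128, 127, (num_local_experts, num_shards * shard_size, hidden), dtype=torch.int8
)
w13_weight_scale = torch.rand(num_local_experts, num_shards) + 0.5  # one scale per shard

# One scale per expert: take the max over that expert's shard scales.
max_w13_scales = w13_weight_scale.max(dim=1).values

for expert_id in range(num_local_experts):  # the attribute renamed in this hotfix
    start = 0
    for shard_id in range(num_shards):
        # Dequantize the shard with its original scale...
        dq_weight = per_tensor_dequantize(
            w13_weight[expert_id][start : start + shard_size, :],
            w13_weight_scale[expert_id][shard_id],
        )
        # ...and requantize it with the shared per-expert max scale.
        w13_weight[expert_id][start : start + shard_size, :] = per_tensor_quantize(
            dq_weight, max_w13_scales[expert_id]
        )
        start += shard_size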
