
Commit c896e6e

move test func
1 parent 17a2e36 commit c896e6e

File tree

2 files changed: +100 −103 lines changed


python/sglang/srt/layers/quantization/utils.py

Lines changed: 1 addition & 97 deletions
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
 
 from types import MappingProxyType
-from typing import List, Mapping, Optional, Tuple, Union
+from typing import List, Mapping, Tuple, Union
 
 import numpy
 import torch
@@ -210,99 +210,3 @@ def unpack_cols(
     q_res = q_res.contiguous()
 
     return q_res
-
-
-# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
-def quantize_weights(
-    w: torch.Tensor,
-    quant_type: ScalarType,
-    group_size: Optional[int],
-    zero_points: bool = False,
-    ref_zero_points_after_scales: bool = False,
-):
-    assert (
-        quant_type.is_integer()
-    ), "Floating point quantization may work but has not been tested"
-    assert not zero_points or group_size is not None, (
-        "to have group zero points, group_size must be provided "
-        "(-1 group_size is channelwise)"
-    )
-
-    orig_device = w.device
-    orig_type = w.dtype
-    size_k, size_n = w.shape
-
-    assert w.is_floating_point(), "w must be float"
-
-    if group_size == -1:
-        group_size = size_k
-
-    # Reshape to [groupsize, -1]
-    if group_size is not None and group_size < size_k:
-        w = w.reshape((-1, group_size, size_n))
-        w = w.permute(1, 0, 2)
-        w = w.reshape((group_size, -1))
-
-    # Compute scale for each group
-    max_val = torch.max(w, 0, keepdim=True).values
-    min_val = torch.min(w, 0, keepdim=True).values
-
-    max_q_val = quant_type.max()
-    min_q_val = quant_type.min()
-
-    w_s = torch.Tensor([1.0]).to(w.device)  # unscaled case
-    maybe_w_zp = None
-    if group_size is not None:
-        if zero_points:
-            assert not quant_type.is_signed() and quant_type.max() > 0
-            w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
-            maybe_w_zp = (
-                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
-            )
-        else:
-            # If the bias is such that there are no possible negative/positive
-            # values, set the max value to inf to avoid divide by 0
-            w_s = torch.max(
-                abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
-                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
-            )
-
-    # Quantize
-    w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
-    w_q = torch.clamp(w_q, min_q_val, max_q_val)
-
-    # Compute ref (dequantized)
-    # For some kernels (namely Machete) the zero-points are applied after the
-    # scales are applied, for this case computing the reference in similar way
-    # allows us to use tighter error tolerances in our unit tests.
-    if ref_zero_points_after_scales and maybe_w_zp is not None:
-        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
-    else:
-        w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
-
-    if quant_type.has_bias():
-        w_q += quant_type.bias
-
-    # Restore original shapes
-    if group_size is not None and group_size < size_k:
-
-        def reshape_w(w):
-            w = w.reshape((group_size, -1, size_n))
-            w = w.permute(1, 0, 2)
-            w = w.reshape((size_k, size_n)).contiguous()
-            return w
-
-        w_q = reshape_w(w_q)
-        w_ref = reshape_w(w_ref)
-        w_s = w_s.reshape((-1, size_n)).contiguous()
-
-    if maybe_w_zp is not None:
-        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
-        maybe_w_zp = maybe_w_zp.to(device=orig_device)
-
-    return (
-        w_ref.to(device=orig_device),
-        w_q.to(device=orig_device),
-        w_s if group_size is not None else None,
-        maybe_w_zp,
-    )

sgl-kernel/tests/test_marlin_repack.py

Lines changed: 99 additions & 6 deletions
@@ -1,20 +1,113 @@
 import math
+from typing import Optional
 
 import numpy as np
 import pytest
 import torch
 from sgl_kernel import awq_marlin_repack
-from sgl_kernel.scalar_type import scalar_types
+from sgl_kernel.scalar_type import ScalarType, scalar_types
 
-from sglang.srt.layers.quantization.utils import (
-    get_pack_factor,
-    pack_cols,
-    quantize_weights,
-)
+from sglang.srt.layers.quantization.utils import get_pack_factor, pack_cols
 
 GPTQ_MARLIN_TILE = 16
 
 
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+def quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: Optional[int],
+    zero_points: bool = False,
+    ref_zero_points_after_scales: bool = False,
+):
+    assert (
+        quant_type.is_integer()
+    ), "Floating point quantization may work but has not been tested"
+    assert not zero_points or group_size is not None, (
+        "to have group zero points, group_size must be provided "
+        "(-1 group_size is channelwise)"
+    )
+
+    orig_device = w.device
+    orig_type = w.dtype
+    size_k, size_n = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+
+    if group_size == -1:
+        group_size = size_k
+
+    # Reshape to [groupsize, -1]
+    if group_size is not None and group_size < size_k:
+        w = w.reshape((-1, group_size, size_n))
+        w = w.permute(1, 0, 2)
+        w = w.reshape((group_size, -1))
+
+    # Compute scale for each group
+    max_val = torch.max(w, 0, keepdim=True).values
+    min_val = torch.min(w, 0, keepdim=True).values
+
+    max_q_val = quant_type.max()
+    min_q_val = quant_type.min()
+
+    w_s = torch.Tensor([1.0]).to(w.device)  # unscaled case
+    maybe_w_zp = None
+    if group_size is not None:
+        if zero_points:
+            assert not quant_type.is_signed() and quant_type.max() > 0
+            w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
+            maybe_w_zp = (
+                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
+            )
+        else:
+            # If the bias is such that there are no possible negative/positive
+            # values, set the max value to inf to avoid divide by 0
+            w_s = torch.max(
+                abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
+                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
+            )
+
+    # Quantize
+    w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
+    w_q = torch.clamp(w_q, min_q_val, max_q_val)
+
+    # Compute ref (dequantized)
+    # For some kernels (namely Machete) the zero-points are applied after the
+    # scales are applied, for this case computing the reference in similar way
+    # allows us to use tighter error tolerances in our unit tests.
+    if ref_zero_points_after_scales and maybe_w_zp is not None:
+        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
+    else:
+        w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
+
+    if quant_type.has_bias():
+        w_q += quant_type.bias
+
+    # Restore original shapes
+    if group_size is not None and group_size < size_k:
+
+        def reshape_w(w):
+            w = w.reshape((group_size, -1, size_n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((size_k, size_n)).contiguous()
+            return w
+
+        w_q = reshape_w(w_q)
+        w_ref = reshape_w(w_ref)
+        w_s = w_s.reshape((-1, size_n)).contiguous()
+
+    if maybe_w_zp is not None:
+        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
+        maybe_w_zp = maybe_w_zp.to(device=orig_device)
+
+    return (
+        w_ref.to(device=orig_device),
+        w_q.to(device=orig_device),
+        w_s if group_size is not None else None,
+        maybe_w_zp,
+    )
+
+
 def awq_pack(
     q_w: torch.Tensor,
     num_bits: int,

0 commit comments
