Commit d2d0e11

Update quantization kernels
1 parent 778b61c

File tree

5 files changed (+54, -37 lines)

flake.lock

Lines changed: 4 additions & 3 deletions
Generated lockfile; diff not rendered.

flake.nix

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "hf-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    hf-nix.url = "github:huggingface/hf-nix";
+    hf-nix.url = "github:huggingface/hf-nix/quantization-0.1.0";
     nixpkgs.follows = "hf-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
@@ -33,7 +33,7 @@
         };
         pkgs = import nixpkgs {
           inherit system;
-          inherit (hf-nix.lib) config;
+          config = hf-nix.lib.config system;
           overlays = [
             rust-overlay.overlays.default
             hf-nix.overlays.default

server/text_generation_server/layers/marlin/fp8.py

Lines changed: 16 additions & 9 deletions
@@ -76,15 +76,21 @@ def forward(self, A: torch.Tensor) -> torch.Tensor:
         assert quantization is not None

         A_flat = A.view(-1, A.shape[-1])
-        C = quantization.fp8_marlin_gemm(
-            A_flat,
-            self.qweight,
-            self.scales,
-            self.workspace,
-            8,
-            A_flat.shape[0],
-            self.scales.shape[1],
-            A_flat.shape[1],
+        C = quantization.gptq_marlin_gemm(
+            a=A_flat,
+            c=None,
+            b_q_weight=self.qweight,
+            b_scales=self.scales,
+            global_scale=None,
+            b_zeros=None,
+            g_idx=None,
+            perm=None,
+            workspace=self.workspace,
+            b_q_type=quantization.scalar_type.scalar_types.float8_e4m3fn,
+            size_m=A_flat.shape[0],
+            size_n=self.scales.shape[1],
+            size_k=A_flat.shape[1],
+            use_fp32_reduce=True,
         )
         C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))

@@ -143,5 +149,6 @@ def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor):
     )

     scales = permute_scales(scales)
+    scales = quantization.marlin_utils_fp8.fp8_fused_exponent_bias_into_scales(scales)

     return repacked, scales
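Note: aside from routing FP8 through the GPTQ Marlin kernel, the shape bookkeeping is unchanged: the input is flattened over its leading dimensions, size_m and size_k come from the flattened input, size_n from scales.shape[1], and the output is reshaped back afterwards. A minimal shape-only sketch in plain PyTorch (fake_gemm is a hypothetical stand-in for the Marlin kernel, not part of this diff):

import torch


def fake_gemm(a: torch.Tensor, size_m: int, size_n: int, size_k: int) -> torch.Tensor:
    # Hypothetical stand-in for quantization.gptq_marlin_gemm: only checks the
    # sizes implied by the keyword arguments and returns an output of the right shape.
    assert a.shape == (size_m, size_k)
    return a.new_zeros((size_m, size_n))


A = torch.randn(2, 3, 128)                # arbitrary leading dims, size_k = 128
n_out = 256                               # size_n, taken from scales.shape[1] in the real code

A_flat = A.view(-1, A.shape[-1])          # (size_m, size_k) = (6, 128)
C = fake_gemm(A_flat, size_m=A_flat.shape[0], size_n=n_out, size_k=A_flat.shape[1])
C = C.reshape(A.shape[:-1] + (n_out,))    # restore leading dims: (2, 3, 256)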

server/text_generation_server/layers/marlin/gptq.py

Lines changed: 18 additions & 19 deletions
@@ -256,7 +256,7 @@ class GPTQMarlinWeight(Weight):
     """

     qweight: torch.Tensor
-    qzeros: torch.Tensor
+    qzeros: Optional[torch.Tensor]
     scales: torch.Tensor
     g_idx: torch.Tensor
     perm: torch.Tensor
@@ -268,6 +268,7 @@ def __post_init__(self):
         assert self.scales.dtype in (torch.float16, torch.bfloat16)
         assert self.g_idx.dtype == torch.int32
         assert self.perm.dtype == torch.int32
+        assert self.qzeros is None or self.qzeros.numel() > 0

     def get_linear(self, bias: torch.Tensor):
         return GPTQMarlinLinear(
@@ -350,9 +351,6 @@ def repack_gptq_for_marlin(
         qweight, perm, in_features, out_features, bits
     )

-    if qzeros is None:
-        qzeros = torch.empty(0, dtype=torch.int, device=qweight.device)
-
    scales = permute_scales(scales)

    is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)
@@ -392,7 +390,7 @@ def __init__(
        if weight.bits not in (4, 8):
            raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization")

-        if weight.qzeros.numel() > 0:
+        if weight.qzeros is not None:
            if weight.bits == 4:
                self.quant_type = quantization.scalar_types.uint4
            else:
@@ -424,20 +422,21 @@ def forward(self, A: torch.Tensor) -> torch.Tensor:

        A_flat = A.view(-1, A.shape[-1])
        C = quantization.gptq_marlin_gemm(
-            A_flat,
-            self.qweight,
-            self.scales,
-            self.qzeros,
-            self.g_idx,
-            self.perm,
-            self.workspace,
-            self.quant_type,
-            A_flat.shape[0],
-            self.scales.shape[1],
-            A_flat.shape[1],
-            self.is_full_k,
-            self.qzeros.numel() > 0,
-            True,
+            a=A_flat,
+            c=None,
+            b_q_weight=self.qweight,
+            b_scales=self.scales,
+            global_scale=None,
+            b_zeros=self.qzeros,
+            g_idx=self.g_idx,
+            perm=self.perm,
+            workspace=self.workspace,
+            b_q_type=self.quant_type,
+            size_m=A_flat.shape[0],
+            size_n=self.scales.shape[1],
+            size_k=A_flat.shape[1],
+            is_k_full=self.is_full_k,
+            use_fp32_reduce=True,
        )
        C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))
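Note on the qzeros change: GPTQMarlinWeight now carries qzeros as Optional[torch.Tensor] instead of an empty placeholder tensor, and call sites branch on `is not None` rather than `numel() > 0`. A trimmed-down, hypothetical sketch of the same pattern (ZeroPointWeight and has_zero_points are illustrative names, not from the codebase):

from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class ZeroPointWeight:
    # Hypothetical, trimmed-down analogue of GPTQMarlinWeight.
    qweight: torch.Tensor
    qzeros: Optional[torch.Tensor]

    def __post_init__(self):
        # New invariant: either no zero points at all (symmetric quantization)
        # or a non-empty zero-point tensor; never an empty placeholder.
        assert self.qzeros is None or self.qzeros.numel() > 0

    @property
    def has_zero_points(self) -> bool:
        # Call sites now check `qzeros is not None` instead of `qzeros.numel() > 0`.
        return self.qzeros is not None


w_sym = ZeroPointWeight(qweight=torch.zeros(4, 4, dtype=torch.int32), qzeros=None)
assert not w_sym.has_zero_points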

server/text_generation_server/layers/moe/gptq_marlin.py

Lines changed: 14 additions & 4 deletions
@@ -202,9 +202,13 @@ def _pack_weight(
            device=weight.qweight.device,
        )
        qzeros = torch.empty(
-            (n_experts,) + weight.qzeros.shape,
-            dtype=weight.qzeros.dtype,
-            device=weight.qzeros.device,
+            (n_experts,) + ((0,) if weight.qzeros is None else weight.qzeros.shape),
+            dtype=(
+                weight.qweight.dtype if weight.qzeros is None else weight.qzeros.dtype
+            ),
+            device=(
+                weight.qweight.device if weight.qzeros is None else weight.qzeros.device
+            ),
        )
        scales = torch.empty(
            (n_experts,) + weight.scales.shape,
@@ -232,7 +236,13 @@ def _pack_weight(
        )

    moe_weight.qweight[expert] = weight.qweight
-    moe_weight.qzeros[expert] = weight.qzeros
+    moe_weight.qzeros[expert] = (
+        torch.zeros(
+            (0,), device=moe_weight.qzeros.device, dtype=moe_weight.qzeros.dtype
+        )
+        if weight.qzeros is None
+        else weight.qzeros
+    )
    moe_weight.scales[expert] = weight.scales
    moe_weight.g_idx[expert] = weight.g_idx
    moe_weight.perm[expert] = weight.perm
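The MoE packing keeps one shared (n_experts, ...) buffer per field; when qzeros is None the buffer gets a zero-length trailing shape and each expert slot is filled with an empty placeholder. A self-contained sketch of that pattern (pack_qzeros is a hypothetical helper; the int32 fallback dtype is an assumption, whereas the real code falls back to qweight's dtype and device):

from typing import List, Optional

import torch


def pack_qzeros(per_expert_qzeros: List[Optional[torch.Tensor]]) -> torch.Tensor:
    # Size the shared (n_experts, ...) buffer from the first expert, falling
    # back to a zero-size shape when there are no zero points (symmetric
    # quantization), then copy each expert's qzeros (or an empty placeholder)
    # into its slot. Assumes all experts agree on whether qzeros is present.
    first = per_expert_qzeros[0]
    shape = (0,) if first is None else tuple(first.shape)
    dtype = torch.int32 if first is None else first.dtype  # int32 fallback is an assumption
    packed = torch.empty((len(per_expert_qzeros),) + shape, dtype=dtype)
    for expert, qzeros in enumerate(per_expert_qzeros):
        packed[expert] = (
            torch.zeros((0,), dtype=packed.dtype) if qzeros is None else qzeros
        )
    return packed


# Symmetric case: no zero points at all, so each expert slot stays empty.
print(pack_qzeros([None, None]).shape)                               # torch.Size([2, 0])
# Asymmetric case: one packed int32 zero-point tensor per expert.
print(pack_qzeros([torch.ones(1, 8, dtype=torch.int32)] * 2).shape)  # torch.Size([2, 1, 8])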
