* Pass `auto_gc = False` to the `quantize()` API to speed up quantization when the GPU has plenty of VRAM and does not need the slow garbage-collection calls.
* Pass `buffered_fwd = True` to the `quantize()` API to potentially speed up quantization when the GPU has enough VRAM to hold all forward inputs (see the sketch below).
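Only the two flags themselves come from the notes above; everything else in this sketch (model id, calibration text, the `load` call) is an illustrative assumption about a typical GPTQModel quantization run, not a prescribed setup:

```python
from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder model id and calibration data: substitute your own.
calibration = ["GPTQModel calibration sample text."] * 256
model = GPTQModel.load("meta-llama/Llama-3.2-1B", QuantizeConfig(bits=4, group_size=128))

# Both flags trade extra VRAM headroom for quantization speed.
model.quantize(
    calibration,
    auto_gc=False,      # skip the slow per-step garbage collection
    buffered_fwd=True,  # keep all forward inputs resident in VRAM
)
```
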
#### Group Aware Reordering (GAR)
Group Aware Reordering (GAR) is an enhanced activation reordering scheme designed to significantly improve the accuracy of quantized models without incurring any additional inference overhead. Unlike traditional activation reordering, GAR restricts permutations to reorderings within individual groups or of entire groups, so each group's associated scales and zero-points remain efficiently accessible during inference.
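
To make the restriction concrete, here is a toy numpy sketch (not GPTQModel's internal implementation; the sorting keys are illustrative assumptions) that builds a GAR-style permutation: columns are shuffled only inside their own group, then whole groups are moved as blocks, so every contiguous `group_size` slice still maps to a single set of scales and zero-points.

```python
import numpy as np

def gar_permutation(act_norms: np.ndarray, group_size: int) -> np.ndarray:
    """Toy GAR-style permutation: reorder columns only within each group,
    then reorder whole groups, so group boundaries stay intact."""
    assert act_norms.shape[0] % group_size == 0
    cols = np.arange(act_norms.shape[0]).reshape(-1, group_size)
    norms = act_norms.reshape(-1, group_size)

    # 1) Within each group: sort columns by descending activation norm.
    cols = np.take_along_axis(cols, np.argsort(-norms, axis=1), axis=1)

    # 2) Across groups: move entire groups, ordered by their peak activation norm.
    return cols[np.argsort(-norms.max(axis=1))].reshape(-1)

# 8 columns, group_size=4: no column ever crosses its group boundary.
print(gar_permutation(np.array([0.1, 3.0, 0.5, 2.0, 7.0, 0.2, 1.0, 4.0]), 4))
# -> [4 7 6 5 1 3 2 0]
```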
How to enable GAR:
Set the `hyb_act` parameter to `True` and disable the default activation reordering by setting `desc_act` to `False` in your `QuantizeConfig`. For example:
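
(A minimal sketch; only `hyb_act` and `desc_act` are prescribed above. The remaining settings, model id, calibration data, and the `load`/`quantize` calls are illustrative assumptions.)

```python
from gptqmodel import GPTQModel, QuantizeConfig

quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,  # disable the default activation reordering
    hyb_act=True,    # enable Group Aware Reordering (GAR)
)

# Placeholder model id and calibration data: substitute your own.
model = GPTQModel.load("meta-llama/Llama-3.2-1B", quant_config)
model.quantize(["GPTQModel calibration sample text."] * 256)
```
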
This feature is based on the method introduced in:
[T. Gafni, A. Karnieli, and Y. Hanani, "Dual Precision Quantization for Efficient and Accurate Deep Neural Networks Inference," CVPR Workshops, 2025.](https://openaccess.thecvf.com/content/CVPR2025W/eLVM/html/Gafni_Dual_Precision_Quantization_for_Efficient_and_Accurate_Deep_Neural_Networks_CVPRW_2025_paper.html)