Commit 9c7349e

Qubitium authored
Eora cleanup (#1366)
* rename folder and cleanup namespace
* auto to float

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
1 parent 25f1607 commit 9c7349e

22 files changed (+8 −24 lines)

gptqmodel/nn_modules/qlinear/exllama_eora.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -54,7 +54,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
 
 
 class ExllamaEoraQuantLinear(BaseQuantLinear):
-    SUPPORTS_BITS = [4, 8]
+    SUPPORTS_BITS = [4]  # fused eora only validated for 4 bits
     SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128]
     SUPPORTS_DESC_ACT = [True, False]
    SUPPORTS_SYM = [True]  # TODO: validate False
```
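The only semantic change in this file drops 8-bit from the kernel's advertised capabilities. Below is a hedged sketch of how such `SUPPORTS_*` attributes are typically consulted when picking a quant-linear backend; the helper function and its parameters are illustrative, not GPTQModel's actual selection API.

```python
# Hypothetical capability gate. Only the class, its module path, and the
# SUPPORTS_* attribute names/values come from the diff above; the helper
# itself is an assumed sketch of how a loader might use them.
from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear

def can_use_exllama_eora(bits: int, group_size: int, desc_act: bool, sym: bool) -> bool:
    return (
        bits in ExllamaEoraQuantLinear.SUPPORTS_BITS            # now [4] only
        and group_size in ExllamaEoraQuantLinear.SUPPORTS_GROUP_SIZE
        and desc_act in ExllamaEoraQuantLinear.SUPPORTS_DESC_ACT
        and sym in ExllamaEoraQuantLinear.SUPPORTS_SYM          # asym still TODO
    )

# After this commit an 8-bit config no longer matches this backend:
# can_use_exllama_eora(8, 128, False, True) -> False
```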
File renamed without changes.

gptqmodel_ext/exllama2-vllm/eora/compat.cuh renamed to gptqmodel_ext/exllama_eora/eora/compat.cuh

Lines changed: 0 additions & 2 deletions
```diff
@@ -5,7 +5,6 @@ Copied from https://github.com/turboderp/exllamav2
 #ifndef _compat_cuh
 #define _compat_cuh
 
-namespace vllm {
 namespace gptq {
 // atomicAdd for half types, to support CC < 7.x
 
@@ -60,5 +59,4 @@ __device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
 #endif
 
 } // namespace gptq
-} // namespace vllm
 #endif
```

gptqmodel_ext/exllama2-vllm/eora/matrix_view.cuh renamed to gptqmodel_ext/exllama_eora/eora/matrix_view.cuh

Lines changed: 0 additions & 2 deletions
```diff
@@ -11,7 +11,6 @@ https://github.com/turboderp/exllama
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 class MatrixView_half {
@@ -291,5 +290,4 @@ class MatrixView_q8_row {
 };
 
 } // namespace gptq
-} // namespace vllm
 #endif
```

gptqmodel_ext/exllama2-vllm/eora/q_gemm.cu renamed to gptqmodel_ext/exllama_eora/eora/q_gemm.cu

Lines changed: 5 additions & 7 deletions
```diff
@@ -19,7 +19,6 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include "qdq_4.cuh"
 #include "qdq_8.cuh"
 
-namespace vllm {
 namespace gptq {
 
 #define BLOCK_KN_SIZE 128
@@ -336,8 +335,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel_eora(
     for (int j = 0; j < 4; ++j) {
 #pragma unroll
       for (int m = 0; m < m_count; m++) {
-        auto a1 = __half2float(*(Ax_.item_ptr(offset_m + m, r)));
-        auto a2 = __half2float(*(eora_b_.item_ptr(r, n + j)));
+        float a1 = __half2float(*(Ax_.item_ptr(offset_m + m, r)));
+        float a2 = __half2float(*(eora_b_.item_ptr(r, n + j)));
         float product = a1 * a2;
         block_c[m][j] = block_c[m][j] + product;
       }
```
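The `auto` → `float` change is behavior-preserving: `__half2float` already returns `float`, so the explicit type simply documents that the EoRA correction term is accumulated in fp32. As a reference point, here is a hedged PyTorch sketch of what the fused kernel computes; the function name and shapes are assumed for illustration, not part of the project's API.

```python
import torch

def eora_gemm_reference(x, w_dq, eora_a, eora_b):
    """Assumed shapes: x (m, k), w_dq (k, n) dequantized GPTQ weight,
    eora_a (k, r), eora_b (r, n) low-rank EoRA factors, all fp16."""
    base = x @ w_dq      # the plain gemm_half_q_half path
    ax = x @ eora_a      # the kernel's "Ax" intermediate
    # mirrors the inner loop above: a1 * a2 products accumulated as float
    correction = (ax.float() @ eora_b.float()).to(x.dtype)
    return base + correction
```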
```diff
@@ -2074,7 +2073,6 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
@@ -2086,7 +2084,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda(
+  gptq::gemm_half_q_half_cuda(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2112,7 +2110,7 @@ torch::Tensor gptq_gemm_lora(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda_eora(
+  gptq::gemm_half_q_half_cuda_eora(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2133,7 +2131,7 @@ torch::Tensor gptq_gemm_lora(torch::Tensor a, torch::Tensor b_q_weight,
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
-  vllm::gptq::shuffle_exllama_weight(
+  gptq::shuffle_exllama_weight(
       (uint32_t*)q_weight.data_ptr(),
       q_perm.device().is_meta() || q_perm.numel() == 0
           ? NULL
```
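With the outer `vllm` namespace gone, the host wrappers now call into `gptq::` directly. The ternary on `q_perm` above also shows the NULL path for an absent permutation; the following Python-side sketch triggers it. The binding module name `eora_ext` is assumed for illustration, but the `gptq_shuffle(q_weight, q_perm, bit)` signature and the empty-tensor convention come from this diff.

```python
import torch

# Requires a CUDA device at runtime; shapes are illustrative.
in_features, out_features = 4096, 4096
q_weight = torch.zeros(in_features * 4 // 32, out_features,
                       dtype=torch.int32, device="cuda")   # 4-bit packed rows
q_perm = torch.empty(0, dtype=torch.int32, device="cuda")  # numel() == 0
# eora_ext.gptq_shuffle(q_weight, q_perm, 4)  # empty q_perm -> NULL in C++
```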
