66 changes: 38 additions & 28 deletions setup.py
@@ -178,37 +178,40 @@ def get_cpu_instruct(self,):
return "avx2"
else:
print("Using native cpu instruct")

if sys.platform.startswith("linux"):
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
flags_line = [line for line in cpuinfo.split(
'\n') if line.startswith('flags')][0]
flags = flags_line.split(':')[1].strip().split(' ')
# fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
for flag in flags:
if 'avx512bw' in flag:
return 'fancy'
for flag in flags:
if 'avx512' in flag:
return 'avx512'
for flag in flags:
if 'avx2' in flag:
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(flags_line))
elif sys.platform == "win32":
from cpufeature.extension import CPUFeature

if CPUFeature.get("AVX512bw", False):
return 'fancy'
if CPUFeature.get("AVX512f", False):
return 'avx512'
if CPUFeature.get("AVX2", False):
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(str(CPUFeature)))
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
if platform.machine() == "aarch64":
# Adapt this part based on GH200's /proc/cpuinfo
for line in cpuinfo.split('\n'):
if line.startswith('Features'):
features_line = line
features = features_line.split(':')[1].strip().split(' ')
if 'sve' in features: # Example: Scalable Vector Extension
return 'sve' # Or a custom label
elif 'neon' in features:
return 'neon'
else:
print("Using generic Arm CPU instructions")
return 'native_arm' # Or a default Arm label
print("Warning: Could not find 'Features' line in /proc/cpuinfo on aarch64. Using native.")
return 'native' # Fallback for aarch64 if 'Features' not found
else: # Assume x86-like if not aarch64
for line in cpuinfo.split('\n'):
if line.startswith('flags'):
flags_line = line
flags = flags_line.split(':')[1].strip().split(' ')
if 'avx512bw' in flags:
return 'fancy'
elif 'avx512' in flags:
return 'avx512'
elif 'avx2' in flags:
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(flags_line))
print("Warning: Could not find 'flags' line in /proc/cpuinfo on x86-like. Using native.")
return 'native' # Fallback for x86-like if 'flags' not found
Comment on lines 181 to +214
Contributor
high

This change removes support for Windows (win32) and other non-Linux platforms from the get_cpu_instruct method. The previous implementation had specific logic for Windows and a fallback for other platforms. On non-Linux systems, this function will now return None, which will likely cause failures later in the build process. While this PR focuses on ARM (which is often Linux-based), breaking existing platform support is a significant regression. Please restore the support for other platforms like Windows to avoid breaking builds for other users.
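A minimal sketch (not part of this PR) of how the removed branches could be restored alongside the new aarch64 path, mirroring the deleted win32 code above:

```python
# Sketch: restore the non-Linux branches that the diff removes.
# The CPUFeature keys mirror the original implementation above.
import sys

def get_cpu_instruct_fallback():
    if sys.platform == "win32":
        from cpufeature.extension import CPUFeature
        if CPUFeature.get("AVX512bw", False):
            return 'fancy'
        if CPUFeature.get("AVX512f", False):
            return 'avx512'
        if CPUFeature.get("AVX2", False):
            return 'avx2'
        raise ValueError(
            "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
    raise ValueError("Unsupported platform: {}".format(sys.platform))
```

Calling this from an else branch of the Linux-only path would preserve the old behavior for Windows users instead of silently returning None.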


def get_torch_version(self,):
torch_version_raw = parse(torch.__version__)
@@ -501,6 +504,13 @@ def build_extension(self, ext) -> None:
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
]

# Add ARM-specific flags for aarch64 Linux systems
if platform.system() == "Linux" and platform.machine() == "aarch64":
cmake_args += [
"-DCMAKE_C_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
"-DCMAKE_CXX_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
]

if CUDA_HOME is not None:
cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
elif MUSA_HOME is not None:
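One hedged suggestion on the hard-coded -march string added above: older toolchains will reject armv8.6-a with the sve2/i8mm extensions, so a try-compile probe could fall back gracefully. A sketch under that assumption (try_march is a hypothetical helper, not part of the PR):

```python
# Sketch: probe whether the C compiler accepts a -march string before
# baking it into CMAKE_C_FLAGS / CMAKE_CXX_FLAGS.
import os
import subprocess
import tempfile

def try_march(march: str, cc: str = "cc") -> bool:
    """Return True if `cc` compiles an empty program with -march=<march>."""
    with tempfile.TemporaryDirectory() as tmp:
        src = os.path.join(tmp, "probe.c")
        with open(src, "w", encoding="utf-8") as f:
            f.write("int main(void) { return 0; }\n")
        proc = subprocess.run(
            [cc, f"-march={march}", "-c", src, "-o", os.path.join(tmp, "probe.o")],
            capture_output=True,
        )
        return proc.returncode == 0

# Prefer the tuned GH200 flags, fall back to the compiler's default.
GH200_MARCH = "armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc"
march = GH200_MARCH if try_march(GH200_MARCH) else "native"
```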
22 changes: 14 additions & 8 deletions third_party/llamafile/iqk_mul_mat.inc
@@ -170,7 +170,7 @@ struct MulMat {
funcs[n_left-1](n, vx, bx, info, nrc_x);
}
}
static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny);
static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny);
private:
template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};
@@ -1020,8 +1020,9 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00,
float * C, long stride_C, int ith, int nth) {

MulMat mm;
int row_size_q8;

if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
return false;
}

@@ -4444,13 +4445,14 @@ template <int nrc> struct Q80 {
}

inline const int8_t * quant_data(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i;
return y4->qs;
}

inline float16x4_t load_scales(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
return vld1_f16((const float16_t *)y4->d);
const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i;
float16_t d_val = GGML_FP16_TO_FP32(y4->d);
return vdup_n_f16(d_val);
Comment on lines +4453 to +4455
Contributor
medium

The conversion from ggml_fp16_t to float16_t via GGML_FP16_TO_FP32 is inefficient. It involves a round trip from half-precision to single-precision and back to half-precision, which is unnecessary and adds overhead in an inline function. You can directly use y4->d with vdup_n_f16 as the compiler can handle the cast.

        return vdup_n_f16(y4->d);

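One caveat worth flagging (an assumption about the build configuration, not verified here): if ggml_fp16_t is typedef'd to uint16_t rather than __fp16, passing y4->d straight to vdup_n_f16 would convert the integer value instead of reinterpreting the bits. A sketch of the bit-preserving variant:

```cpp
// Sketch: broadcast a raw IEEE fp16 bit pattern into all four lanes
// without a value conversion. Assumes the scale is stored as uint16_t bits.
#include <arm_neon.h>
#include <cstdint>

static inline float16x4_t dup_fp16_bits(uint16_t bits) {
    // vdup_n_u16 broadcasts the bits; vreinterpret keeps them as fp16.
    return vreinterpret_f16_u16(vdup_n_u16(bits));
}
```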
Contributor
@KMSorSMS KMSorSMS Oct 24, 2025
@johnnynunez Since you have tested this on your device, could you please attach a running screenshot and your platform info (e.g., from nvidia-smi and lscpu)? Our next version will drop support for llamafile for performance reasons.

Author
@johnnynunez johnnynunez Oct 24, 2025
GH200 @KMSorSMS
[Screenshot 2025-10-24 at 07 44 36]

Architecture:             aarch64
  CPU op-mode(s):         64-bit
  Byte Order:             Little Endian
CPU(s):                   64
  On-line CPU(s) list:    0-63
Vendor ID:                ARM
  Model name:             Neoverse-V2
    Model:                0
    Thread(s) per core:   1
    Core(s) per socket:   64
    Socket(s):            1
    Stepping:             r0p0
    BogoMIPS:             2000.00
    Flags:                fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 sm3 sm4 asimddp sha512 sve asimdfhm dit uscat ilrcpc flagm sb paca pacg dcpodp sve2 sveaes svepmull svebitperm svesha3 svesm4 flagm2 frint svei8mm svebf16 i8mm bf16 dgh bti
NUMA:                     
  NUMA node(s):           9
  NUMA node0 CPU(s):      0-63
  NUMA node1 CPU(s):      
  NUMA node2 CPU(s):      
  NUMA node3 CPU(s):      
  NUMA node4 CPU(s):      
  NUMA node5 CPU(s):      
  NUMA node6 CPU(s):      
  NUMA node7 CPU(s):      
  NUMA node8 CPU(s):      
Vulnerabilities:          
  Gather data sampling:   Not affected
  Itlb multihit:          Not affected
  L1tf:                   Not affected
  Mds:                    Not affected
  Meltdown:               Not affected
  Mmio stale data:        Not affected
  Reg file data sampling: Not affected
  Retbleed:               Not affected
  Spec rstack overflow:   Not affected
  Spec store bypass:      Mitigation; Speculative Store Bypass disabled via prctl
  Spectre v1:             Mitigation; __user pointer sanitization
  Spectre v2:             Not affected
  Srbds:                  Not affected
  Tsx async abort:        Not affected

}

template <typename Dequantizer>
@@ -4485,13 +4487,17 @@ template <int nrc> struct Q81 {
}

inline const int8_t * quant_data(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i;
return y4->qs;
}

inline float16x8_t load_scales(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
return vld1q_f16((const float16_t *)y4->d);
const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i;
float16_t d_val = GGML_FP16_TO_FP32(y4->d);
float16_t s_val = GGML_FP16_TO_FP32(y4->s);
float16x4_t d_vec = vdup_n_f16(d_val);
float16x4_t s_vec = vdup_n_f16(s_val);
return vcombine_f16(d_vec, s_vec);
}

template <typename Dequantizer>
74 changes: 9 additions & 65 deletions third_party/llamafile/sgemm.cpp
@@ -25,7 +25,7 @@
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
// #include <sys/auxv.h>
#include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"

Expand All @@ -37,7 +37,7 @@ static const struct GemmFuncs {
// typeof(llamafile_mixmul)* mixmul;
// typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
//#if defined(__x86_64__) || defined(_M_X64)
// if (X86_HAVE(AVX)) {
// if (X86_HAVE(FMA)) {
// if (X86_HAVE(AVX2)) {
@@ -90,74 +90,18 @@ static const struct GemmFuncs {
// mixmul = llamafile_mixmul_unsupported;
// }

#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
// AMD Zen4+ (2023-)
sgemm = llamafile_sgemm_amd_zen4;
mixmul = llamafile_mixmul_amd_zen4;
iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
// Intel Xeon Skylake+ (2015-)
sgemm = llamafile_sgemm_amd_avx512f;
mixmul = llamafile_mixmul_amd_avx512f;
iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
// Intel Alderlake (2021-)
sgemm = llamafile_sgemm_amd_avxvnni;
mixmul = llamafile_mixmul_amd_avxvnni;
iqk_mixmul = iqk_mul_mat_moe;
#else
// Intel Haswell/Broadwell/Skylake (2013-2020)
// AMD Excavator (2015-2022)
sgemm = llamafile_sgemm_amd_avx2;
mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
// AMD Piledriver (2011-2014)
sgemm = llamafile_sgemm_amd_fma;
mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
// Intel Sandybridge/Ivybridge (2010-2012)
// AMD Bulldozer (2011)
sgemm = llamafile_sgemm_amd_avx;
mixmul = llamafile_mixmul_amd_avx;
#endif
#else
// AMD K8/Barcelona (2003-2010)
// Intel Core/Nehalem (2006-2009)
sgemm = llamafile_sgemm_unsupported;
mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
long hwcap = getauxval(AT_HWCAP);
if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
(hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
(hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
//#elif defined(__aarch64__)
//long hwcap = getauxval(AT_HWCAP);
//if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
// (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
// (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
// e.g. Apple M1, Raspberry Pi 5
sgemm = llamafile_sgemm_arm82;
mixmul = llamafile_mixmul_arm82;
iqk_mixmul = iqk_mul_mat_moe_arm82;
} else {
// ARM64 baseline ISA
sgemm = llamafile_sgemm_arm80;
mixmul = llamafile_mixmul_arm80;
}
#else
sgemm = llamafile_sgemm_unsupported;
mixmul = llamafile_mixmul_unsupported;
#endif

//#endif
}
} funcs;

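Note that the diff above also comments out the getauxval(AT_HWCAP) guard, so the arm82 kernels are now selected unconditionally. A minimal sketch of keeping the runtime dispatch, reusing the HWCAP checks and symbols visible in the commented-out lines (the free-function form is illustrative, not the PR's structure):

```cpp
// Sketch: runtime ARM kernel dispatch, as in the commented-out code above.
// Assumes Linux aarch64, where <sys/auxv.h> and <asm/hwcap.h> define these.
#include <sys/auxv.h>
#include <asm/hwcap.h>

static void pick_arm_kernels(GemmFuncs& f) {
    long hwcap = getauxval(AT_HWCAP);
    if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa
        (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa
        (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (e.g. Apple M1, Raspberry Pi 5)
        f.sgemm = llamafile_sgemm_arm82;
        f.mixmul = llamafile_mixmul_arm82;
        f.iqk_mixmul = iqk_mul_mat_moe_arm82;
    } else {
        // ARM64 baseline ISA
        f.sgemm = llamafile_sgemm_arm80;
        f.mixmul = llamafile_mixmul_arm80;
    }
}
```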
4 changes: 2 additions & 2 deletions third_party/llamafile/tinyblas_cpu_sgemm.inc
@@ -337,14 +337,14 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) {
return true;
}
}
if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
// assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) {
return true;
}
}