From 5baa276f7ce10b84337b9594089089b09bf49804 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:00:39 -0700 Subject: [PATCH 1/8] fix arm --- third_party/llamafile/sgemm.cpp | 74 ++++----------------------------- 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/third_party/llamafile/sgemm.cpp b/third_party/llamafile/sgemm.cpp index 38f6d189..c466d47f 100644 --- a/third_party/llamafile/sgemm.cpp +++ b/third_party/llamafile/sgemm.cpp @@ -25,7 +25,7 @@ // #include // #include #include -// #include +#include #include // #include "llamafile.h" @@ -37,7 +37,7 @@ static const struct GemmFuncs { // typeof(llamafile_mixmul)* mixmul; // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; GemmFuncs() { -#if defined(__x86_64__) || defined(_M_X64) +//#if defined(__x86_64__) || defined(_M_X64) // if (X86_HAVE(AVX)) { // if (X86_HAVE(FMA)) { // if (X86_HAVE(AVX2)) { @@ -90,74 +90,18 @@ static const struct GemmFuncs { // mixmul = llamafile_mixmul_unsupported; // } -#if defined(__AVX__) -#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) -#if defined(__AVX2__) -#if defined(__AVX512F__) -#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) - // AMD Zen4+ (2023-) - sgemm = llamafile_sgemm_amd_zen4; - mixmul = llamafile_mixmul_amd_zen4; - iqk_mixmul = iqk_mul_mat_moe_zen4; -#else - // Intel Xeon Skylake+ (2015-) - sgemm = llamafile_sgemm_amd_avx512f; - mixmul = llamafile_mixmul_amd_avx512f; - iqk_mixmul = iqk_mul_mat_moe; -#endif -#elif defined(__AVXVNNI__) - // Intel Alderlake (2021-) - sgemm = llamafile_sgemm_amd_avxvnni; - mixmul = llamafile_mixmul_amd_avxvnni; - iqk_mixmul = iqk_mul_mat_moe; -#else - // Intel Haswell/Broadwell/Skylake (2013-2020) - // AMD Excavator (2015-2022) - sgemm = llamafile_sgemm_amd_avx2; - mixmul = llamafile_mixmul_amd_avx2; -#if defined(__F16C__) - iqk_mixmul = iqk_mul_mat_moe; -#endif -#endif -#else - // AMD Piledriver (2011-2014) - sgemm = llamafile_sgemm_amd_fma; - mixmul = llamafile_mixmul_amd_fma; -#if defined(__F16C__) - iqk_mixmul = iqk_mul_mat_moe; -#endif -#endif -#else - // Intel Sandybridge/Ivybridge (2010-2012) - // AMD Bulldozer (2011) - sgemm = llamafile_sgemm_amd_avx; - mixmul = llamafile_mixmul_amd_avx; -#endif -#else - // AMD K8/Barcelona (2003-2010) - // Intel Core/Nehalem (2006-2009) - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; -#endif -#elif defined(__aarch64__) - long hwcap = getauxval(AT_HWCAP); - if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) - (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) - (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) +//#elif defined(__aarch64__) + //long hwcap = getauxval(AT_HWCAP); + //if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) + // (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) + // (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) // e.g. 
Apple M1, Raspberry Pi 5 sgemm = llamafile_sgemm_arm82; mixmul = llamafile_mixmul_arm82; iqk_mixmul = iqk_mul_mat_moe_arm82; - } else { - // ARM64 baseline ISA - sgemm = llamafile_sgemm_arm80; - mixmul = llamafile_mixmul_arm80; - } -#else - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; -#endif + +//#endif } } funcs; From 0c81f259d2c52e0b856f6f9905003c047f8bce51 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:06:46 -0700 Subject: [PATCH 2/8] fix arm --- setup.py | 59 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index c91d9dc2..ff914eca 100644 --- a/setup.py +++ b/setup.py @@ -178,37 +178,40 @@ def get_cpu_instruct(self,): return "avx2" else: print("Using native cpu instruct") + if sys.platform.startswith("linux"): with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f: cpuinfo = cpu_f.read() - flags_line = [line for line in cpuinfo.split( - '\n') if line.startswith('flags')][0] - flags = flags_line.split(':')[1].strip().split(' ') - # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI - for flag in flags: - if 'avx512bw' in flag: - return 'fancy' - for flag in flags: - if 'avx512' in flag: - return 'avx512' - for flag in flags: - if 'avx2' in flag: - return 'avx2' - raise ValueError( - "Unsupported cpu Instructions: {}".format(flags_line)) - elif sys.platform == "win32": - from cpufeature.extension import CPUFeature - - if CPUFeature.get("AVX512bw", False): - return 'fancy' - if CPUFeature.get("AVX512f", False): - return 'avx512' - if CPUFeature.get("AVX2", False): - return 'avx2' - raise ValueError( - "Unsupported cpu Instructions: {}".format(str(CPUFeature))) - else: - raise ValueError("Unsupported platform: {}".format(sys.platform)) + if platform.machine() == "aarch64": + # Adapt this part based on GH200's /proc/cpuinfo + for line in cpuinfo.split('\n'): + if line.startswith('Features'): + features_line = line + features = features_line.split(':')[1].strip().split(' ') + if 'sve' in features: # Example: Scalable Vector Extension + return 'sve' # Or a custom label + elif 'neon' in features: + return 'neon' + else: + print("Using generic Arm CPU instructions") + return 'native_arm' # Or a default Arm label + print("Warning: Could not find 'Features' line in /proc/cpuinfo on aarch64. Using native.") + return 'native' # Fallback for aarch64 if 'Features' not found + else: # Assume x86-like if not aarch64 + for line in cpuinfo.split('\n'): + if line.startswith('flags'): + flags_line = line + flags = flags_line.split(':')[1].strip().split(' ') + if 'avx512bw' in flags: + return 'fancy' + elif 'avx512' in flags: + return 'avx512' + elif 'avx2' in flags: + return 'avx2' + raise ValueError( + "Unsupported cpu Instructions: {}".format(flags_line)) + print("Warning: Could not find 'flags' line in /proc/cpuinfo on x86-like. 
Using native.") + return 'native' # Fallback for x86-like if 'flags' not found def get_torch_version(self,): torch_version_raw = parse(torch.__version__) From 5824bac89a03828af6109ee17f0c623e29c1f08e Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:11:27 -0700 Subject: [PATCH 3/8] fix arm --- third_party/llamafile/iqk_mul_mat.inc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index a4e8c418..08c5cb99 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -170,7 +170,7 @@ struct MulMat { funcs[n_left-1](n, vx, bx, info, nrc_x); } } - static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny); + static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny); private: template static IQK_NOINLINE void set_functions(MulMat& m); }; @@ -4444,12 +4444,12 @@ template struct Q80 { } inline const int8_t * quant_data(int iy, int i) const { - const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i; return y4->qs; } inline float16x4_t load_scales(int iy, int i) const { - const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i; return vld1_f16((const float16_t *)y4->d); } @@ -4485,12 +4485,12 @@ template struct Q81 { } inline const int8_t * quant_data(int iy, int i) const { - const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i; return y4->qs; } inline float16x8_t load_scales(int iy, int i) const { - const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i; return vld1q_f16((const float16_t *)y4->d); } From 5433ab27feee077cddc7ddf837efc1b388bb8e04 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:13:31 -0700 Subject: [PATCH 4/8] fix arm --- third_party/llamafile/iqk_mul_mat.inc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index 08c5cb99..ebfd233c 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -1020,8 +1020,9 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00, float * C, long stride_C, int ith, int nth) { MulMat mm; + int row_size_q8; - if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) { + if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) { return false; } @@ -4450,7 +4451,8 @@ template struct Q80 { inline float16x4_t load_scales(int iy, int i) const { const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i; - return vld1_f16((const float16_t *)y4->d); + float16_t d_val = GGML_FP16_TO_FP32(y4->d); + return vdup_n_f16(d_val); } template @@ -4491,7 +4493,11 @@ template struct Q81 { inline float16x8_t load_scales(int iy, int i) const { const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i; - return vld1q_f16((const float16_t *)y4->d); + float16_t d_val = GGML_FP16_TO_FP32(y4->ds[0]); + float16_t s_val = GGML_FP16_TO_FP32(y4->ds[1]); + float16x4_t d_vec = vdup_n_f16(d_val); + float16x4_t s_vec = vdup_n_f16(s_val); + return vcombine_f16(d_vec, s_vec); } template From 4eb6bffb15e19385965167570dd5bdc68f9da373 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:15:20 -0700 Subject: [PATCH 5/8] fix arm --- 
third_party/llamafile/iqk_mul_mat.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index ebfd233c..cea49e28 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -4493,8 +4493,8 @@ template struct Q81 { inline float16x8_t load_scales(int iy, int i) const { const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i; - float16_t d_val = GGML_FP16_TO_FP32(y4->ds[0]); - float16_t s_val = GGML_FP16_TO_FP32(y4->ds[1]); + float16_t d_val = GGML_FP16_TO_FP32(y4->d); + float16_t s_val = GGML_FP16_TO_FP32(y4->s); float16x4_t d_vec = vdup_n_f16(d_val); float16x4_t s_vec = vdup_n_f16(s_val); return vcombine_f16(d_vec, s_vec); From 8d7d9bf14f2d6dfc61fb9df1bf6c63bb7670eb27 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:27:09 -0700 Subject: [PATCH 6/8] fix arm --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index ff914eca..b39b44d1 100644 --- a/setup.py +++ b/setup.py @@ -504,6 +504,13 @@ def build_extension(self, ext) -> None: f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm ] + # Add ARM-specific flags for aarch64 Linux systems + if platform.system() == "Linux" and platform.machine() == "aarch64": + cmake_args += [ + "-DCMAKE_C_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc", + "-DCMAKE_CXX_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc", + ] + if CUDA_HOME is not None: cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"] elif MUSA_HOME is not None: From 0d786090dfefe247d2837620a2d7026443c6c0fb Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 23 Oct 2025 15:28:56 -0700 Subject: [PATCH 7/8] fix arm --- third_party/llamafile/tinyblas_cpu_sgemm.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/llamafile/tinyblas_cpu_sgemm.inc b/third_party/llamafile/tinyblas_cpu_sgemm.inc index 9ed8f35a..0eecf46f 100644 --- a/third_party/llamafile/tinyblas_cpu_sgemm.inc +++ b/third_party/llamafile/tinyblas_cpu_sgemm.inc @@ -337,14 +337,14 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void #endif #elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { - if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) { + if (iqk_mul_mat(m, n, k * QK_K, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) { return true; } } if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32); assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32)); - if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) { + if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) { return true; } } From 6b6a34f9609bb5e53b396309a048aecd4be88801 Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 19:39:13 -0700 Subject: [PATCH 8/8] Update setup.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b39b44d1..279ca1b0 100644 --- a/setup.py +++ b/setup.py @@ -506,9 +506,10 @@ def build_extension(self, ext) -> None: # Add ARM-specific flags for aarch64 Linux 
systems
 if platform.system() == "Linux" and platform.machine() == "aarch64":
+ # Use -march=native so the build matches the host CPU instead of assuming a fixed ARMv8.6+SVE feature set.
 cmake_args += [
- "-DCMAKE_C_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
- "-DCMAKE_CXX_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
+ "-DCMAKE_C_FLAGS=-march=native",
+ "-DCMAKE_CXX_FLAGS=-march=native",
 ]

 if CUDA_HOME is not None:
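
A note on PATCH 1/8: with the HWCAP probe commented out, the dispatcher now installs the arm82 kernels unconditionally, so the build assumes the target CPU has fp16 and dotprod support (true for Apple M1-class and GH200-class parts, not for Armv8.0 baseline cores). For reference, the sketch below reconstructs the runtime check that the patch removes, taken from the commented-out lines above; it assumes Linux/aarch64, with getauxval from <sys/auxv.h> and the HWCAP_* bits from <asm/hwcap.h>.

// Minimal sketch (not part of the patch): the AT_HWCAP probe that upstream
// llamafile used before PATCH 1/8 commented it out. Linux/aarch64 only.
#include <sys/auxv.h>
#include <asm/hwcap.h>

static bool cpu_has_armv82_simd() {
    unsigned long hwcap = getauxval(AT_HWCAP);
    return (hwcap & HWCAP_FPHP)     // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
        && (hwcap & HWCAP_ASIMDHP)  // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
        && (hwcap & HWCAP_ASIMDDP); // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
}

Keeping a check like this (or the arm80 fallback it guarded) would let one binary run on older cores; as committed, the feature level is instead fixed at build time by the -march flags added in PATCH 6/8 and relaxed to -march=native in PATCH 8/8.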
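
A note on PATCHes 3/8 through 5/8: the Q80/Q81 helpers stop casting to the interleaved block_q8_0_x4 / block_q8_1_x4 layouts and read one block at a time, so load_scales() can no longer vld1_f16 four packed half-float scales. Instead it converts the single per-block scale with GGML_FP16_TO_FP32 and broadcasts it with vdup_n_f16, and for Q8_1 packs d and s into one float16x8_t with vcombine_f16. The sketch below shows that pattern in isolation; the structs are illustrative stand-ins for ggml's block_q8_0 / block_q8_1 (ggml stores the scale as ggml_half, hence the conversion in the patch), and it assumes an AArch64 toolchain where __fp16 and the NEON float16 intrinsics are available.

#include <arm_neon.h>
#include <cstdint>

struct q8_0_block {     // illustrative stand-in for ggml's block_q8_0
    __fp16 d;           // one scale per 32-value block
    int8_t qs[32];      // quantized values
};

struct q8_1_block {     // illustrative stand-in for ggml's block_q8_1
    __fp16 d;           // scale
    __fp16 s;           // sum term used by the Q8_1 dot product
    int8_t qs[32];
};

static inline float16x4_t load_scale_q8_0(const q8_0_block* b) {
    // One scale per block, so broadcast it instead of loading four packed scales.
    return vdup_n_f16(b->d);
}

static inline float16x8_t load_scales_q8_1(const q8_1_block* b) {
    // Low half carries d, high half carries s, mirroring the patched load_scales().
    return vcombine_f16(vdup_n_f16(b->d), vdup_n_f16(b->s));
}

Per-block broadcasting does a little more scalar work than the x4 fast path, but it only relies on the plain block_q8_0 / block_q8_1 types, which appears to be the point of the change for the aarch64 build.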