66 changes: 38 additions & 28 deletions setup.py
@@ -178,37 +178,40 @@ def get_cpu_instruct(self,):
return "avx2"
else:
print("Using native cpu instruct")

if sys.platform.startswith("linux"):
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
flags_line = [line for line in cpuinfo.split(
'\n') if line.startswith('flags')][0]
flags = flags_line.split(':')[1].strip().split(' ')
# fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
for flag in flags:
if 'avx512bw' in flag:
return 'fancy'
for flag in flags:
if 'avx512' in flag:
return 'avx512'
for flag in flags:
if 'avx2' in flag:
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(flags_line))
elif sys.platform == "win32":
from cpufeature.extension import CPUFeature

if CPUFeature.get("AVX512bw", False):
return 'fancy'
if CPUFeature.get("AVX512f", False):
return 'avx512'
if CPUFeature.get("AVX2", False):
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(str(CPUFeature)))
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
if platform.machine() == "aarch64":
# Adapt this part based on GH200's /proc/cpuinfo
for line in cpuinfo.split('\n'):
if line.startswith('Features'):
features_line = line
features = features_line.split(':')[1].strip().split(' ')
if 'sve' in features: # Example: Scalable Vector Extension
return 'sve' # Or a custom label
elif 'neon' in features:
return 'neon'
else:
print("Using generic Arm CPU instructions")
return 'native_arm' # Or a default Arm label
print("Warning: Could not find 'Features' line in /proc/cpuinfo on aarch64. Using native.")
return 'native' # Fallback for aarch64 if 'Features' not found
else: # Assume x86-like if not aarch64
for line in cpuinfo.split('\n'):
if line.startswith('flags'):
flags_line = line
flags = flags_line.split(':')[1].strip().split(' ')
if 'avx512bw' in flags:
return 'fancy'
elif 'avx512' in flags:
return 'avx512'
elif 'avx2' in flags:
return 'avx2'
raise ValueError(
"Unsupported cpu Instructions: {}".format(flags_line))
print("Warning: Could not find 'flags' line in /proc/cpuinfo on x86-like. Using native.")
return 'native' # Fallback for x86-like if 'flags' not found
Comment on lines 181 to +214
Contributor
high

This change removes support for Windows (win32) and other non-Linux platforms from the get_cpu_instruct method. The previous implementation had specific logic for Windows and a fallback for other platforms. On non-Linux systems, this function will now return None, which will likely cause failures later in the build process. While this PR focuses on ARM (which is often Linux-based), breaking existing platform support is a significant regression. Please restore the support for other platforms like Windows to avoid breaking builds for other users.
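A minimal sketch (not part of this PR) of how the removed branches could be restored alongside the new aarch64 path, mirroring the deleted win32 code above:

```python
# Sketch: restore the non-Linux branches that the diff removes.
# The CPUFeature keys mirror the original implementation above.
import sys

def get_cpu_instruct_fallback():
    if sys.platform == "win32":
        from cpufeature.extension import CPUFeature
        if CPUFeature.get("AVX512bw", False):
            return 'fancy'
        if CPUFeature.get("AVX512f", False):
            return 'avx512'
        if CPUFeature.get("AVX2", False):
            return 'avx2'
        raise ValueError(
            "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
    raise ValueError("Unsupported platform: {}".format(sys.platform))
```

Calling this from an else branch of the Linux-only path would preserve the old behavior for Windows users instead of silently returning None.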


def get_torch_version(self,):
torch_version_raw = parse(torch.__version__)
@@ -501,6 +504,13 @@ def build_extension(self, ext) -> None:
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
]

# Add ARM-specific flags for aarch64 Linux systems
if platform.system() == "Linux" and platform.machine() == "aarch64":
cmake_args += [
"-DCMAKE_C_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
"-DCMAKE_CXX_FLAGS=-march=armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc",
]

if CUDA_HOME is not None:
cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
elif MUSA_HOME is not None:
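One hedged suggestion on the hard-coded -march string added above: older toolchains will reject armv8.6-a with the sve2/i8mm extensions, so a try-compile probe could fall back gracefully. A sketch under that assumption (try_march is a hypothetical helper, not part of the PR):

```python
# Sketch: probe whether the C compiler accepts a -march string before
# baking it into CMAKE_C_FLAGS / CMAKE_CXX_FLAGS.
import os
import subprocess
import tempfile

def try_march(march: str, cc: str = "cc") -> bool:
    """Return True if `cc` compiles an empty program with -march=<march>."""
    with tempfile.TemporaryDirectory() as tmp:
        src = os.path.join(tmp, "probe.c")
        with open(src, "w", encoding="utf-8") as f:
            f.write("int main(void) { return 0; }\n")
        proc = subprocess.run(
            [cc, f"-march={march}", "-c", src, "-o", os.path.join(tmp, "probe.o")],
            capture_output=True,
        )
        return proc.returncode == 0

# Prefer the tuned GH200 flags, fall back to the compiler's default.
GH200_MARCH = "armv8.6-a+fp+simd+sve+sve2+i8mm+f32mm+f64mm+fp16+bf16+fp16fml+crc"
march = GH200_MARCH if try_march(GH200_MARCH) else "native"
```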
22 changes: 14 additions & 8 deletions third_party/llamafile/iqk_mul_mat.inc
@@ -170,7 +170,7 @@ struct MulMat {
funcs[n_left-1](n, vx, bx, info, nrc_x);
}
}
static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny);
static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny);
private:
template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};
@@ -1020,8 +1020,9 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00,
float * C, long stride_C, int ith, int nth) {

MulMat mm;
int row_size_q8;

if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
return false;
}

@@ -4444,13 +4445,14 @@ template <int nrc> struct Q80 {
}

inline const int8_t * quant_data(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i;
return y4->qs;
}

inline float16x4_t load_scales(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
return vld1_f16((const float16_t *)y4->d);
const block_q8_0 * y4 = (const block_q8_0 *)y[iy] + i;
float16_t d_val = GGML_FP16_TO_FP32(y4->d);
return vdup_n_f16(d_val);
Comment on lines +4453 to +4455
Contributor
medium

The conversion from ggml_fp16_t to float16_t via GGML_FP16_TO_FP32 is inefficient. It involves a round trip from half-precision to single-precision and back to half-precision, which is unnecessary and adds overhead in an inline function. You can directly use y4->d with vdup_n_f16 as the compiler can handle the cast.

        return vdup_n_f16(y4->d);

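One caveat worth flagging (an assumption about the build configuration, not verified here): if ggml_fp16_t is typedef'd to uint16_t rather than __fp16, passing y4->d straight to vdup_n_f16 would convert the integer value instead of reinterpreting the bits. A sketch of the bit-preserving variant:

```cpp
// Sketch: broadcast a raw IEEE fp16 bit pattern into all four lanes
// without a value conversion. Assumes the scale is stored as uint16_t bits.
#include <arm_neon.h>
#include <cstdint>

static inline float16x4_t dup_fp16_bits(uint16_t bits) {
    // vdup_n_u16 broadcasts the bits; vreinterpret keeps them as fp16.
    return vreinterpret_f16_u16(vdup_n_u16(bits));
}
```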
Contributor
@KMSorSMS KMSorSMS Oct 24, 2025
@johnnynunez Since you have tested this on your device, could you please attach a running screenshot and your platform info (e.g., from nvidia-smi and lscpu)? Our next version will drop support for llamafile for performance reasons.

Author
@johnnynunez johnnynunez Oct 24, 2025
GH200 @KMSorSMS
[Screenshot 2025-10-24 at 07 44 36]

Architecture:             aarch64
  CPU op-mode(s):         64-bit
  Byte Order:             Little Endian
CPU(s):                   64
  On-line CPU(s) list:    0-63
Vendor ID:                ARM
  Model name:             Neoverse-V2
    Model:                0
    Thread(s) per core:   1
    Core(s) per socket:   64
    Socket(s):            1
    Stepping:             r0p0
    BogoMIPS:             2000.00
    Flags:                fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 sm3 sm4 asimddp sha512 sve asimdfhm dit uscat ilrcpc flagm sb paca pacg dcpodp sve2 sveaes svepmull svebitperm svesha3 svesm4 flagm2 frint svei8mm svebf16 i8mm bf16 dgh bti
NUMA:                     
  NUMA node(s):           9
  NUMA node0 CPU(s):      0-63
  NUMA node1 CPU(s):      
  NUMA node2 CPU(s):      
  NUMA node3 CPU(s):      
  NUMA node4 CPU(s):      
  NUMA node5 CPU(s):      
  NUMA node6 CPU(s):      
  NUMA node7 CPU(s):      
  NUMA node8 CPU(s):      
Vulnerabilities:          
  Gather data sampling:   Not affected
  Itlb multihit:          Not affected
  L1tf:                   Not affected
  Mds:                    Not affected
  Meltdown:               Not affected
  Mmio stale data:        Not affected
  Reg file data sampling: Not affected
  Retbleed:               Not affected
  Spec rstack overflow:   Not affected
  Spec store bypass:      Mitigation; Speculative Store Bypass disabled via prctl
  Spectre v1:             Mitigation; __user pointer sanitization
  Spectre v2:             Not affected
  Srbds:                  Not affected
  Tsx async abort:        Not affected

}

template <typename Dequantizer>
@@ -4485,13 +4487,17 @@ template <int nrc> struct Q81 {
}

inline const int8_t * quant_data(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i;
return y4->qs;
}

inline float16x8_t load_scales(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
return vld1q_f16((const float16_t *)y4->d);
const block_q8_1 * y4 = (const block_q8_1 *)y[iy] + i;
float16_t d_val = GGML_FP16_TO_FP32(y4->d);
float16_t s_val = GGML_FP16_TO_FP32(y4->s);
float16x4_t d_vec = vdup_n_f16(d_val);
float16x4_t s_vec = vdup_n_f16(s_val);
return vcombine_f16(d_vec, s_vec);
}

template <typename Dequantizer>
74 changes: 9 additions & 65 deletions third_party/llamafile/sgemm.cpp
@@ -25,7 +25,7 @@
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
// #include <sys/auxv.h>
#include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"

Expand All @@ -37,7 +37,7 @@ static const struct GemmFuncs {
// typeof(llamafile_mixmul)* mixmul;
// typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
//#if defined(__x86_64__) || defined(_M_X64)
// if (X86_HAVE(AVX)) {
// if (X86_HAVE(FMA)) {
// if (X86_HAVE(AVX2)) {
@@ -90,74 +90,18 @@ static const struct GemmFuncs {
// mixmul = llamafile_mixmul_unsupported;
// }

#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
// AMD Zen4+ (2023-)
sgemm = llamafile_sgemm_amd_zen4;
mixmul = llamafile_mixmul_amd_zen4;
iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
// Intel Xeon Skylake+ (2015-)
sgemm = llamafile_sgemm_amd_avx512f;
mixmul = llamafile_mixmul_amd_avx512f;
iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
// Intel Alderlake (2021-)
sgemm = llamafile_sgemm_amd_avxvnni;
mixmul = llamafile_mixmul_amd_avxvnni;
iqk_mixmul = iqk_mul_mat_moe;
#else
// Intel Haswell/Broadwell/Skylake (2013-2020)
// AMD Excavator (2015-2022)
sgemm = llamafile_sgemm_amd_avx2;
mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
// AMD Piledriver (2011-2014)
sgemm = llamafile_sgemm_amd_fma;
mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
// Intel Sandybridge/Ivybridge (2010-2012)
// AMD Bulldozer (2011)
sgemm = llamafile_sgemm_amd_avx;
mixmul = llamafile_mixmul_amd_avx;
#endif
#else
// AMD K8/Barcelona (2003-2010)
// Intel Core/Nehalem (2006-2009)
sgemm = llamafile_sgemm_unsupported;
mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
long hwcap = getauxval(AT_HWCAP);
if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
(hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
(hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
//#elif defined(__aarch64__)
//long hwcap = getauxval(AT_HWCAP);
//if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
// (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
// (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
// e.g. Apple M1, Raspberry Pi 5
sgemm = llamafile_sgemm_arm82;
mixmul = llamafile_mixmul_arm82;
iqk_mixmul = iqk_mul_mat_moe_arm82;
} else {
// ARM64 baseline ISA
sgemm = llamafile_sgemm_arm80;
mixmul = llamafile_mixmul_arm80;
}
#else
sgemm = llamafile_sgemm_unsupported;
mixmul = llamafile_mixmul_unsupported;
#endif

//#endif
}
} funcs;

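Note that the diff above also comments out the getauxval(AT_HWCAP) guard, so the arm82 kernels are now selected unconditionally. A minimal sketch of keeping the runtime dispatch, reusing the HWCAP checks and symbols visible in the commented-out lines (the free-function form is illustrative, not the PR's structure):

```cpp
// Sketch: runtime ARM kernel dispatch, as in the commented-out code above.
// Assumes Linux aarch64, where <sys/auxv.h> and <asm/hwcap.h> define these.
#include <sys/auxv.h>
#include <asm/hwcap.h>

static void pick_arm_kernels(GemmFuncs& f) {
    long hwcap = getauxval(AT_HWCAP);
    if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa
        (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa
        (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (e.g. Apple M1, Raspberry Pi 5)
        f.sgemm = llamafile_sgemm_arm82;
        f.mixmul = llamafile_mixmul_arm82;
        f.iqk_mixmul = iqk_mul_mat_moe_arm82;
    } else {
        // ARM64 baseline ISA
        f.sgemm = llamafile_sgemm_arm80;
        f.mixmul = llamafile_mixmul_arm80;
    }
}
```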
4 changes: 2 additions & 2 deletions third_party/llamafile/tinyblas_cpu_sgemm.inc
@@ -337,14 +337,14 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) {
return true;
}
}
if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
// assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) {
return true;
}
}