microsoft · snnn · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -233,6 +233,10 @@ option(onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH "Path to external transformer s
 option(onnxruntime_ENABLE_CUDA_PROFILING "Enable CUDA kernel profiling" OFF)
 
 option(onnxruntime_ENABLE_CPUINFO "Enable cpuinfo" ON)
+# Windows relies on cpuinfo (WindowsEnv::InitializeCpuInfo() initializes it at start-up), so the option
+# is forced ON for WIN32; every other platform keeps the user-selectable value declared above (default ON).
+# NOTE(review): requires include(CMakeDependentOption) earlier in this file - confirm.
+cmake_dependent_option(onnxruntime_ENABLE_CPUINFO "Enable cpuinfo" ON "NOT WIN32" ON)
 
 # ATen fallback support
 option(onnxruntime_ENABLE_ATEN "Enable ATen fallback" OFF)

diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -809,6 +809,9 @@ endif()
 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
     target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
     onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
+    if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+      onnxruntime_add_include_to_target(${mlas_target} cpuinfo::cpuinfo)
+    endif()
 
     target_compile_definitions(${mlas_target} PRIVATE ${mlas_private_compile_definitions})
 

diff --git a/include/onnxruntime/core/framework/endian.h → include/onnxruntime/core/common/endian.h b/include/onnxruntime/core/framework/endian.h → include/onnxruntime/core/common/endian.h
diff --git a/include/onnxruntime/core/framework/float16.h → include/onnxruntime/core/common/float16.h b/include/onnxruntime/core/framework/float16.h → include/onnxruntime/core/common/float16.h
@@ -4,7 +4,7 @@
 
 #include <math.h>
 
-#include "endian.h"
+#include "core/common/endian.h"
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
 #include "cuda_bf16.h"
 #endif

diff --git a/include/onnxruntime/core/framework/float8.h → include/onnxruntime/core/common/float8.h b/include/onnxruntime/core/framework/float8.h → include/onnxruntime/core/common/float8.h
@@ -16,7 +16,7 @@
 
 #if !defined(DISABLE_FLOAT8_TYPES)
 
-#include "endian.h"
+#include "core/common/endian.h"
 
 #if defined(__CUDACC__)
 // Needed for CUDA_VERSION check below

diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
@@ -12,9 +12,9 @@
 #include <gsl/gsl>
 #include "core/common/common.h"
 #include "core/common/exceptions.h"
-#include "core/framework/endian.h"
-#include "core/framework/float8.h"
-#include "core/framework/float16.h"
+#include "core/common/endian.h"
+#include "core/common/float8.h"
+#include "core/common/float16.h"
 #include "core/framework/int4.h"
 #include "core/framework/float4.h"
 #include "core/graph/onnx_protobuf.h"

diff --git a/include/onnxruntime/core/framework/to_tensor_proto_element_type.h b/include/onnxruntime/core/framework/to_tensor_proto_element_type.h
@@ -11,8 +11,8 @@
 #endif
 
 #include "core/framework/float4.h"
-#include "core/framework/float8.h"
-#include "core/framework/float16.h"
+#include "core/common/float8.h"
+#include "core/common/float16.h"
 #include "core/framework/int4.h"
 
 namespace onnxruntime {

diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_cpu.cc
@@ -9,7 +9,7 @@
 #include "core/providers/cpu/math/gemm_helper.h"
 #include "core/util/math_cpuonly.h"
 #include "core/mlas/inc/mlas.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/framework/allocator.h"
 #include "core/platform/threadpool.h"
 #include "core/common/narrow.h"

diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
@@ -3,7 +3,7 @@
 
 #include "contrib_ops/cpu/moe/moe_quantization_cpu.h"
 #include "core/framework/allocator.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/mlas/inc/mlas_q4.h"
 #include "core/platform/threadpool.h"

diff --git a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h
@@ -8,7 +8,7 @@
 #include <vector>
 
 #include "core/common/safeint.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/platform/threadpool.h"
 #include <iostream>
 

diff --git a/onnxruntime/contrib_ops/cpu/quantization/gather_block_quantized.cc b/onnxruntime/contrib_ops/cpu/quantization/gather_block_quantized.cc
@@ -7,7 +7,7 @@
 #include "core/common/common.h"
 #include "core/common/narrow.h"
 #include "core/common/safeint.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/framework/int4.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"

diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc
@@ -9,7 +9,7 @@
 #include <type_traits>
 
 #include "core/common/common.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/providers/common.h"
 #include "core/platform/threadpool.h"
 

diff --git a/onnxruntime/contrib_ops/cpu/utils/console_dumper.h b/onnxruntime/contrib_ops/cpu/utils/console_dumper.h
@@ -5,7 +5,7 @@
 #include <string>
 #include <iostream>
 #include "core/framework/ort_value.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "contrib_ops/cpu/utils/debug_macros.h"
 
 namespace onnxruntime {

diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
@@ -6,7 +6,7 @@
 #include <cstdint>
 
 #include "core/common/status.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/providers/cuda/cuda_type_conversion.h"
 #include "contrib_ops/cuda/utils/dump_cuda_tensor.h"

diff --git a/onnxruntime/contrib_ops/rocm/bert/gemm_fast_gelu_impl.h b/onnxruntime/contrib_ops/rocm/bert/gemm_fast_gelu_impl.h
@@ -5,7 +5,7 @@
 
 #include "contrib_ops/rocm/bert/gemm_fast_gelu_common.h"
 #include "core/common/status.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 
 namespace onnxruntime {
 namespace contrib {

diff --git a/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu b/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/common/common.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/providers/rocm/rocm_kernel.h"
 #include "contrib_ops/rocm/math/gemm_float8_ck.cuh"
 

diff --git a/onnxruntime/contrib_ops/rocm/math/gemm_float8_ck.cuh b/onnxruntime/contrib_ops/rocm/math/gemm_float8_ck.cuh
@@ -19,7 +19,7 @@
 #endif
 
 #if !defined(DISABLE_FLOAT8_TYPES)
-#include "core/framework/float8.h"
+#include "core/common/float8.h"
 #endif
 #include "core/providers/rocm/tunable/gemm_common.h"
 

diff --git a/onnxruntime/core/framework/element_type_lists.h b/onnxruntime/core/framework/element_type_lists.h
@@ -9,8 +9,8 @@
 #include "boost/mp11.hpp"
 
 #include "core/common/type_list.h"
-#include "core/framework/float8.h"
-#include "core/framework/float16.h"
+#include "core/common/float8.h"
+#include "core/common/float16.h"
 #include "core/framework/int4.h"
 #include "core/framework/float4.h"
 

diff --git a/onnxruntime/core/framework/endian_utils.cc b/onnxruntime/core/framework/endian_utils.cc
@@ -6,7 +6,7 @@
 #include <cassert>
 #include <cstring>
 
-#include "core/framework/endian.h"
+#include "core/common/endian.h"
 
 namespace onnxruntime {
 namespace utils {

diff --git a/onnxruntime/core/framework/murmurhash3.cc b/onnxruntime/core/framework/murmurhash3.cc
@@ -15,7 +15,7 @@
 
 /* Modifications Copyright (c) Microsoft. */
 
-#include "core/framework/endian.h"
+#include "core/common/endian.h"
 
 #include "core/util/force_inline.h"
 

diff --git a/onnxruntime/core/graph/contrib_ops/onnx_function_util.cc b/onnxruntime/core/graph/contrib_ops/onnx_function_util.cc
@@ -1,7 +1,7 @@
 #include "core/graph/contrib_ops/onnx_function_util.h"
 #include "core/util/math.h"
-#include "core/framework/float8.h"
-#include "core/framework/float16.h"
+#include "core/common/float8.h"
+#include "core/common/float16.h"
 
 namespace ONNX_NAMESPACE {
 

diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
@@ -1865,13 +1865,13 @@ MlasHalfGemmConvertPackB(
     void* PackedB
     );
 
-#if defined(__aarch64__) && defined(__linux__)
 /**
  * @brief Whether current CPU supports Bfloat16(bf16) acceleration.
  */
 bool MLASCALL
 MlasBf16AccelerationSupported();
 
+#if defined(__aarch64__) && defined(__linux__)
 /**
  * @brief Interface for bf16 gemm post processors.
  *

diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
@@ -162,7 +162,7 @@ MLAS_FORCEINLINE void
 #include "core/common/cpuid_info.h"
 using MLAS_CPUIDINFO = onnxruntime::CPUIDInfo;
 
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 
 #else  // BUILD_MLAS_NO_ONNXRUNTIME
 

diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -25,6 +25,10 @@ Module Name:
 
 #include <thread>
 #include <mutex>
+#if defined(MLAS_TARGET_AMD64_IX86) && defined(CPUINFO_SUPPORTED)
+#include <cpuinfo.h>
+#endif
+
 
 #if defined(MLAS_TARGET_POWER)
 #if defined(__linux__)
@@ -781,6 +785,40 @@ Return Value:
 #endif
 }
 
+bool MLASCALL
+MlasBf16AccelerationSupported()
+/*++
+
+Routine Description:
+
+    This routine reports whether the current CPU provides bfloat16 (bf16)
+    acceleration.
+
+Arguments:
+
+    None.
+
+Return Value:
+
+    Returns true when the ARM64 Linux CPUID info reports NEON bf16 support, or
+    when cpuinfo reports AVX512-BF16 or AMX-BF16 on x86/x64. Returns false on
+    every other target and on x86/x64 builds that do not include cpuinfo.
+
+--*/
+{
+#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
+    return MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_BF16();
+#elif defined(MLAS_TARGET_AMD64_IX86) && defined(CPUINFO_SUPPORTED)
+    // cpuinfo is initialized early by the Env singleton (platform specific).
+    // Just query the feature flags here; if cpuinfo was unavailable initialization would have failed and
+    // the feature queries will safely return false.
+    return cpuinfo_has_x86_avx512bf16() || cpuinfo_has_x86_amx_bf16();
+#else
+    // Unsupported architecture, or an x86/x64 build without cpuinfo: bf16 cannot be detected here.
+    return false;
+#endif
+}
+
 #ifdef MLAS_TARGET_AMD64_IX86
 
 bool

diff --git a/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp
@@ -29,16 +29,6 @@ struct MLAS_SBGEMM_KERNEL_NEON {
     static constexpr MLAS_SBGEMM_STRIDES Strides{128, 128, 256};  // M:N:K
 };
 
-bool MLASCALL
-MlasBf16AccelerationSupported()
-{
-#if defined(MLAS_TARGET_ARM64)
-    return MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_BF16();
-#else
-    return false;
-#endif
-}
-
 /*
     This routine converts fp32 to bf16 and copies elements from the source
      matrix to the destination packed buffer.

diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
@@ -866,11 +866,20 @@
   InitializeCpuInfo();
 }
 
 /*
 Discover all cores in a windows system.
 Note - every "id" here, given it be group id, core id, or logical processor id, starts from 0.
 */
 void WindowsEnv::InitializeCpuInfo() {
+#if defined(CPUINFO_SUPPORTED)
+  // Initialize cpuinfo once on Windows similar to PosixEnv constructor.
+  (void)cpuinfo_initialize();  // Ignore the error if it failed to initialize
+  // TODO: we should also call cpuinfo_deinitialize()
+  // TODO: the cpuinfo_initialize() function also gets called when creating ort thread pool, it would be better to
+  // put them in one place.
+  // TODO: test how it works in ARM64EC.
+#endif
+
   DWORD returnLength = 0;
   GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &returnLength);
   auto last_error = GetLastError();

diff --git a/onnxruntime/core/platform/windows/env.h b/onnxruntime/core/platform/windows/env.h
@@ -18,6 +18,9 @@ limitations under the License.
 #include "core/platform/windows/telemetry.h"
 #include "core/common/inlined_containers.h"
 #include <Windows.h>
+#if defined(CPUINFO_SUPPORTED)
+#include <cpuinfo.h>
+#endif
 
 namespace onnxruntime {
 

diff --git a/onnxruntime/core/providers/cann/cann_common.h b/onnxruntime/core/providers/cann/cann_common.h
@@ -6,7 +6,7 @@
 
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cann/cann_call.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 
 namespace onnxruntime {
 namespace cann {

diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -689,10 +689,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, string, Expand);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Gemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, Gemm);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, BFloat16, Gemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, MatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, MatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int32_t, MatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int64_t, MatMul);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, BFloat16, MatMul);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Min);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Max);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Mean);
@@ -2426,13 +2428,14 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
                                                                   MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int32_t,
                                                                   MatMul)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int64_t,
-                                                                  MatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int64_t, MatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, BFloat16, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Min)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Max)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Mean)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Gemm)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, Gemm)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, BFloat16, Gemm)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Sign)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 18, Size)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, Sum)>,

diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_activations.h b/onnxruntime/core/providers/cpu/fp16/fp16_activations.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "core/mlas/inc/mlas.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/providers/cpu/activation/activations.h"
 
 #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED

diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@@ -10,7 +10,7 @@
 #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
 
 #include "core/common/safeint.h"
-#include "core/framework/float16.h"
+#include "core/common/float16.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/nn/conv_attributes.h"