Properly dispatch the kernels that use atomic_add; this can only be decided at runtime

yhmtsai · yhmtsai · commit 66748e15bd0e · 2025-05-28T13:31:34.000+02:00
diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -271,10 +271,20 @@ void spmv2(std::shared_ptr<const DefaultExecutor> exec,
         return;
     }
 // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+#if !defined(CUDA_VERSION)
     if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
         GKO_NOT_SUPPORTED(c);
     } else
+#else
+    auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
+                              as<CudaExecutor>(exec)->get_minor_version();
+    if (compute_capability < 70 &&
+        std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else if (compute_capability < 80 &&
+               std::is_same_v<remove_complex<ValueType>, bfloat16>) {
+        GKO_NOT_SUPPORTED(c);
+    } else
 #endif
     {
         // TODO: b_ncols needs to be tuned for ROCm.
@@ -323,10 +333,20 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
         return;
     }
     // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+#if !defined(CUDA_VERSION)
     if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
         GKO_NOT_SUPPORTED(c);
     } else
+#else
+    auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
+                              as<CudaExecutor>(exec)->get_minor_version();
+    if (compute_capability < 70 &&
+        std::is_same_v<remove_complex<ValueType>, half>) {
+        GKO_NOT_SUPPORTED(c);
+    } else if (compute_capability < 80 &&
+               std::is_same_v<remove_complex<ValueType>, bfloat16>) {
+        GKO_NOT_SUPPORTED(c);
+    } else
 #endif
     {
         // TODO: b_ncols needs to be tuned for ROCm.
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2091,11 +2091,21 @@ bool load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
 
-    // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+// not support 16 bit atomic
+#if !defined(CUDA_VERSION)
     if constexpr (sizeof(remove_complex<OutputValueType>) == sizeof(int16)) {
         return false;
     } else
+#else
+    auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
+                              as<CudaExecutor>(exec)->get_minor_version();
+    if (compute_capability < 70 &&
+        std::is_same_v<remove_complex<OutputValueType>, half>) {
+        return false;
+    } else if (compute_capability < 80 &&
+               std::is_same_v<remove_complex<OutputValueType>, bfloat16>) {
+        return false;
+    } else
 #endif
     {
         if (beta) {
diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -252,7 +252,7 @@ void abstract_spmv(syn::value_list<int, info>,
                          b->get_size()[1], 1);
 
 // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+#if !defined(CUDA_VERSION)
     // We do atomic on shared memory when num_thread_per_worker is not 1.
     // If atomic is also true, we also do atomic on out_vector.
     constexpr bool shared_half =
@@ -263,6 +263,27 @@ void abstract_spmv(syn::value_list<int, info>,
                   (shared_half || atomic_half_out)) {
         GKO_KERNEL_NOT_FOUND;
     } else
+#else
+    // half and bfloat16 are both 16 bits wide, so they must be told apart by
+    // type: half atomics need compute capability 7.0, bfloat16 needs 8.0.
+    constexpr bool shared_half =
+        std::is_same_v<remove_complex<arithmetic_type>, half>;
+    constexpr bool atomic_half_out =
+        atomic && std::is_same_v<remove_complex<OutputValueType>, half>;
+    constexpr bool shared_bfloat16 =
+        std::is_same_v<remove_complex<arithmetic_type>, bfloat16>;
+    constexpr bool atomic_bfloat16_out =
+        atomic && std::is_same_v<remove_complex<OutputValueType>, bfloat16>;
+    auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
+                              as<CudaExecutor>(exec)->get_minor_version();
+    if (num_thread_per_worker != 1 && (shared_half || atomic_half_out) &&
+        compute_capability < 70) {
+        GKO_KERNEL_NOT_FOUND;
+    } else if (num_thread_per_worker != 1 &&
+               (shared_bfloat16 || atomic_bfloat16_out) &&
+               compute_capability < 80) {
+        GKO_KERNEL_NOT_FOUND;
+    } else
 #endif
     {
         const auto a_vals = acc::range<a_accessor>(
diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp
@@ -454,11 +454,22 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
         if (nrhs > 1 || is_complex<ValueType>()) {
             components::fill_array(exec, alpha->get_values(), nrhs,
                                    zero<ValueType>());
-            // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+// not support 16 bit atomic
+#if !defined(CUDA_VERSION)
             if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
                 GKO_NOT_SUPPORTED(alpha);
             } else
+#else
+            auto compute_capability =
+                as<CudaExecutor>(exec)->get_major_version() * 10 +
+                as<CudaExecutor>(exec)->get_minor_version();
+            if (compute_capability < 70 &&
+                std::is_same_v<remove_complex<ValueType>, half>) {
+                GKO_NOT_SUPPORTED(alpha);
+            } else if (compute_capability < 80 &&
+                       std::is_same_v<remove_complex<ValueType>, bfloat16>) {
+                GKO_NOT_SUPPORTED(alpha);
+            } else
 #endif
             {
                 multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
@@ -513,11 +524,23 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
         auto m_i = m->get_values() + i * m_stride + k * nrhs;
         if (nrhs > 1 || is_complex<ValueType>()) {
             components::fill_array(exec, m_i, nrhs, zero<ValueType>());
-            // not support 16 bit atomic
-#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
+
+// not support 16 bit atomic
+#if !defined(CUDA_VERSION)
             if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
                 GKO_NOT_SUPPORTED(m_i);
             } else
+#else
+            auto compute_capability =
+                as<CudaExecutor>(exec)->get_major_version() * 10 +
+                as<CudaExecutor>(exec)->get_minor_version();
+            if (compute_capability < 70 &&
+                std::is_same_v<remove_complex<ValueType>, half>) {
+                GKO_NOT_SUPPORTED(m_i);
+            } else if (compute_capability < 80 &&
+                       std::is_same_v<remove_complex<ValueType>, bfloat16>) {
+                GKO_NOT_SUPPORTED(m_i);
+            } else
 #endif
             {
                 multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(